In [1]:
import os
import sys

In [2]:
%pwd

'e:\\Resume_Screening\\resume_screening_ai\\research'

In [3]:
os.chdir('e:\\Resume_Screening\\resume_screening_ai')

In [4]:
%pwd

'e:\\Resume_Screening\\resume_screening_ai'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class DataTransformationConfig:
    root_dir :Path
    data_path : Path
    transformed_data_path : Path

In [7]:
from src.Resume_Screening.constants import *
from src.Resume_Screening import logger
from src.Resume_Screening.utils.common import create_directory,read_yaml

In [8]:
class Configuration:
    def __init__(self, 
                 config_filepath=CONFIG_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH):
        
        self.config = read_yaml(config_filepath)
        self.schema = read_yaml(schema_filepath)
        self.params = read_yaml(params_filepath)
        
        # Create the root directory for artifacts
        create_directory([self.config.artifacts_root])
        print([self.config.artifacts_root])

    def get_transformation_config(self) -> DataTransformationConfig:
        config = self.config.data_transformation
        create_directory([config.root_dir])
        data_transformation_config = DataTransformationConfig(
            root_dir = config.root_dir,
            data_path = config.data_path,
            transformed_data_path = config.transformed_data_path
        )
        
        return data_transformation_config
        
        

In [9]:
from src.Resume_Screening.constants.skills import skills_list
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords 
import pandas as pd

In [10]:

class DataTransformation:
    def __init__(self, config : DataTransformationConfig):
        self.config = config
        self.skills_list = skills_list or []
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
    
    def load_data(self) :
        df = pd.read_csv(self.config.data_path)
        return df
    
    def clean_text(self,text):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z ]', '', text)  
        text = " ".join([word for word in text.split() if word not in self.stop_words])
        return text
    
    def extract_skills_from_cleaned_text(self, cleaned_text, skills_list):
        found_skills = []
        for skill in skills_list:
            if skill in cleaned_text:
                found_skills.append(skill)
        return found_skills
    
    def extract_experience_in_months(self, text):
        text = text.lower()

        # Extract years
        year_matches = re.findall(r'(\d+(?:\.\d+)?)\s*(?:\+)?\s*(?:years?|yrs?|yr)', text)
        total_years = sum(float(y) for y in year_matches)

        # Extract months
        month_matches = re.findall(r'(\d+(?:\.\d+)?)\s*(?:months?|mos?)', text)
        total_months = sum(float(m) for m in month_matches)

        return round(total_years * 12 + total_months)

    def extract_skills(self, text):
        found_skills = []
        text = text.lower()
        for skill in self.skills_list:
            if skill.lower() in text:
                found_skills.append(skill)
        return list(set(found_skills)) 
    
    def compute_skill_match(self, df, category_column='Category', skill_column='extracted_skills'):
    
        role_skill_map = (
            df.groupby(category_column)[skill_column]
            .sum()  # Combine all skill lists
            .apply(lambda x: list(set(x)))  # Remove duplicates
            .to_dict()
        )
        return role_skill_map
    
    def get_matching_skills(self,row,role_skill_map):
        role_skills = set(role_skill_map.get(row['Category'], []))
        resume_skills = set(row['extracted_skills'])
        match_count = len(role_skills.intersection(resume_skills))
        total_role_skills = len(role_skills)
        match_percent = (match_count / total_role_skills * 100) if total_role_skills > 0 else 0
        return pd.Series([match_count, round(match_percent, 2)])
    
    
    def compute_resume_score(self, row, role_skill_map, max_exp_months=140, skill_weight=0.7, exp_weight=0.3):
        role_skills = set(role_skill_map.get(row['Category'], []))
        resume_skills = set(row['extracted_skills'])

        match_count = len(role_skills.intersection(resume_skills))
        total_role_skills = len(role_skills)
        skill_match_percent = (match_count / total_role_skills * 100) if total_role_skills > 0 else 0

        exp_months = row.get('experience_months', 0)
        exp_score = min(exp_months / max_exp_months * 100, 100)

        final_score = (skill_match_percent * skill_weight) + (exp_score * exp_weight)

        return pd.Series([
            match_count,
            round(skill_match_percent, 2),
            round(exp_score, 2),
            round(final_score, 2)
        ])


In [11]:
try:
    config = Configuration()
    data_transformation_config = config.get_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data = data_transformation.load_data()
    data['cleaned_text'] = data['Resume'].apply(data_transformation.clean_text)
    data['extracted_skills'] = data['cleaned_text'].apply(lambda x: data_transformation.extract_skills_from_cleaned_text(x, skills_list))
    data['experience_months'] = data['Resume'].apply(data_transformation.extract_experience_in_months)
    role_skill_map =  data_transformation.compute_skill_match(data)
    data[['matching_skill_count', 'matching_skill_percent']] = data.apply( lambda row: data_transformation.get_matching_skills(row, role_skill_map),
    axis=1)
    data[['matching_skill_count', 'matching_skill_percent', 'experience_score', 'resume_score']] = data.apply(data_transformation.compute_resume_score,axis=1,role_skill_map=role_skill_map
)

except Exception as e :
    raise e


[2025-07-17 19:06:25,723] - INFO : common  : yaml file : config\config.yaml loaded successfully
[2025-07-17 19:06:25,765] - INFO : common  : yaml file : schema.yaml loaded successfully
[2025-07-17 19:06:25,798] - INFO : common  : yaml file : params.yaml loaded successfully
[2025-07-17 19:06:25,801] - INFO : common  : directory created at artifacts
['artifacts']
[2025-07-17 19:06:25,806] - INFO : common  : directory created at artifacts/data_transformation


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sush0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
data.head()

Unnamed: 0,Category,Resume,cleaned_text,extracted_skills,experience_months,matching_skill_count,matching_skill_percent,experience_score,resume_score
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...,"[python, java, javascript, go, html, css, angu...",72,21.0,53.85,51.43,53.12
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may may uitrgpvdata scientis...,"[python, machine learning, keras, r, aws, gith...",84,7.0,17.95,60.0,30.56
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas interest deep learning control system de...,"[python, java, go, sql, mysql, machine learnin...",96,14.0,35.9,68.57,45.7
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...,"[python, go, swift, css, sql, machine learning...",149,13.0,33.33,100.0,53.33
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...,"[python, java, data analysis, r]",72,4.0,10.26,51.43,22.61


In [13]:

role_skill_map

{'Advocate': ['erp', 'aws', 'go', 'adaptability', 'r'],
 'Arts': ['communication', 'excel', 'erp', 'sas', 'go', 'adaptability', 'r'],
 'Automation Testing': ['sql',
  'emr',
  'go',
  'mysql',
  'r',
  'java',
  'excel',
  'erp',
  'python',
  'jenkins',
  'jira',
  'agile',
  'time management',
  'machine learning',
  'autocad',
  'communication',
  'critical thinking',
  'kanban',
  'scrum',
  'oracle',
  'html',
  'javascript'],
 'Blockchain': ['sql',
  'postgresql',
  'aws',
  'go',
  'mysql',
  'computer vision',
  'mongodb',
  'r',
  'css',
  'java',
  'react',
  'erp',
  'python',
  'sap',
  'docker',
  'machine learning',
  'communication',
  'php',
  'angular',
  'problem solving',
  'oracle',
  'html',
  'javascript'],
 'Business Analyst': ['sql',
  'forecasting',
  'business analysis',
  'go',
  'r',
  'java',
  'excel',
  'erp',
  'prototyping',
  'jira',
  'agile',
  'autocad',
  'communication',
  'accounting',
  'php',
  'problem solving',
  'scrum',
  'oracle',
  'html'

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_role_similarity(resume_text, role_title):
    resume_emb = model.encode([resume_text], convert_to_tensor=True)
    role_emb = model.encode([role_title], convert_to_tensor=True)
    similarity = cosine_similarity(resume_emb, role_emb)
    return float(similarity[0][0]) * 100  # Return as percentage


  from .autonotebook import tqdm as notebook_tqdm


[2025-07-17 19:07:38,558] - INFO : SentenceTransformer  : Use pytorch device_name: cpu
[2025-07-17 19:07:38,560] - INFO : SentenceTransformer  : Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [None]:
data['role_similarity_score'] =  data.apply(
    lambda row: compute_role_similarity(row['Resume'], row['Category']),
    axis=1
)

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 18.23it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 45.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 77.11it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 62.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 17.59it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 52.77it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 16.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 66.84it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  9.64it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 25.71it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 22.27it/s]
Batches: 1

In [None]:
data

Unnamed: 0,Category,Resume,cleaned_text,extracted_skills,experience_months,matching_skill_count,matching_skill_percent,experience_score,resume_score,role_similarity_score
0,Data Science,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...,"[python, java, javascript, go, html, css, angu...",72,21.0,53.85,51.43,53.12,51.241100
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may may uitrgpvdata scientis...,"[python, machine learning, keras, r, aws, gith...",84,7.0,17.95,60.00,30.56,35.249740
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",areas interest deep learning control system de...,"[python, java, go, sql, mysql, machine learnin...",96,14.0,35.90,68.57,45.70,41.283131
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills r python sap hana tableau sap hana sql ...,"[python, go, swift, css, sql, machine learning...",149,13.0,33.33,100.00,53.33,39.343917
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...,"[python, java, data analysis, r]",72,4.0,10.26,51.43,22.61,46.317798
...,...,...,...,...,...,...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skills proficient ms office word basi...,"[go, excel, r, creativity]",18,4.0,28.57,12.86,23.86,32.314083
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenges positive thinkin...,"[go, r, erp, communication]",24,4.0,28.57,17.14,25.14,51.463270
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skills quick learner eagerness learn ...,"[go, r, leadership]",0,3.0,21.43,0.00,15.00,38.546628
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skills software knowledge mspower poi...,"[r, matlab]",18,2.0,14.29,12.86,13.86,16.739747
