### Imports & Load Cleaned Data

In [1]:
import pandas as pd
import numpy as np
import re
import os

# Load cleaned datasets
hr_df = pd.read_csv("processed/hr_clean.csv")
job_df = pd.read_csv("processed/job_descriptions_clean.csv")
reviews_df = pd.read_csv("processed/employee_reviews_clean.csv")
resumes_df = pd.read_csv("processed/resumes_clean.csv")

print("HR:", hr_df.shape)
print("Jobs:", job_df.shape)
print("Reviews:", reviews_df.shape)
print("Resumes:", resumes_df.shape)

HR: (1470, 35)
Jobs: (22000, 15)
Reviews: (355, 7)
Resumes: (2484, 4)


### FEATURE ENGINEERING FOR HR DATASET

In [2]:
# Convert Yes/No to 1/0
if 'Attrition' in hr_df.columns:
    hr_df['Attrition_Flag'] = hr_df['Attrition'].map({'Yes': 1, 'No': 0})


In [3]:
# If YearsAtCompany exists
if 'YearsAtCompany' in hr_df.columns:
    hr_df['TenureMonths'] = hr_df['YearsAtCompany'] * 12


### Encode Categorical Features

In [4]:
categorical_cols = hr_df.select_dtypes(include=['object']).columns.tolist()

# Remove text columns like Attrition, Over18
drop_cols = ['Attrition', 'EmployeeCount', 'Over18', 'EmployeeNumber']
categorical_cols = [col for col in categorical_cols if col not in drop_cols]

hr_categorical = pd.get_dummies(hr_df[categorical_cols], drop_first=True)

print("Encoded categorical columns:", hr_categorical.shape)


Encoded categorical columns: (1470, 21)


### Scale Numerical Columns

In [5]:
from sklearn.preprocessing import StandardScaler

numeric_cols = hr_df.select_dtypes(include=['int64','float64']).columns
numeric_cols = [col for col in numeric_cols if col not in ['Attrition_Flag']]

scaler = StandardScaler()
hr_numeric = pd.DataFrame(scaler.fit_transform(hr_df[numeric_cols]), 
                          columns=numeric_cols)

print("Scaled numeric columns:", hr_numeric.shape)


Scaled numeric columns: (1470, 27)


### Combine Final HR Feature Matrix

In [6]:
hr_final = pd.concat([hr_numeric, hr_categorical, hr_df['Attrition_Flag']], axis=1)

print("Final HR feature matrix:", hr_final.shape)
hr_final.head()


Final HR feature matrix: (1470, 49)


Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes,Attrition_Flag
0,0.44635,0.742527,-1.010909,-0.891688,0.0,-1.701283,-0.660531,1.383138,0.379672,-0.057788,...,False,False,False,False,True,False,False,True,True,1
1,1.322365,-1.297775,-0.14715,-1.868426,0.0,-1.699621,0.254625,-0.240677,-1.026167,-0.057788,...,False,False,False,True,False,False,True,False,False,0
2,0.008343,1.414363,-0.887515,-0.891688,0.0,-1.696298,1.169781,1.284725,-1.026167,-0.961486,...,False,False,False,False,False,False,False,True,True,1
3,-0.429664,1.461466,-0.764121,1.061787,0.0,-1.694636,1.169781,-0.486709,0.379672,-0.961486,...,False,False,False,True,False,False,True,False,True,0
4,-1.086676,-0.524295,-0.887515,-1.868426,0.0,-1.691313,-1.575686,-1.274014,0.379672,-0.961486,...,False,False,False,False,False,False,True,False,False,0


### FEATURE ENGINEERING FOR JOB DESCRIPTIONS

In [7]:
def normalize_text(t):
    t = t.lower()
    t = re.sub(r'[^a-zA-Z0-9 ]', ' ', t)
    t = re.sub(r'\s+', ' ', t)
    return t.strip()

job_df['normalized_description'] = job_df['clean_description'].astype(str).apply(normalize_text)

job_df[['clean_description', 'normalized_description']].head()



Unnamed: 0,clean_description,normalized_description
0,TeamSoft is seeing an IT Support Specialist to...,teamsoft is seeing an it support specialist to...
1,The Wisconsin State Journal is seeking a flexi...,the wisconsin state journal is seeking a flexi...
2,Report this job About the Job DePuy Synthes Co...,report this job about the job depuy synthes co...
3,Why Join Altec? If you’re considering a career...,why join altec if you re considering a career ...
4,Position ID# 76162 # Positions 1 State CT City...,position id 76162 positions 1 state ct city fa...


### FEATURE ENGINEERING FOR RESUMES

In [8]:
# Resume length
resumes_df['resume_length'] = resumes_df['clean_text'].astype(str).apply(len)

# Word count
resumes_df['word_count'] = resumes_df['clean_text'].astype(str).apply(lambda x: len(x.split()))

resumes_df[['filename', 'resume_length', 'word_count']].head()


Unnamed: 0,filename,resume_length,word_count
0,10554236.pdf,24149,3466
1,10674770.pdf,7488,1047
2,11163645.pdf,4742,628
3,11759079.pdf,5917,849
4,12065211.pdf,5561,783


### SKILL EXTRACTION (BASIC)

In [9]:
skill_keywords = [
    "python", "java", "sql", "machine learning", "deep learning",
    "data analysis", "nlp", "pandas", "numpy", "excel", "communication",
    "react", "django", "flask", "aws", "docker", "git"
]


In [10]:
def extract_skills(text):
    found = []
    t = text.lower()
    for skill in skill_keywords:
        if skill in t:
            found.append(skill)
    return list(set(found))

resumes_df['skills_extracted'] = resumes_df['clean_text'].astype(str).apply(extract_skills)
resumes_df[['filename','skills_extracted']].head()


Unnamed: 0,filename,skills_extracted
0,10554236.pdf,"[communication, excel, aws]"
1,10674770.pdf,[excel]
2,11163645.pdf,"[communication, excel]"
3,11759079.pdf,[excel]
4,12065211.pdf,"[excel, sql, communication]"


### FEATURE ENGINEERING FOR REVIEWS

In [11]:
reviews_df['review_length'] = reviews_df['clean_review'].astype(str).apply(len)
reviews_df['word_count'] = reviews_df['clean_review'].astype(str).apply(lambda x: len(x.split()))

reviews_df[['clean_review', 'review_length', 'word_count']].head()


Unnamed: 0,clean_review,review_length,word_count
0,Pricing Analyst,15,2
1,Lead AI / ML / Data Science Engineer,36,8
2,Data Scientist,14,2
3,Research Fellow in data analysis and machine l...,53,8
4,Senior Data Scientist | Media & Entertainment.,46,7


### SAVE PROCESSED OUTPUTS

In [12]:
os.makedirs("features", exist_ok=True)

hr_final.to_csv("features/hr_features.csv", index=False)
job_df.to_csv("features/job_features.csv", index=False)
resumes_df.to_csv("features/resume_features.csv", index=False)
reviews_df.to_csv("features/review_features.csv", index=False)

print("All feature files saved successfully!")


All feature files saved successfully!
