In [8]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [9]:
df = pd.read_csv('data/resumes.csv')
print("dataset shape:", df.shape)
df.head()


dataset shape: (1000, 11)


Unnamed: 0,Resume_ID,Name,Skills,Experience (Years),Education,Certifications,Job Role,Recruiter Decision,Salary Expectation ($),Projects Count,AI Score (0-100)
0,1,Ashley Ali,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,Hire,104895,8,100
1,2,Wesley Roman,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,Hire,113002,1,100
2,3,Corey Sanchez,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,Hire,71766,7,70
3,4,Elizabeth Carney,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,Hire,46848,0,95
4,5,Julie Hill,"SQL, React, Java",4,PhD,,Software Engineer,Hire,87441,9,100


In [10]:
df.columns

Index(['Resume_ID', 'Name', 'Skills', 'Experience (Years)', 'Education',
       'Certifications', 'Job Role', 'Recruiter Decision',
       'Salary Expectation ($)', 'Projects Count', 'AI Score (0-100)'],
      dtype='object')

In [11]:
bias_columns=[
    "Name","Recruiter Decision","Salary Expectation ($)"]
df_fair = df.drop(columns=bias_columns)
df_fair.head()

Unnamed: 0,Resume_ID,Skills,Experience (Years),Education,Certifications,Job Role,Projects Count,AI Score (0-100)
0,1,"TensorFlow, NLP, Pytorch",10,B.Sc,,AI Researcher,8,100
1,2,"Deep Learning, Machine Learning, Python, SQL",10,MBA,Google ML,Data Scientist,1,100
2,3,"Ethical Hacking, Cybersecurity, Linux",1,MBA,Deep Learning Specialization,Cybersecurity Analyst,7,70
3,4,"Python, Pytorch, TensorFlow",7,B.Tech,AWS Certified,AI Researcher,0,95
4,5,"SQL, React, Java",4,PhD,,Software Engineer,9,100


In [12]:
df_fair["selected"]=df_fair["AI Score (0-100)"].apply(lambda x: 1 if x>=70 else 0)
df_fair[["AI Score (0-100)","selected"]].head()


Unnamed: 0,AI Score (0-100),selected
0,100,1
1,100,1
2,70,1
3,95,1
4,100,1


In [16]:
X_text=df_fair["Skills"]
y=df_fair["selected"]

print("target distribution:")
y.value_counts()

target distribution:


selected
1    773
0    227
Name: count, dtype: int64

In [17]:
tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=1000
)

X = tfidf.fit_transform(X_text)

print("TF-IDF feature matrix shape:", X.shape)


TF-IDF feature matrix shape: (1000, 15)


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [20]:
model=LogisticRegression(max_iter=1000)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)


In [21]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.775

Classification Report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        45
           1       0.78      1.00      0.87       155

    accuracy                           0.78       200
   macro avg       0.39      0.50      0.44       200
weighted avg       0.60      0.78      0.68       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
df_fair["Selection_Probability"] = model.predict_proba(X)[:, 1]

ranked_resumes = df_fair.sort_values(
    "Selection_Probability",
    ascending=False
)

ranked_resumes[
    ["Resume_ID", "Skills", "Experience (Years)", "Projects Count", "Selection_Probability"]
].head(5)


Unnamed: 0,Resume_ID,Skills,Experience (Years),Projects Count,Selection_Probability
265,266,"NLP, TensorFlow",7,1,0.833046
963,964,"TensorFlow, NLP",6,8,0.833046
74,75,"TensorFlow, NLP",9,4,0.833046
788,789,"TensorFlow, NLP",3,10,0.833046
746,747,"NLP, TensorFlow",6,10,0.833046


### Conclusion
This project demonstrates a Bias-Aware Resume Screening System using a Kaggle dataset.
Sensitive attributes were removed, resumes were evaluated using skill-based features,
and candidates were ranked fairly using a machine learning model.
