### Custom Dataset code

In [1]:
import pandas as pd
import random
import numpy as np

# Seed both the stdlib and NumPy generators so repeated runs build the same dataset
seed = 42
np.random.seed(seed)
random.seed(seed)

# Value pools that the synthetic candidates are drawn from
names = [
    "Alex", "Jamie", "Chris", "Jordan", "Taylor",
    "Morgan", "Casey", "Drew", "Pat", "Cameron",
]
genders = ["Male", "Female", "Non-binary"]
races = ["White", "Black", "Asian", "Hispanic", "Other"]
ages = [*range(21, 60)]  # 21 through 59 inclusive

# Pool of one-sentence resume summaries
resumetxt = [
    "Experienced software engineer with expertise in Python and Java.",
    "Marketing specialist with a strong background in digital campaigns.",
    "Data analyst proficient in machine learning and visualization tools.",
    "Project manager with a history of successful team leadership.",
    "Graphic designer skilled in Adobe Creative Suite and UX design.",
    "Sales associate with exceptional communication and negotiation skills.",
    "Business analyst with expertise in process optimization and strategy.",
    "Content writer specializing in SEO and technical writing.",
    "IT support specialist with certifications in networking and security.",
    "Research scientist with publications in artificial intelligence.",
]

# Defining bias probabilities based on NBER and related research
def calculateprob(gender, race, age):
    """Return the shortlisting probability for one synthetic candidate.

    The score starts at 0.5, receives one additive adjustment each for
    gender, race and age, and is finally clamped to the range [0, 1].

    Args:
        gender: "Male" (+0.3), "Female" (-0.3) or "Non-binary" (-0.6).
            Any other value leaves the score unchanged.
        race: "White" (+0.3), "Black" or "Hispanic" (-0.6). Any other
            value (e.g. "Asian", "Other") leaves the score unchanged.
        age: Number compared against the cut-offs 30 and 45:
            below 30 -> +0.3, 30 to 45 inclusive -> +0.1, above 45 -> -0.6.

    Returns:
        The adjusted score, never below 0 and never above 1.
    """
    # Additive adjustments per category; labels not listed are neutral
    gender_shift = {"Male": 0.3, "Female": -0.3, "Non-binary": -0.6}
    race_shift = {"White": 0.3, "Black": -0.6, "Hispanic": -0.6}

    # Age bands: <30, 30-45 inclusive, >45
    age_shift = 0.0
    if age < 30:
        age_shift = 0.3
    elif 30 <= age <= 45:
        age_shift = 0.1
    elif age > 45:
        age_shift = -0.6

    # Summed left to right in the order base, gender, race, age
    prob = 0.5 + gender_shift.get(gender, 0.0) + race_shift.get(race, 0.0) + age_shift

    # Clamp to a valid probability
    return min(max(prob, 0), 1)

# Target size of the dataset; each class (shortlisted / not shortlisted) gets half
nsamp = 2000
samp = nsamp // 2

# Family names paired with the first names above
surnames = ["Smith", "Johnson", "Brown", "Lee", "Garcia"]

# One bucket per label so the two classes can be filled to the same size
data0, data1 = [], []
buckets = {0: data0, 1: data1}

# Keep drawing candidates until both buckets are full; draws for a bucket that
# is already full are discarded.
while min(len(data0), len(data1)) < samp:
    # The order of the random draws below determines the seeded output
    name = f"{random.choice(names)} {random.choice(surnames)}"
    gender = random.choice(genders)
    age = random.choice(ages)
    race = random.choice(races)
    rstxt = random.choice(resumetxt)
    # Prefix the summary with the candidate's name and protected attributes
    rstxt = f"{name}, a {gender}, aged {age}, of {race} ethnicity. {rstxt}"

    # Draw the label from the biased probability
    shortlistprob = calculateprob(gender, race, age)
    shortlisted = int(random.random() < shortlistprob)

    bucket = buckets[shortlisted]
    if len(bucket) < samp:
        bucket.append([name, gender, age, race, rstxt, shortlisted])

# Merge the two classes and shuffle so the labels are interleaved
data = [*data0, *data1]
random.shuffle(data)

# Tabulate the rows
column_names = ["Name", "Gender", "Age", "Race", "Resume Text", "Shortlisted"]
df = pd.DataFrame(data, columns=column_names)

# Persist to CSV (no index column)
df.to_csv("balanced_biased_resume_dataset.csv", index=False)
print("Balanced biased dataset saved as 'balanced_biased_resume_dataset.csv'.")


Balanced biased dataset saved as 'balanced_biased_resume_dataset.csv'.


### Baseline model code

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
import scipy.sparse as sp

# Loading the dataset
df = pd.read_csv('balanced_biased_resume_dataset.csv')

# Preprocessing: replace every non-word character with a space, then lower-case.
# regex=True must be passed explicitly: from pandas 2.0 onwards Series.str.replace
# treats the pattern as a literal string by default, so r'\W' would only match the
# two literal characters "\W" and the cleaning step would silently do nothing.
df['Resume Text'] = df['Resume Text'].str.replace(r'\W', ' ', regex=True)
df['Resume Text'] = df['Resume Text'].str.lower()

# Encoding categorical columns: 'Gender' and 'Race'
# LabelEncoder maps each distinct value to an integer code.
# NOTE(review): Gender and Race are nominal, so these integer codes impose an
# arbitrary order on the categories; one-hot encoding would avoid that. Kept
# as-is here so downstream column names and results are unchanged.
legender = LabelEncoder()
df['Genderen'] = legender.fit_transform(df['Gender'])

lerace = LabelEncoder()
df['Raceen'] = lerace.fit_transform(df['Race'])

# Bucketing 'Age' into labelled ranges; pd.cut bins are right-inclusive by default,
# i.e. (20, 30], (30, 40], (40, 50], (50, 60], (60, 100].
# NOTE(review): ages outside (20, 100] would become NaN — confirm none exist in the CSV.
df['Agebucket'] = pd.cut(df['Age'], bins=[20, 30, 40, 50, 60, 100], labels=['20-30', '31-40', '41-50', '51-60', '60+'])

# Encoding Age categories as integer codes
label_encoder_age = LabelEncoder()
df['Ageen'] = label_encoder_age.fit_transform(df['Agebucket'])

# Features and target separation
X = df[['Resume Text', 'Genderen', 'Ageen', 'Raceen']]
y = df['Shortlisted']  # 1 = shortlisted, 0 = not shortlisted

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42, test_size=0.2)

# TF-IDF text features: the vocabulary is fitted on the training text only and
# then reused to transform the test text
vectorizer = TfidfVectorizer(stop_words='english')
Xtraintxt = vectorizer.fit_transform(Xtrain['Resume Text'])
Xtesttxt = vectorizer.transform(Xtest['Resume Text'])

# Append the encoded protected attributes (Gender, Age, Race) as the last three
# columns of the feature matrix, after the TF-IDF columns
protected_cols = ['Genderen', 'Ageen', 'Raceen']
Xtraincomb = sp.hstack([Xtraintxt, Xtrain[protected_cols].values])
Xtestcomb = sp.hstack([Xtesttxt, Xtest[protected_cols].values])

# Fit the baseline classifier
model = LogisticRegression(random_state=42)
model.fit(Xtraincomb, ytrain)

# Predict on the held-out rows
ypred = model.predict(Xtestcomb)

# Report evaluation metrics
accuracy = accuracy_score(ytest, ypred)
report = classification_report(ytest, ypred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

# Bias detection: inspect how Gender, Age and Race influence the predictions.
# The encoded attributes were stacked after the TF-IDF features in the order
# Gender, Age, Race, so their weights are the last three entries of the
# coefficient row.
# NOTE(review): each attribute is a single integer-coded column, so one weight
# only describes a linear trend across the code order, and the resume text may
# also carry the same attributes — interpret these numbers with caution.
coefficients = model.coef_

gender_coef = coefficients[0, -3:-2]
age_coef = coefficients[0, -2:-1]
race_coef = coefficients[0, -1:]

# Show whether each protected attribute pushes the shortlisting likelihood up or down
print("Model Coefficients for Protected Attributes (Gender, Age, Race):")
print(f"Gender Coefficients: {gender_coef}")  # weight of the Gender column
print(f"Age Coefficients: {age_coef}")     # weight of the Age column
print(f"Race Coefficients: {race_coef}")     # weight of the Race column

Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       216
           1       0.84      0.84      0.84       184

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400

Model Coefficients for Protected Attributes (Gender, Age, Race):
Gender Coefficients: [-0.15639141]
Age Coefficients: [-1.10251346]
Race Coefficients: [0.42356932]
