In [1]:
import os
directory = "datasets"
files = os.listdir(directory)

In [2]:
import pandas as pd

directory = "datasets"

data = pd.concat(
    [
        pd.read_excel(os.path.join(directory, file)).rename(
            columns=lambda x: x.strip().replace(" ", "_").translate(str.maketrans("", "", r"""!"#$%&'()*+,./:;<=>?@[\]^`{|}~"""))
        )
        for file in os.listdir(directory)
        if file.endswith((".xlsx", ".xls"))
    ],
    ignore_index=True, 
)



In [3]:
# Calculate resume and job description similarity (Cosine Similarity)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()
resume_jd_similarity = []
for i in range(len(data)):
    resume = data['Resume'][i]
    jd = data['Job_Description'][i]
    similarity = cosine_similarity(vectorizer.fit_transform([resume, jd]))[0, 1]
    resume_jd_similarity.append(similarity)
data['resume_jd_similarity'] = resume_jd_similarity

In [4]:
# Calculate resume and transcript similarity (Cosine Similarity)
resume_transcript_similarity = []
for i in range(len(data)):
    resume = data['Resume'][i]
    transcript = data['Transcript'][i]
    similarity = cosine_similarity(vectorizer.fit_transform([resume, transcript]))[0, 1]
    resume_transcript_similarity.append(similarity)
data['resume_transcript_similarity'] = resume_transcript_similarity

In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Perform sentiment analysis on each transcript
data['sentiment'] = data['Transcript'].apply(lambda transcript: sia.polarity_scores(transcript)['compound'])

# Overall average sentiment score
average_sentiment = data['sentiment'].mean()




In [6]:
# Function to calculate lexical diversity
def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words)

# Compute lexical diversity for each transcript
data['lexical_diversity'] = data['Transcript'].apply(lexical_diversity)

# Calculate statistics
average_diversity = data['lexical_diversity'].mean()


'''Lexical diversity measures the variety of unique words in a text relative to the total number of words. 
'''


'Lexical diversity measures the variety of unique words in a text relative to the total number of words. \n'

In [7]:
# Length of transcript (number of words)
data['transcript_length_words'] = data['Transcript'].apply(lambda x: len(x.split()))

# Calculate statistics
average_length = data['transcript_length_words'].mean()
min_length = data['transcript_length_words'].min()
max_length = data['transcript_length_words'].max()


In [8]:
#Reason for Decision Length
data['reason_length'] = data['Reason_for_decision'].str.split().apply(len)


In [9]:
# Resume length (number of words)
data['resume_length'] = data['Resume'].apply(lambda x: len(x.split()))

In [10]:
#Word Count Ratio
data['word_count_ratio'] = data['transcript_length_words'] / data['resume_length']


In [11]:
#Role to Transcript Similarity

def text_similarity(text1, text2):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(vectors[0], vectors[1])[0, 0]

data['role_transcript_similarity'] = data.apply(
    lambda row: text_similarity(row['Role'], row['Transcript']), axis=1
)


In [12]:
from textblob import TextBlob
data['cultural_fit_sentiment'] = data['Reason_for_decision'].apply(lambda x: TextBlob(x).sentiment.polarity)


In [13]:
#Job Description to Transcript Sentiment Gap
data['jd_transcript_sentiment_gap'] = data['sentiment'] - data['cultural_fit_sentiment']


In [14]:
#Job Description Length
data['job_desc_length'] = data['Job_Description'].str.split().apply(len)


In [15]:
#Role to Resume Similarity
data['role_resume_similarity'] = data.apply(
    lambda row: text_similarity(row['Role'], row['Resume']), axis=1
)


In [16]:
#Combined Text Similarity
data['combined_text_similarity'] = (
    data['resume_jd_similarity'] + data['resume_transcript_similarity']
) / 2


In [17]:
#Sentiment to Lexical Diversity Ratio
data['sentiment_to_diversity_ratio'] = data['sentiment'] / data['lexical_diversity']


In [18]:
#clarity score
import textstat
data['clarity_score'] = data['Transcript'].apply(lambda x: textstat.flesch_reading_ease(x))


In [19]:
#confidence score
data['confidence_score'] = data['Transcript'].apply(lambda x: x.count('I think') + x.count('Maybe'))


In [20]:
#Clarity and Confidence Interaction
data['clarity_confidence_interaction'] = data['clarity_score'] * data['confidence_score']


In [21]:
#Soft Skills
from textblob import TextBlob

data['soft_skills_sentiment'] = data['Transcript'].apply(lambda x: TextBlob(x).sentiment.polarity)


In [22]:
# Length of transcript (number of characters)
data['transcript_length_characters'] = data['Transcript'].apply(len)

# Calculate statistics
average_length = data['transcript_length_characters'].mean()
min_length = data['transcript_length_characters'].min()
max_length = data['transcript_length_characters'].max()

In [23]:
# Function to compute similarity score between Resume and Job Description
def compute_similarity(text1, text2):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Calculate technical skill matching score
data['technical_skill_match'] = data.apply(lambda row: compute_similarity(row['Resume'], row['Job_Description']), axis=1)


In [24]:
# Function to compute similarity score between Resume and Job Description
def compute_similarity(text1, text2):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Calculate technical skill matching score
data['technical_skill_match2'] = data.apply(lambda row: compute_similarity(row['Resume'], row['Transcript']), axis=1)


In [25]:
# Function to compute similarity score between Resume and Job Description
def compute_similarity(text1, text2):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Calculate technical skill matching score
data['technical_skill_match3'] = data.apply(lambda row: compute_similarity(row['Job_Description'], row['Transcript']), axis=1)


In [26]:
# Job Description Experience Match (Simple matching based on keywords, could be improved)
data['job_description_experience_match'] = data.apply(lambda row: len(set(row['Resume'].split()) & set(row['Job_Description'].split())), axis=1)


In [27]:
#job score
def job_fit_analysis(job_desc, transcript):
    # You can use similarity or keyword matching here
    job_keywords = job_desc.split()
    transcript_keywords = transcript.split()
    common_keywords = set(job_keywords).intersection(transcript_keywords)
    return len(common_keywords) / len(job_keywords)

data['job_fit_score'] = data.apply(lambda row: job_fit_analysis(row['Job_Description'], row['Transcript']), axis=1)


In [28]:
#job description complexity
import textstat
data['job_desc_complexity'] = data['Job_Description'].apply(lambda x: textstat.flesch_reading_ease(x))

In [29]:
#interaction quality check
data['interaction_quality'] = data['num_words_in_transcript'] * data['sentiment']

In [30]:
# Text complexity (resume and transcript - using a simple metric like Flesch Reading Ease)
def text_complexity(text):
    # Implement text complexity (e.g., Flesch Reading Ease)
    # Here's a placeholder function:
    return len(text.split()) / len(set(text.split()))  # A basic metric

data['text_complexity_transcript'] = data['Transcript'].apply(text_complexity)
data['text_complexity_resume'] = data['Resume'].apply(text_complexity)


In [31]:
# Encoding the target variable (select/reject)
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
data['decision'] = le.fit_transform(data['decision'])  # 0: reject, 1: select


In [34]:
data.columns

Index(['ID', 'Name', 'Role', 'Transcript', 'Resume', 'decision',
       'Reason_for_decision', 'Job_Description', 'num_words_in_transcript',
       'resume_jd_similarity', 'resume_transcript_similarity', 'sentiment',
       'lexical_diversity', 'transcript_length_words', 'reason_length',
       'resume_length', 'word_count_ratio', 'role_transcript_similarity',
       'cultural_fit_sentiment', 'jd_transcript_sentiment_gap',
       'job_desc_length', 'role_resume_similarity', 'combined_text_similarity',
       'sentiment_to_diversity_ratio', 'clarity_score', 'confidence_score',
       'clarity_confidence_interaction', 'soft_skills_sentiment',
       'transcript_length_characters', 'technical_skill_match',
       'technical_skill_match2', 'technical_skill_match3',
       'job_description_experience_match', 'job_fit_score',
       'job_desc_complexity', 'interaction_quality',
       'text_complexity_transcript', 'text_complexity_resume'],
      dtype='object')

In [36]:
from transformers import BertTokenizer, BertModel
import torch

In [37]:
from transformers import DistilBertTokenizer, DistilBertModel

In [38]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [39]:
def get_bert_embeddings_batch(texts, tokenizer, model, batch_size=32):
    embeddings = []
    total_batches = (len(texts) + batch_size - 1) // batch_size  # Total number of batches
    print(f"Total Batches: {total_batches}")
    
    for i in range(total_batches):
        # Print progress
        print(f"Processing batch {i + 1}/{total_batches}...")
        
        # Get the current batch
        batch = texts[i * batch_size:(i + 1) * batch_size]
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use the [CLS] token representation for each text in the batch
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.extend(batch_embeddings)
    
    return embeddings

In [41]:
# Apply batch processing
texts = data['Transcript'].tolist()

# Generate BERT embeddings
batch_embeddings = get_bert_embeddings_batch(texts, tokenizer, model, batch_size=32)

# Convert embeddings to a list of Python lists
batch_embeddings = [emb.tolist() for emb in batch_embeddings]

# Add embeddings as a new column in the DataFrame
data['bert_embeddings_trans'] = batch_embeddings


Total Batches: 100
Processing batch 1/100...
Processing batch 2/100...
Processing batch 3/100...
Processing batch 4/100...
Processing batch 5/100...
Processing batch 6/100...
Processing batch 7/100...
Processing batch 8/100...
Processing batch 9/100...
Processing batch 10/100...
Processing batch 11/100...
Processing batch 12/100...
Processing batch 13/100...
Processing batch 14/100...
Processing batch 15/100...
Processing batch 16/100...
Processing batch 17/100...
Processing batch 18/100...
Processing batch 19/100...
Processing batch 20/100...
Processing batch 21/100...
Processing batch 22/100...
Processing batch 23/100...
Processing batch 24/100...
Processing batch 25/100...
Processing batch 26/100...
Processing batch 27/100...
Processing batch 28/100...
Processing batch 29/100...
Processing batch 30/100...
Processing batch 31/100...
Processing batch 32/100...
Processing batch 33/100...
Processing batch 34/100...
Processing batch 35/100...
Processing batch 36/100...
Processing batch 3

In [43]:
# Apply batch processing
texts = data['Resume'].tolist()

# Generate BERT embeddings
batch_embeddings = get_bert_embeddings_batch(texts, tokenizer, model, batch_size=32)

# Convert embeddings to a list of Python lists
batch_embeddings = [emb.tolist() for emb in batch_embeddings]

# Add embeddings as a new column in the DataFrame
data['bert_embeddings_resume'] = batch_embeddings


Total Batches: 100
Processing batch 1/100...
Processing batch 2/100...
Processing batch 3/100...
Processing batch 4/100...
Processing batch 5/100...
Processing batch 6/100...
Processing batch 7/100...
Processing batch 8/100...
Processing batch 9/100...
Processing batch 10/100...
Processing batch 11/100...
Processing batch 12/100...
Processing batch 13/100...
Processing batch 14/100...
Processing batch 15/100...
Processing batch 16/100...
Processing batch 17/100...
Processing batch 18/100...
Processing batch 19/100...
Processing batch 20/100...
Processing batch 21/100...
Processing batch 22/100...
Processing batch 23/100...
Processing batch 24/100...
Processing batch 25/100...
Processing batch 26/100...
Processing batch 27/100...
Processing batch 28/100...
Processing batch 29/100...
Processing batch 30/100...
Processing batch 31/100...
Processing batch 32/100...
Processing batch 33/100...
Processing batch 34/100...
Processing batch 35/100...
Processing batch 36/100...
Processing batch 3

In [45]:
# Apply batch processing
texts = data['Job_Description'].tolist()

# Generate BERT embeddings
batch_embeddings = get_bert_embeddings_batch(texts, tokenizer, model, batch_size=32)

# Convert embeddings to a list of Python lists
batch_embeddings = [emb.tolist() for emb in batch_embeddings]

# Add embeddings as a new column in the DataFrame
data['bert_embeddings_jd'] = batch_embeddings


Total Batches: 100
Processing batch 1/100...
Processing batch 2/100...
Processing batch 3/100...
Processing batch 4/100...
Processing batch 5/100...
Processing batch 6/100...
Processing batch 7/100...
Processing batch 8/100...
Processing batch 9/100...
Processing batch 10/100...
Processing batch 11/100...
Processing batch 12/100...
Processing batch 13/100...
Processing batch 14/100...
Processing batch 15/100...
Processing batch 16/100...
Processing batch 17/100...
Processing batch 18/100...
Processing batch 19/100...
Processing batch 20/100...
Processing batch 21/100...
Processing batch 22/100...
Processing batch 23/100...
Processing batch 24/100...
Processing batch 25/100...
Processing batch 26/100...
Processing batch 27/100...
Processing batch 28/100...
Processing batch 29/100...
Processing batch 30/100...
Processing batch 31/100...
Processing batch 32/100...
Processing batch 33/100...
Processing batch 34/100...
Processing batch 35/100...
Processing batch 36/100...
Processing batch 3

In [128]:
trans_expanded = pd.DataFrame(data['bert_embeddings_trans'].tolist(), index=data.index)
trans_expanded.columns = [f'trans_emb_{i}' for i in range(trans_expanded.shape[1])]

In [129]:
resume_expanded = pd.DataFrame(data['bert_embeddings_resume'].tolist(), index=data.index)
resume_expanded.columns = [f'resume_emb_{i}' for i in range(resume_expanded.shape[1])]

In [130]:
jd_expanded = pd.DataFrame(data['bert_embeddings_jd'].tolist(), index=data.index)
jd_expanded.columns = [f'jd_emb_{i}' for i in range(resume_expanded.shape[1])]

In [131]:
df_expanded = pd.concat([data, trans_expanded, resume_expanded], axis=1)

In [132]:
label_encoder = LabelEncoder()
df_expanded['decision_encoded'] = label_encoder.fit_transform(df_expanded['decision'])


In [139]:
# Define features and target
feature_columns = [col for col in df_expanded.columns if col.startswith('trans_emb_') or col.startswith('resume_emb_') or col.startswith('jd_emb_')]
X = df_expanded[feature_columns]
y = df_expanded['decision_encoded']

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [147]:
import xgboost as xgb
import numpy as np

# Convert data to NumPy arrays
X_train_np = X_train.to_numpy()
X_test_np = X_test.to_numpy()
y_train_np = y_train.to_numpy()
y_test_np = y_test.to_numpy()

# Create DMatrix
dtrain = xgb.DMatrix(X_train_np, label=y_train_np)
dtest = xgb.DMatrix(X_test_np, label=y_test_np)

# Set parameters and train the model
xgb_params = {
    'objective': 'multi:softmax',
    'num_class': len(np.unique(y_train_np)),
    'max_depth': 6,
    'eta': 0.1,
    'eval_metric': 'mlogloss',
    'seed': 42
}
num_rounds = 100
xgb_model = xgb.train(xgb_params, dtrain, num_rounds)

# Predict using the XGBoost model
y_pred_xgb = xgb_model.predict(dtest)

print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb)*100)


XGBoost Accuracy: 83.30708661417323


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Define ANN model
ann_model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(len(np.unique(y_train)), activation='softmax')
])

# Compile the ANN model
ann_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the ANN model
ann_model.fit(X_train, y_train, epochs=1000, batch_size=32, verbose=1)

# Predict using the ANN model
y_pred_ann_prob = ann_model.predict(X_test)
y_pred_ann = np.argmax(y_pred_ann_prob, axis=1)

print("ANN Accuracy:", accuracy_score(y_test, y_pred_ann)*100)



Epoch 1/1000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.5955 - loss: 0.6648
Epoch 2/1000
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7807 - loss: 0.4319
Epoch 3/1000
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8168 - loss: 0.3616
Epoch 4/1000
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8065 - loss: 0.3496
Epoch 5/1000
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8311 - loss: 0.3163
Epoch 6/1000
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8276 - loss: 0.2988
Epoch 7/1000
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8330 - loss: 0.3000
Epoch 8/1000
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8316 - loss: 0.2949
Epoch 9/1000
[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━

In [149]:
from sklearn.metrics import accuracy_score

# Combine predictions (simple average ensemble)
y_pred_ensemble = (y_pred_xgb + y_pred_ann) / 2
y_pred_ensemble = np.round(y_pred_ensemble).astype(int)

# Evaluate accuracy
print("Ensemble Accuracy:", accuracy_score(y_test, y_pred_ensemble)*100)


Ensemble Accuracy: 83.46456692913385


In [150]:
from sklearn.metrics import classification_report

# Assuming y_test are the true labels and y_pred_ensemble are the predictions from your ensemble model
print("\nClassification Report:\n", classification_report(y_test, y_pred_ensemble))



Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.89      0.85       322
           1       0.87      0.78      0.82       313

    accuracy                           0.83       635
   macro avg       0.84      0.83      0.83       635
weighted avg       0.84      0.83      0.83       635

