In [2]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import os

In [3]:
df_position=pd.read_csv("D:/position_control.csv")
df_smart =pd.read_csv("D:/position_control.csv")
df_people=pd.read_csv("D:/peoplesoft.csv")
vacancy_summary=pd.read_csv("D:/vacancy_audit.csv")

In [4]:
vacancy_summary

Unnamed: 0,department,Filled,Vacant,vacancy_rate
0,Finance,8,6,0.428571
1,Human Resources,13,5,0.277778
2,IT,5,6,0.545455
3,Operations,8,6,0.428571
4,Planning,13,4,0.235294
5,Public Works,4,5,0.555556
6,Water Resources,11,6,0.352941


In [5]:
df_people

Unnamed: 0,employee_id,first_name,last_name,department,job_title,hire_date,employment_status,supervisor_id,gender,race_ethnicity,salary
0,10040,John,Tran,Water Resources,Engineer,2024-12-15,Active,10010,Non-binary,Black,146871.92
1,10041,Erin,Wilson,Operations,HR Analyst,2025-11-14,Active,10009,Female,Asian,146937.01
2,10138,Michelle,Rasmussen,Public Works,Data Analyst,2025-03-23,Active,10006,Female,Asian,104293.79
3,10147,Ryan,Meyer,Public Works,Clerk,2025-10-13,Active,10008,Female,White,108717.57
4,10161,Laura,Moyer,Water Resources,HR Analyst,2025-09-22,Active,10006,Male,Asian,132354.75
5,10187,Eric,Lewis,Water Resources,Manager,2025-05-11,Active,10010,Male,Black,99323.37


In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack, csr_matrix
import matplotlib.pyplot as plt
import os

# -------------------------------
# 1Ô∏è‚É£ Load Data
# -------------------------------
df_position = pd.read_csv("D:/position_control.csv")
df_smart = pd.read_csv("D:/smartrecruiters.csv")  # correct SmartRecruiters path
df_people = pd.read_csv("D:/peoplesoft.csv")
vacancy_summary = pd.read_csv("D:/vacancy_audit.csv")

OUTPUT_DIR = "D:/hr_ml_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Data loaded successfully ‚úÖ")

# -------------------------------
# 2Ô∏è‚É£ Resume Text Clustering (KMeans)
# -------------------------------
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_text = tfidf.fit_transform(df_smart['resume_text'].astype(str))

# Reduce dimensionality for visualization
svd = TruncatedSVD(n_components=50, random_state=42)
X_reduced = svd.fit_transform(X_text)

k = 5  # number of clusters
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df_smart['cluster'] = kmeans.fit_predict(X_reduced)

# 2D scatter for visualization
df_smart['svd_1'] = X_reduced[:, 0]
df_smart['svd_2'] = X_reduced[:, 1]

# Cluster label summary (top terms)
terms = tfidf.get_feature_names_out()
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
cluster_label_map = {}
for i in range(k):
    top_terms = [terms[ind] for ind in order_centroids[i, :3]]
    cluster_label_map[i] = ', '.join(top_terms)
df_smart['cluster_label'] = df_smart['cluster'].map(cluster_label_map)
# -------------------------------
# 3Ô∏è‚É£ Classification Model ‚Äì Predict Hire Likelihood
# -------------------------------
df_smart['hired_flag'] = df_smart['stage'].apply(lambda x: 1 if x.lower() == 'hired' else 0)

# Encode categorical structured features
le_edu = LabelEncoder()
df_smart['education_encoded'] = le_edu.fit_transform(df_smart['education_level'].astype(str))

le_dept = LabelEncoder()
df_smart['dept_encoded'] = le_dept.fit_transform(df_smart['department'].astype(str))

le_job = LabelEncoder()
df_smart['job_encoded'] = le_job.fit_transform(df_smart['job_title'].astype(str))

# Combine text + structured data
structured = df_smart[['experience_years', 'education_encoded', 'dept_encoded', 'job_encoded']].astype(float).values
X = hstack([X_text, csr_matrix(structured)], format='csr')
y = df_smart['hired_flag'].values

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train model
clf = RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("\nüìä Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Add prediction probabilities
df_smart['hire_probability'] = clf.predict_proba(X)[:, 1]

Data loaded successfully ‚úÖ





üìä Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        39
           1       0.00      0.00      0.00         1

    accuracy                           0.97        40
   macro avg       0.49      0.50      0.49        40
weighted avg       0.95      0.97      0.96        40

Accuracy: 0.975


In [20]:
df_smart[['svd_1', 'svd_2', 'cluster']].head()

Unnamed: 0,svd_1,svd_2,cluster
0,0.319414,0.033052,4
1,0.443582,-0.150802,1
2,0.311083,0.159932,0
3,0.401802,0.11743,0
4,0.473791,-0.261416,1


In [21]:
df_smart = df_smart.replace([np.inf, -np.inf], np.nan).dropna(subset=['svd_1', 'svd_2'])

In [22]:
df_smart[['resume_text','cluster','cluster_label','svd_1','svd_2','hired_flag','hire_probability']]

Unnamed: 0,resume_text,cluster,cluster_label,svd_1,svd_2,hired_flag,hire_probability
0,"Danielle Johnson skilled in ServiceNow, Excel,...",4,"10, alan, aguilar",0.319414,0.033052,0,0.000
1,"John Taylor skilled in Excel, Recruiting, Powe...",1,"10, adams, allen",0.443582,-0.150802,0,0.030
2,"Erica Mcclain skilled in Recruiting, Python, P...",0,"10, 12, alvarado",0.311083,0.159932,0,0.050
3,"Brittany Johnson skilled in Python, Data Analy...",0,"10, 12, alvarado",0.401802,0.117430,0,0.000
4,"Jeffery Wagner skilled in SQL, HRIS, ServiceNo...",1,"10, adams, allen",0.473791,-0.261416,0,0.000
...,...,...,...,...,...,...,...
195,"Jeffrey Robinson skilled in ServiceNow, SQL, M...",2,"10, analysis, anderson",0.399584,-0.243500,0,0.000
196,"Todd Price skilled in Data Analysis, Power BI,...",4,"10, alan, aguilar",0.425893,-0.034388,0,0.015
197,"Erin Powell skilled in Excel, HRIS, Power BI, ...",4,"10, alan, aguilar",0.397558,-0.115357,0,0.000
198,"Mary Calhoun skilled in Recruiting, Leadership...",4,"10, alan, aguilar",0.360959,-0.057617,0,0.005


In [30]:
# -------------------------------
# 6Ô∏è‚É£ Visualizations (Show + Save)
# -------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ========== (1) Resume Cluster Scatter ==========
plt.figure(figsize=(8, 6))
for cl in sorted(df_smart['cluster'].unique()):
    subset = df_smart[df_smart['cluster'] == cl]
    plt.scatter(subset['svd_1'], subset['svd_2'], label=f"{cl}: {cluster_label_map[cl]}", alpha=0.6)
plt.legend(fontsize=8, loc='best')
plt.xlabel("SVD 1")
plt.ylabel("SVD 2")
plt.title("Candidate Resume Clusters")
plt.tight_layout()

# Save + show
scatter_path = os.path.join(OUTPUT_DIR, "resume_clusters.png")
plt.savefig(scatter_path, dpi=300)
plt.show()
print(f"‚úÖ Cluster scatter plot saved to: {scatter_path}")

# ========== (2) Cluster Summary Bar Chart ==========
cluster_summary = (
    df_smart.groupby('cluster_label')
    .agg(avg_hire_prob=('hire_probability', 'mean'),
         candidate_count=('resume_text', 'count'))
    .reset_index()
    .sort_values('avg_hire_prob', ascending=False)
)

plt.figure(figsize=(10, 5))
sns.barplot(x='cluster_label', y='avg_hire_prob', data=cluster_summary, palette='viridis')
plt.xticks(rotation=45, ha='right')
plt.title("Average Hire Probability by Resume Cluster")
plt.ylabel("Average Hire Probability")
plt.xlabel("Cluster (Top Terms)")
plt.tight_layout()

bar_path = os.path.join(OUTPUT_DIR, "cluster_hire_probability.png")
plt.savefig(bar_path, dpi=300)
plt.show()
print(f"‚úÖ Cluster hire probability bar chart saved to: {bar_path}")

# ========== (3) Feature Importance ==========
feature_importances = clf.feature_importances_
n_features = X.shape[1]
top_indices = np.argsort(feature_importances)[-10:][::-1]
plt.figure(figsize=(8, 5))
plt.barh(range(len(top_indices)), feature_importances[top_indices][::-1])
plt.yticks(range(len(top_indices)), [f"Feature {i}" for i in top_indices[::-1]])
plt.title("Top 10 Most Important Features (Text + Structured)")
plt.tight_layout()

feature_path = os.path.join(OUTPUT_DIR, "feature_importance.png")
plt.savefig(feature_path, dpi=300)
plt.show()
print(f"‚úÖ Feature importance plot saved to: {feature_path}")

print("\nüéØ All visuals displayed and saved successfully!")

  plt.show()

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='cluster_label', y='avg_hire_prob', data=cluster_summary, palette='viridis')


‚úÖ Cluster scatter plot saved to: D:/hr_ml_outputs\resume_clusters.png


  plt.show()


‚úÖ Cluster hire probability bar chart saved to: D:/hr_ml_outputs\cluster_hire_probability.png
‚úÖ Feature importance plot saved to: D:/hr_ml_outputs\feature_importance.png

üéØ All visuals displayed and saved successfully!


  plt.show()


In [31]:
# -------------------------------
# 7Ô∏è‚É£ Confusion Matrix Visualization
# -------------------------------
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
labels = ['Not Hired (0)', 'Hired (1)']

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix ‚Äì Hire Prediction Model')
plt.tight_layout()

cm_path = os.path.join(OUTPUT_DIR, "confusion_matrix.png")
plt.savefig(cm_path, dpi=300)
plt.show()
print(f"‚úÖ Confusion matrix saved to: {cm_path}")

‚úÖ Confusion matrix saved to: D:/hr_ml_outputs\confusion_matrix.png


  plt.show()
