In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential

In [2]:
# Read the job-skills CSV from the current working directory.
df = pd.read_csv("job_skills_dataset_corrected.csv")

# Normalise the skills text that the vectorizer is later fitted on: lower-case it,
# then drop every character that is not a word character, whitespace, or a comma.
# NOTE(review): this also removes '+', '#' and '.', so e.g. "c++" and "c#" both
# collapse to "c" — confirm that is acceptable for skill matching.
X_raw = (
    df["Skills Required"]
    .str.lower()
    .str.replace(r"[^\w\s,]", "", regex=True)
)

# Distinct lower-cased job titles found in the dataset.
job_roles = df["Job Title"].str.lower().unique()

In [None]:
# Containers for the synthetic (feature vector, match score) training pairs
# that are filled in by the sampling loop further down.
samples = []
labels = []

# The tokenizer is a named, importable function rather than a lambda so that the
# fitted vectorizer can be pickled when it is saved with joblib.
from utils import comma_tokenizer

# Fit TF-IDF over the cleaned skills strings. token_pattern is ignored whenever a
# custom tokenizer is supplied; passing None states that explicitly and avoids
# scikit-learn's "token_pattern will not be used" UserWarning.
vectorizer = TfidfVectorizer(tokenizer=comma_tokenizer, token_pattern=None)
X_tfidf = vectorizer.fit_transform(X_raw)

In [11]:
# Create synthetic training data
# Each sample pairs a random "resume" row with a random "job description" row and
# labels it with their cosine similarity plus a small experience bonus.
rng = np.random.default_rng(42)  # fixed seed so a re-run reproduces the same pairs

# Densify the TF-IDF matrix once up front instead of calling .toarray() on the
# whole matrix (and again on single rows) inside every iteration.
X_dense = X_tfidf.toarray()
n_docs = X_dense.shape[0]

# Start from empty lists so re-running this cell does not append duplicates.
samples = []
labels = []

for _ in range(2000):
    i1, i2 = rng.integers(0, n_docs, size=2)
    resume_vec = X_dense[i1]
    jd_vec = X_dense[i2]
    role_vec = jd_vec  # assume JD and role are from same record
    exp = int(rng.integers(0, 11))  # 0–10 years

    # Feature layout: [resume tf-idf | jd tf-idf | role tf-idf | experience scaled to 0–1]
    combined = np.concatenate([resume_vec, jd_vec, role_vec, [exp / 10]])

    # Cosine similarity; the 1e-6 keeps the division defined for all-zero rows.
    denom = np.linalg.norm(resume_vec) * np.linalg.norm(role_vec) + 1e-6
    match_score = np.dot(resume_vec, role_vec) / denom
    match_score = min(1.0, max(0.0, match_score + exp * 0.02))  # boost for experience

    samples.append(combined)
    labels.append(match_score)

In [12]:
# Hold out 20% of the synthetic pairs for testing; the fixed random_state keeps
# the partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    samples,
    labels,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Build model
# Feed-forward regressor over the concatenated feature vector. The input width is
# declared with an explicit Input layer (the form Keras recommends for Sequential
# models) instead of the input_shape= argument on the first Dense layer.
model = Sequential([
    Input(shape=(len(samples[0]),)),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(1, activation='sigmoid')  # score between 0–1
])

In [14]:
# Regression setup: minimise mean squared error on the match score and report
# mean absolute error alongside it.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# X_train / y_train are converted to NumPy arrays for Keras; 10% of the training
# portion is held back for per-epoch validation.
model.fit(
    np.array(X_train),
    np.array(y_train),
    epochs=15,
    batch_size=32,
    validation_split=0.1,
)

Epoch 1/15
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - loss: 0.0723 - mae: 0.2194 - val_loss: 0.0258 - val_mae: 0.1046
Epoch 2/15
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0253 - mae: 0.1029 - val_loss: 0.0235 - val_mae: 0.0926
Epoch 3/15
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0182 - mae: 0.0837 - val_loss: 0.0203 - val_mae: 0.0912
Epoch 4/15
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0148 - mae: 0.0722 - val_loss: 0.0181 - val_mae: 0.0807
Epoch 5/15
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0107 - mae: 0.0594 - val_loss: 0.0158 - val_mae: 0.0762
Epoch 6/15
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0070 - mae: 0.0514 - val_loss: 0.0131 - val_mae: 0.0723
Epoch 7/15
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0065 

<keras.src.callbacks.history.History at 0x27fa3b9ac60>

In [15]:
# Persist the trained network to the working directory; the .h5 extension selects
# the HDF5 file format.
# NOTE(review): recent Keras releases treat .h5 as a legacy format and recommend
# ".keras" — confirm what the loading code expects before changing the extension.
model.save("match_score_model.h5")



In [16]:
# Save vectorizer
# Persist the fitted TfidfVectorizer so new text can be mapped onto the same
# feature columns the model was trained on.
# NOTE(review): the pickle stores the custom tokenizer by reference, so the
# `utils` module presumably has to be importable wherever this file is loaded —
# confirm against the loading code.
import joblib
joblib.dump(vectorizer, "match_vectorizer.pkl")

['match_vectorizer.pkl']