In [22]:
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from scipy.sparse import hstack, vstack
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt

In [23]:
dataset = load_dataset("bwbayu/job_cv_supervised")

In [24]:
df = dataset['train'].to_pandas().reset_index(drop=True)

In [25]:
del df['Unnamed: 0']

In [26]:
df = df.rename(columns={'clean_cv': 'resume_text'})
df = df.rename(columns={'clean_jd': 'job_description_text'})

In [27]:
df['combined_text'] = df['resume_text'] + " [SEP] " + df['job_description_text']

### Split the data

In [28]:
#Get unique CVs
unique_cvs = df['resume_text'].unique()

# Split CVs
train_cvs, temp_cvs = train_test_split(
    unique_cvs, test_size=0.3, random_state=1
)

val_cvs, test_cvs = train_test_split(
    temp_cvs, test_size=0.5, random_state=1
)

# Create splits
df_train = df[df['resume_text'].isin(train_cvs)]
df_val   = df[df['resume_text'].isin(val_cvs)]
df_test  = df[df['resume_text'].isin(test_cvs)]

# Reset index
df_train = df_train.reset_index(drop=True)
df_val   = df_val.reset_index(drop=True)
df_test  = df_test.reset_index(drop=True)

y_train = df_train.label.values
y_val = df_val.label.values
y_test = df_test.label.values

del df_train['label']
del df_val['label']
del df_test['label']

### tf-idf and cosine similarity

In [29]:
# TF-IDF Vectorizer
# Create SHARED vocabulary for cosine similarity
shared_tfidf = TfidfVectorizer(max_features=5000)

# Fit on BOTH resumes and JDs
all_train_text = pd.concat([
    df_train['resume_text'], 
    df_train['job_description_text']
])
shared_tfidf.fit(all_train_text)


X_train_resume = shared_tfidf.transform(df_train['resume_text'])
X_train_jd     = shared_tfidf.transform(df_train['job_description_text'])

X_val_resume = shared_tfidf.transform(df_val['resume_text'])
X_val_jd     = shared_tfidf.transform(df_val['job_description_text'])

X_test_resume = shared_tfidf.transform(df_test['resume_text'])
X_test_jd     = shared_tfidf.transform(df_test['job_description_text'])


# Function to compute cosine similarity row-wise
def rowwise_cosine(X1, X2):
    X1_norm = normalize(X1)
    X2_norm = normalize(X2)
    return np.array(X1_norm.multiply(X2_norm).sum(axis=1)).ravel()

# Compute cosine similarity efficiently
cos_sim_train = rowwise_cosine(X_train_resume, X_train_jd)
cos_sim_val   = rowwise_cosine(X_val_resume, X_val_jd)
cos_sim_test  = rowwise_cosine(X_test_resume, X_test_jd)

In [30]:
tfidf = TfidfVectorizer(max_features=5000)

X_train_combined = tfidf.fit_transform(df_train['combined_text'])
X_val_combined   = tfidf.transform(df_val['combined_text'])
X_test_combined  = tfidf.transform(df_test['combined_text'])

X_train_final = hstack([X_train_combined, cos_sim_train.reshape(-1, 1)])
X_val_final = hstack([X_val_combined, cos_sim_val.reshape(-1, 1)])
X_test_final = hstack([X_test_combined, cos_sim_test.reshape(-1, 1)])

### Using the model:xgboost

In [31]:
tfidf_features = list(tfidf.get_feature_names_out())
features = tfidf_features + ["cosine_similarity"]

# Combine train + val
X_trainval_final = vstack([X_train_final, X_val_final])
y_trainval = np.concatenate([y_train, y_val])

dtrain = xgb.DMatrix(X_train_final, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val_final, label=y_val, feature_names=features)
dtrainval = xgb.DMatrix(X_trainval_final, label=y_trainval, feature_names=features)
dtest = xgb.DMatrix(X_test_final, label=y_test, feature_names=features)

In [38]:
watchlist = [(dtrain, 'train'), (dval, 'val')]

xgb_params = {
        'eta': 0.15,
        'max_depth': 6,
        'min_child_weight': 2,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'nthread': 8,
        'seed': 1,
    }

model = xgb.train(xgb_params, dtrainval, num_boost_round=400)

In [37]:
y_test_proba = model.predict(dtest)
y_train_proba = model.predict(dtrainval)
y_pred_test  = (y_test_proba >= 0.5).astype(int)
auc_test = roc_auc_score(y_test, y_test_proba)
auc_train = roc_auc_score(y_trainval, y_train_proba)
print(auc_test, auc_train)

0.9147878610658566 0.9999993338517782


In [14]:
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

           0       0.96      0.53      0.68      2598
           1       0.65      0.97      0.78      2322

    accuracy                           0.74      4920
   macro avg       0.80      0.75      0.73      4920
weighted avg       0.81      0.74      0.73      4920



### make a prediction

In [15]:
sample = df_test.iloc[0]

resume_text = sample['resume_text']
job_text = sample['job_description_text']

In [16]:
datapoint = {
    "resume_text": df_test.iloc[0]["resume_text"],
    "job_description_text": df_test.iloc[0]["job_description_text"]
}
datapoint

{'resume_text': 'result oriented organized bilingual accounting finance professional extensive diverse accounting auditing finance experience experience aspect financial reporting accrual managerial cost accounting reporting system operational analysis human resource function acceptance expanded responsibility exceptional performance knowledge accounting theory principle practice regulation including fasb gaap sox compliance cpa candidate analytical problem solving decision making performance productivity improvement team building leadership payroll accounting tax accounting financial analysis strategic planning project inventory management staff management skill work history current company name city state full service accounting firm service include assistance tax business affair compile analyze financial information prepare financial statement formulation corporate tax return private corporate client maintaining general ledger including posting adjusting closing journal entry analyz

In [17]:
datapoint = {'resume_text': 'result oriented organized bilingual accounting finance professional extensive diverse accounting auditing finance experience experience aspect financial reporting accrual managerial cost accounting reporting system operational analysis human resource function acceptance expanded responsibility exceptional performance knowledge accounting theory principle practice regulation including fasb gaap sox compliance cpa candidate analytical problem solving decision making performance productivity improvement team building leadership payroll accounting tax accounting financial analysis strategic planning project inventory management staff management skill work history current company name city state full service accounting firm service include assistance tax business affair compile analyze financial information prepare financial statement formulation corporate tax return private corporate client maintaining general ledger including posting adjusting closing journal entry analyze financial transaction ensure recorded appropriate general ledger account make necessary correction journal entry needed properly reflect financial position company continued mercado accountant current company name city state largest stone structural steel fabricator south suburb service include design fabricate architectural stone array client includes residential commercial building financial institution accountant oversee financial accounting function million construction company report directly owner providing financial data analytical report maximize profit cost saving alternative responsibility include assist preparation coordination closing ensuring financial statement accurate compliance generally accepted accounting gaap requirement assist owner production financials management report executive reporting package responsible fixed asset management includes entering new asset booking depreciation asset disposal inventory analysis audit finished good raw material perform financial analysis track variance profit initiative generate profitability report full year forecast report participate budgeting process perform cost plan track report management manage full cycle ap disbursement including bank account reconciliation journal entry accrual general ledger formulate payroll year end ensure compliance federal state local tax prepare tax return sale use tax year end corporate payroll tax return compliance irs requirement oversee employee benefit including health dental vision insurance 401k commercial insurance play key role annual audit preparation audit schedule documentation external insurance auditor participated various project improve process efficiency overall timeliness accuracy financial information participated evaluation development cost saving revenue generating opportunity recognized potential problem implemented innovative solution trading assistant company name city state joint venture created former nissho iwai american corporation metal one america supply chain management focused steel steel related product service include global material sourcing logistics coordination trade finance program management prepared report analyzed audited internal billing coordinating delivery accuracy great attention detail maintained high level customer satisfaction business communication international subsidiary mexico japan preparation documentation import export shipment executed account receivable reporting enhancement reconciliation procedure improved customer service satisfaction streamlined inventory system operation performing thorough inventory tracking designed implemented cost control improve profitability negotiated contract including delivery point term price export import duty education master business administration accounting lewis university city state gpa cum laude accounting cum laude bachelor art robert morris university city state gpa accounting magna cum laude accounting magna cum laude block tax course wiley cpa excel review affiliation acfe association certified forensic examiner skill accounting accountant account receivable accrual ad ap attention detail benefit billing budgeting business communication closing contract cpa client customer satisfaction customer service delivery documentation essbase finance financials financial financial accounting financial analysis financial statement fixed asset general ledger hyperion insurance inventory logistics material access excel powerpoint microsoft word enterprise oracle payroll profit program management quickbooks reporting sale sap fluent spanish supply chain management tax tax turbo tax view vision year end',
 'job_description_text': 'minimum education requirement bachelor degree accounting major least one two year work experience manufacturing company preferably wood processing company knowledge accounting information system finance accounting flowchart experience using accounting software understand able create financial report master cost accounting costing willing work officially placed outside city special criterion age twenty five thirty year domiciled j abode tab ek neat appearance well behaved polite gpa min two seventy five'}


In [18]:
def predict_from_datapoint(datapoint):
    combined = datapoint["resume_text"] + " [SEP] " + datapoint["job_description_text"]
    
    X_combined = tfidf.transform([combined])
    
    X_resume = shared_tfidf.transform([datapoint["resume_text"]])
    X_jd = shared_tfidf.transform([datapoint["job_description_text"]])
    
    cos_sim = rowwise_cosine(X_resume, X_jd)[0]
    
    X_final = hstack([X_combined, [[cos_sim]]])
    
    dmat = xgb.DMatrix(X_final, feature_names=features)
    
    proba = model.predict(dmat)[0]
    return proba, int(proba >= 0.5)

In [21]:
print(predict_from_datapoint(datapoint)[0])
if predict_from_datapoint(datapoint)[1] == 1:
    print("Good Fit")
else:
    print("Bad Fit")

0.99588364
Good Fit


In [None]:
import xgboost as xgb
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from scipy.sparse import hstack
import numpy as np

# --- 1. TF-IDF for combined text ---
tfidf = TfidfVectorizer(max_features=5000)
X_train_combined = tfidf.fit_transform(df_train['resume_text'] + " [SEP] " + df_train['job_description_text'])
X_val_combined   = tfidf.transform(df_val['resume_text'] + " [SEP] " + df_val['job_description_text'])
X_test_combined  = tfidf.transform(df_test['resume_text'] + " [SEP] " + df_test['job_description_text'])

# --- 2. TF-IDF for cosine similarity ---
shared_tfidf = TfidfVectorizer(max_features=5000)
shared_tfidf.fit(pd.concat([df_train['resume_text'], df_train['job_description_text']]))

def cosine_sim(X1_text, X2_text):
    X1 = shared_tfidf.transform(X1_text)
    X2 = shared_tfidf.transform(X2_text)
    return np.array(normalize(X1).multiply(normalize(X2)).sum(axis=1)).ravel()

cos_train = cosine_sim(df_train['resume_text'], df_train['job_description_text']).reshape(-1,1)
cos_val   = cosine_sim(df_val['resume_text'], df_val['job_description_text']).reshape(-1,1)
cos_test  = cosine_sim(df_test['resume_text'], df_test['job_description_text']).reshape(-1,1)

# --- 3. Combine features ---
X_train_final = hstack([X_train_combined, cos_train])
X_val_final   = hstack([X_val_combined, cos_val])
X_test_final  = hstack([X_test_combined, cos_test])

# --- 4. XGBoost ---
dtrain = xgb.DMatrix(X_train_final, label=y_train)
dval   = xgb.DMatrix(X_val_final, label=y_val)
dtest  = xgb.DMatrix(X_test_final, label=y_test)

params = {
        'eta': 0.15,
        'max_depth': 6,
        'min_child_weight': 2,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'nthread': 8,
        'seed': 1,
    }

model = xgb.train(params, dtrain, num_boost_round=300)
