In [None]:
import pandas as pd
import numpy as np

from datasets import load_dataset

dataset = load_dataset("cnamuangtoun/resume-job-description-fit")


df = pd.concat(
    [
        dataset["train"].to_pandas(),
        dataset["test"].to_pandas()
    ],
    ignore_index=True
)


import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    
    text = re.sub(r'\S+@\S+', ' email ', text)  # replace emails
    text = re.sub(r'\+?\d[\d\s\-\(\)]{7,}\d', ' phone ', text)  # replace phone numbers
    text = re.sub(r'www\.\S+|https?://\S+|\S+\.com', ' url ', text)  # Mask URLs (simple patterns)

    # remove noisy characters
    text = re.sub(r"[^\w\s\.\'\-\/\+\[\]]", ' ', text)

    #text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS]) # Remove stopwords

    # Normalize spaces again
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['resume_text'] = df['resume_text'].apply(preprocess_text)
df['job_description_text'] = df['job_description_text'].apply(preprocess_text)


df['combined_text'] = df['resume_text'] + " [SEP] " + df['job_description_text']

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

del df['label']


from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.15, random_state=1, stratify=df['label_encoded'])
df_train, df_val = train_test_split(df_full_train,test_size=0.176, random_state=1, stratify=df_full_train['label_encoded'])

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.label_encoded.values
y_val = df_val.label_encoded.values
y_test = df_test.label_encoded.values

del df_train['label_encoded']
del df_val['label_encoded']
del df_test['label_encoded']


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorizer
tfidf_resume = TfidfVectorizer(
    min_df=2,    #ignore very few words
    max_df=0.9,  #ignore very frequent words 
    max_features=5000,  
    ngram_range=(1,2),
    stop_words='english'
)

tfidf_jd = TfidfVectorizer(
    min_df=2,    #ignore very few words
    max_df=0.9,  #ignore very frequent words 
    max_features=5000,  
    ngram_range=(1,2),
    stop_words='english'
)


X_train_resume = tfidf_resume.fit_transform(df_train['resume_text'])
X_train_jd     = tfidf_jd.fit_transform(df_train['job_description_text'])

# Cosine similarity 
cos_sim_train = cosine_similarity(X_train_resume, X_train_jd).diagonal()

# Same for val and test (transform using train TF-IDF)
X_val_resume = tfidf_resume.transform(df_val['resume_text'])
X_val_jd     = tfidf_jd.transform(df_val['job_description_text'])

cos_sim_val   = cosine_similarity(X_val_resume, X_val_jd).diagonal()

X_test_resume = tfidf_resume.transform(df_test['resume_text'])
X_test_jd     = tfidf_jd.transform(df_test['job_description_text'])

cos_sim_test = cosine_similarity(X_test_resume, X_test_jd).diagonal()

# Add as a new column
df_train['cosine_sim'] = cos_sim_train
df_val['cosine_sim']   = cos_sim_val
df_test['cosine_sim']  = cos_sim_test



tfidf = TfidfVectorizer(
    min_df=2,    #ignore very few words
    max_df=0.9,  #ignore very frequent words 
    max_features=5000,  
    ngram_range=(1,2),
    stop_words='english'
)

X_train_combined = tfidf.fit_transform(df_train['combined_text'])
X_val_combined   = tfidf.transform(df_val['combined_text'])
X_test_combined  = tfidf.transform(df_test['combined_text'])

#df_train = df_train.drop(['resume_text','job_description_text','combined_text'], axis=1)
#df_val   = df_val.drop(['resume_text','job_description_text','combined_text'], axis=1)
#df_test  = df_test.drop(['resume_text','job_description_text','combined_text'], axis=1)

## Import libraries, dataset and read it

In [115]:
import pandas as pd
import numpy as np

In [116]:
from datasets import load_dataset

dataset = load_dataset("cnamuangtoun/resume-job-description-fit")

In [117]:
dataset

DatasetDict({
    train: Dataset({
        features: ['resume_text', 'job_description_text', 'label'],
        num_rows: 6241
    })
    test: Dataset({
        features: ['resume_text', 'job_description_text', 'label'],
        num_rows: 1759
    })
})

In [118]:
df = pd.concat(
    [
        dataset["train"].to_pandas(),
        dataset["test"].to_pandas()
    ],
    ignore_index=True
)
df

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit
...,...,...,...
7995,SummaryHighly motivated Sales Associate with e...,"Position Type: Full-Time, W2 Direct Hire. (Mus...",Good Fit
7996,SummaryWireless communications engineer with e...,"Location: Tampa, FL\nExp: 7-10 Yrs\nSPOC: Tush...",Good Fit
7997,Professional ProfileCapable International Tax ...,"Backed by a leading growth equity firm, an LA ...",Good Fit
7998,SummaryData Engineeringwith experience in Desi...,Allergan Data Labs is on a mission to transfor...,Good Fit


## EDA and preprocessing

In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   resume_text           8000 non-null   object
 1   job_description_text  8000 non-null   object
 2   label                 8000 non-null   object
dtypes: object(3)
memory usage: 187.6+ KB


In [120]:
df.head()

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit


In [121]:
df.describe()

Unnamed: 0,resume_text,job_description_text,label
count,8000,8000,8000
unique,643,351,3
top,SummaryFinancial Accountant specializing in fi...,Calling all innovators find your future at Fi...,No Fit
freq,82,111,4000


In [122]:
# Check label distribution
print(df['label'].value_counts())

label
No Fit           4000
Potential Fit    2000
Good Fit         2000
Name: count, dtype: int64


In [123]:
# Check text length

df_text = pd.DataFrame()

df_text['resume_len'] = df['resume_text'].apply(len)
df_text['job_len'] = df['job_description_text'].apply(len)

print(df_text[['resume_len', 'job_len']].describe())


         resume_len      job_len
count   8000.000000  8000.000000
mean    5773.369000  2777.030875
std     2958.109675  1777.249968
min      897.000000    72.000000
25%     4234.000000  1309.000000
50%     5123.000000  2401.000000
75%     6603.000000  3985.000000
max    25364.000000  8171.000000


## Text preprocessing

In [124]:
#Remove punctuation and special characters

import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    
    text = re.sub(r'\S+@\S+', ' email ', text)  # replace emails
    text = re.sub(r'\+?\d[\d\s\-\(\)]{7,}\d', ' phone ', text)  # replace phone numbers
    text = re.sub(r'www\.\S+|https?://\S+|\S+\.com', ' url ', text)  # Mask URLs (simple patterns)

    # remove noisy characters
    text = re.sub(r"[^\w\s\.\'\-\/\+\[\]]", ' ', text)

    #text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS]) # Remove stopwords

    # Normalize spaces again
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['resume_text'] = df['resume_text'].apply(preprocess_text)
df['job_description_text'] = df['job_description_text'].apply(preprocess_text)

In [125]:
# Check text length

df_text = pd.DataFrame()

df_text['resume_len'] = df['resume_text'].apply(len)
df_text['job_len'] = df['job_description_text'].apply(len)

print(df_text[['resume_len', 'job_len']].describe())


         resume_len      job_len
count   8000.000000  8000.000000
mean    5647.272750  2724.711125
std     2900.199669  1748.179645
min      889.000000    72.000000
25%     4110.000000  1277.000000
50%     5018.000000  2369.000000
75%     6425.250000  3916.000000
max    25199.000000  8074.000000


In [126]:
df.head()

Unnamed: 0,resume_text,job_description_text,label
0,summaryhighly motivated sales associate with e...,net2source inc. is an award-winning total work...,No Fit
1,professional summarycurrently working with cat...,at salas obrien we tell our clients that were ...,No Fit
2,summaryi started my construction career in jun...,schweitzer engineering laboratories sel infras...,No Fit
3,summarycertified electrical foremanwith thirte...,mizick miller company inc. is looking for a dy...,No Fit
4,summarywith extensive experience in business/r...,life at capgemini capgemini supports all aspec...,No Fit


In [127]:
# Combine text columns
df['combined_text'] = df['resume_text'] + " [SEP] " + df['job_description_text']

In [128]:
df.head()

Unnamed: 0,resume_text,job_description_text,label,combined_text
0,summaryhighly motivated sales associate with e...,net2source inc. is an award-winning total work...,No Fit,summaryhighly motivated sales associate with e...
1,professional summarycurrently working with cat...,at salas obrien we tell our clients that were ...,No Fit,professional summarycurrently working with cat...
2,summaryi started my construction career in jun...,schweitzer engineering laboratories sel infras...,No Fit,summaryi started my construction career in jun...
3,summarycertified electrical foremanwith thirte...,mizick miller company inc. is looking for a dy...,No Fit,summarycertified electrical foremanwith thirte...
4,summarywith extensive experience in business/r...,life at capgemini capgemini supports all aspec...,No Fit,summarywith extensive experience in business/r...


In [129]:
print(df['combined_text'][0])

summaryhighly motivated sales associate with extensive customer service and sales experience. outgoing sales professional with track record of driving increased sales improving buying experience and elevating company profile with target market. highlights-soft skills public speaking public relations team building project management procedure writing staff supervision and management ability to interface with professionals on all levels. accomplishments honors and activities -board of directors member for the food bank of corpus christi from november 2010 to april 2013. -held life insurance license -basketball official referee high school varsity level. experienceaccountant 08/2014-05/2015aspirus owen wi perform daily and routine accounting functions for two main companies and five small royalty companies. responsibilities include but are not limited to the following accounts payable accounts receivable manage and reconcile funds for multiple banks accounts payroll perform detail audits 

### Encode label columns

In [130]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

del df['label']

In [131]:
df.head()

Unnamed: 0,resume_text,job_description_text,combined_text,label_encoded
0,summaryhighly motivated sales associate with e...,net2source inc. is an award-winning total work...,summaryhighly motivated sales associate with e...,1
1,professional summarycurrently working with cat...,at salas obrien we tell our clients that were ...,professional summarycurrently working with cat...,1
2,summaryi started my construction career in jun...,schweitzer engineering laboratories sel infras...,summaryi started my construction career in jun...,1
3,summarycertified electrical foremanwith thirte...,mizick miller company inc. is looking for a dy...,summarycertified electrical foremanwith thirte...,1
4,summarywith extensive experience in business/r...,life at capgemini capgemini supports all aspec...,summarywith extensive experience in business/r...,1


In [132]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
label_mapping

{'Good Fit': np.int64(0), 'No Fit': np.int64(1), 'Potential Fit': np.int64(2)}

## Split the data

In [133]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.15, random_state=1, stratify=df['label_encoded'])
df_train, df_val = train_test_split(df_full_train,test_size=0.176, random_state=1, stratify=df_full_train['label_encoded'])

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.label_encoded.values
y_val = df_val.label_encoded.values
y_test = df_test.label_encoded.values

del df_train['label_encoded']
del df_val['label_encoded']
del df_test['label_encoded']

In [134]:
df_train

Unnamed: 0,resume_text,job_description_text,combined_text
0,summaryfull stack software engineer with 8+ ye...,pay range 81k - 102k depending on experience.t...,summaryfull stack software engineer with 8+ ye...
1,professional summarytalented leader and attorn...,who we are founded in 2017 gatik is the leader...,professional summarytalented leader and attorn...
2,professional summarywith the attitude of learn...,hi reply requested for below job opportunity.i...,professional summarywith the attitude of learn...
3,summarydetail oriented and team focused epidem...,a little about this gig agility partners is se...,summarydetail oriented and team focused epidem...
4,professional summaryqa test analyst/ developme...,primary location melbourne florida v-soft cons...,professional summaryqa test analyst/ developme...
...,...,...,...
5598,professional profileclaire 6.1 csa from pegasy...,description what were looking for at appfolio ...,professional profileclaire 6.1 csa from pegasy...
5599,professional summarywith the attitude of learn...,role details we are seeking a senior software ...,professional summarywith the attitude of learn...
5600,summarycapable fund accountant successful at m...,position cost accountant reports to president ...,summarycapable fund accountant successful at m...
5601,profilehighly motivated sales associate with e...,position business analyst ivlocation st. louis...,profilehighly motivated sales associate with e...


### Converting Resume Text and Job Description Text into Vectors and Computing Cosine Similarity as a New Feature

#### TF-IDF Vectorizer
- Converts text into numbers that reflect how important each word is in a document relative to the whole collection of documents.
- **Output:** A numeric vector for each document, where each element represents the TF-IDF score of a word from the vocabulary.

#### Cosine Similarity
- Computes the angle between the two TF-IDF vectors.
- `1` → texts are very similar
- `0` → texts are completely different

In [135]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorizer
tfidf_resume = TfidfVectorizer(
    min_df=2,    #ignore very few words
    max_df=0.9,  #ignore very frequent words 
    max_features=5000,  
    ngram_range=(1,2),
    stop_words='english'
)

tfidf_jd = TfidfVectorizer(
    min_df=2,    #ignore very few words
    max_df=0.9,  #ignore very frequent words 
    max_features=5000,  
    ngram_range=(1,2),
    stop_words='english'
)


X_train_resume = tfidf_resume.fit_transform(df_train['resume_text'])
X_train_jd     = tfidf_jd.fit_transform(df_train['job_description_text'])

# Cosine similarity 
cos_sim_train = cosine_similarity(X_train_resume, X_train_jd).diagonal()

# Same for val and test (transform using train TF-IDF)
X_val_resume = tfidf_resume.transform(df_val['resume_text'])
X_val_jd     = tfidf_jd.transform(df_val['job_description_text'])

cos_sim_val   = cosine_similarity(X_val_resume, X_val_jd).diagonal()

X_test_resume = tfidf_resume.transform(df_test['resume_text'])
X_test_jd     = tfidf_jd.transform(df_test['job_description_text'])

cos_sim_test = cosine_similarity(X_test_resume, X_test_jd).diagonal()

# Add as a new column
df_train['cosine_sim'] = cos_sim_train
df_val['cosine_sim']   = cos_sim_val
df_test['cosine_sim']  = cos_sim_test

In [136]:
tfidf = TfidfVectorizer(
    min_df=2,    #ignore very few words
    max_df=0.9,  #ignore very frequent words 
    max_features=5000,  
    ngram_range=(1,2),
    stop_words='english'
)

X_train_combined = tfidf.fit_transform(df_train['combined_text'])
X_val_combined   = tfidf.transform(df_val['combined_text'])
X_test_combined  = tfidf.transform(df_test['combined_text'])

#df_train = df_train.drop(['resume_text','job_description_text','combined_text'], axis=1)
#df_val   = df_val.drop(['resume_text','job_description_text','combined_text'], axis=1)
#df_test  = df_test.drop(['resume_text','job_description_text','combined_text'], axis=1)

In [137]:
df_train['cosine_sim']

0       0.021497
1       0.021520
2       0.010928
3       0.048911
4       0.056693
          ...   
5598    0.025049
5599    0.017403
5600    0.022930
5601    0.012576
5602    0.058545
Name: cosine_sim, Length: 5603, dtype: float64