In [68]:
import pandas as pd
import numpy as np

from datasets import load_dataset

dataset = load_dataset("cnamuangtoun/resume-job-description-fit")


df = pd.concat(
    [
        dataset["train"].to_pandas(),
        dataset["test"].to_pandas()
    ],
    ignore_index=True
)


import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    
    text = re.sub(r'\S+@\S+', ' email ', text)  # replace emails
    text = re.sub(r'\+?\d[\d\s\-\(\)]{7,}\d', ' phone ', text)  # replace phone numbers
    text = re.sub(r'www\.\S+|https?://\S+|\S+\.com', ' url ', text)  # Mask URLs (simple patterns)

    # remove noisy characters
    text = re.sub(r"[^\w\s\.\'\-\/\+\[\]]", ' ', text)

    #text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS]) # Remove stopwords

    # Normalize spaces again
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['resume_text'] = df['resume_text'].apply(preprocess_text)
df['job_description_text'] = df['job_description_text'].apply(preprocess_text)


df['combined_text'] = df['resume_text'] + " [SEP] " + df['job_description_text']

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

del df['label']


from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.15, random_state=1, stratify=df['label_encoded'])
df_train, df_val = train_test_split(df_full_train,test_size=0.176, random_state=1, stratify=df_full_train['label_encoded'])

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.label_encoded.values
y_val = df_val.label_encoded.values
y_test = df_test.label_encoded.values

del df_train['label_encoded']
del df_val['label_encoded']
del df_test['label_encoded']


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorizer
tfidf_resume = TfidfVectorizer(
    min_df=2,    #ignore very few words
    max_df=0.9,  #ignore very frequent words 
    max_features=5000,  
    ngram_range=(1,2),
    stop_words='english'
)

tfidf_jd = TfidfVectorizer(
    min_df=2,    #ignore very few words
    max_df=0.9,  #ignore very frequent words 
    max_features=5000,  
    ngram_range=(1,2),
    stop_words='english'
)


X_train_resume = tfidf_resume.fit_transform(df_train['resume_text'])
X_train_jd     = tfidf_jd.fit_transform(df_train['job_description_text'])

# Cosine similarity 
cos_sim_train = cosine_similarity(X_train_resume, X_train_jd).diagonal()

# Same for val and test (transform using train TF-IDF)
X_val_resume = tfidf_resume.transform(df_val['resume_text'])
X_val_jd     = tfidf_jd.transform(df_val['job_description_text'])

cos_sim_val   = cosine_similarity(X_val_resume, X_val_jd).diagonal()

X_test_resume = tfidf_resume.transform(df_test['resume_text'])
X_test_jd     = tfidf_jd.transform(df_test['job_description_text'])

cos_sim_test = cosine_similarity(X_test_resume, X_test_jd).diagonal()

# Add as a new column
df_train['cosine_sim'] = cos_sim_train
df_val['cosine_sim']   = cos_sim_val
df_test['cosine_sim']  = cos_sim_test



tfidf = TfidfVectorizer(
    min_df=2,    #ignore very few words
    max_df=0.9,  #ignore very frequent words 
    max_features=5000,  
    ngram_range=(1,2),
    stop_words='english'
)

X_train_combined = tfidf.fit_transform(df_train['combined_text'])
X_val_combined   = tfidf.transform(df_val['combined_text'])
X_test_combined  = tfidf.transform(df_test['combined_text'])

#df_train = df_train.drop(['resume_text','job_description_text','combined_text'], axis=1)
#df_val   = df_val.drop(['resume_text','job_description_text','combined_text'], axis=1)
#df_test  = df_test.drop(['resume_text','job_description_text','combined_text'], axis=1)

## Import libraries, dataset and read it

In [69]:
import pandas as pd
import numpy as np

In [70]:
from datasets import load_dataset

dataset = load_dataset("cnamuangtoun/resume-job-description-fit")

In [71]:
dataset

DatasetDict({
    train: Dataset({
        features: ['resume_text', 'job_description_text', 'label'],
        num_rows: 6241
    })
    test: Dataset({
        features: ['resume_text', 'job_description_text', 'label'],
        num_rows: 1759
    })
})

In [72]:
df = pd.concat(
    [
        dataset["train"].to_pandas(),
        dataset["test"].to_pandas()
    ],
    ignore_index=True
)
df

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit
...,...,...,...
7995,SummaryHighly motivated Sales Associate with e...,"Position Type: Full-Time, W2 Direct Hire. (Mus...",Good Fit
7996,SummaryWireless communications engineer with e...,"Location: Tampa, FL\nExp: 7-10 Yrs\nSPOC: Tush...",Good Fit
7997,Professional ProfileCapable International Tax ...,"Backed by a leading growth equity firm, an LA ...",Good Fit
7998,SummaryData Engineeringwith experience in Desi...,Allergan Data Labs is on a mission to transfor...,Good Fit


## EDA and preprocessing

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   resume_text           8000 non-null   object
 1   job_description_text  8000 non-null   object
 2   label                 8000 non-null   object
dtypes: object(3)
memory usage: 187.6+ KB


In [74]:
df.head()

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit


In [75]:
df.describe()

Unnamed: 0,resume_text,job_description_text,label
count,8000,8000,8000
unique,643,351,3
top,SummaryFinancial Accountant specializing in fi...,Calling all innovators find your future at Fi...,No Fit
freq,82,111,4000


In [76]:
# Check label distribution
print(df['label'].value_counts())

label
No Fit           4000
Potential Fit    2000
Good Fit         2000
Name: count, dtype: int64


In [77]:
# Check text length

df_text = pd.DataFrame()

df_text['resume_len'] = df['resume_text'].apply(len)
df_text['job_len'] = df['job_description_text'].apply(len)

print(df_text[['resume_len', 'job_len']].describe())


         resume_len      job_len
count   8000.000000  8000.000000
mean    5773.369000  2777.030875
std     2958.109675  1777.249968
min      897.000000    72.000000
25%     4234.000000  1309.000000
50%     5123.000000  2401.000000
75%     6603.000000  3985.000000
max    25364.000000  8171.000000


## Text preprocessing

In [78]:
#Remove punctuation and special characters

import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    
    text = re.sub(r'\S+@\S+', ' email ', text)  # replace emails
    text = re.sub(r'\+?\d[\d\s\-\(\)]{7,}\d', ' phone ', text)  # replace phone numbers
    text = re.sub(r'www\.\S+|https?://\S+|\S+\.com', ' url ', text)  # Mask URLs (simple patterns)

    # remove noisy characters
    text = re.sub(r"[^\w\s\.\'\-\/\+\[\]]", ' ', text)

    #text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS]) # Remove stopwords

    # Normalize spaces again
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['resume_text'] = df['resume_text'].apply(preprocess_text)
df['job_description_text'] = df['job_description_text'].apply(preprocess_text)

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [79]:
# Check text length

df_text = pd.DataFrame()

df_text['resume_len'] = df['resume_text'].apply(len)
df_text['job_len'] = df['job_description_text'].apply(len)

print(df_text[['resume_len', 'job_len']].describe())


         resume_len      job_len
count   8000.000000  8000.000000
mean    4857.516125  2333.126250
std     2471.028405  1502.395502
min      766.000000    64.000000
25%     3564.000000  1093.000000
50%     4338.000000  1990.000000
75%     5523.000000  3385.000000
max    21377.000000  7081.000000


In [80]:
df.head()

Unnamed: 0,resume_text,job_description_text,label
0,summaryhighli motiv sale associ with extens cu...,net2sourc inc. is an award-win total workforc ...,No Fit
1,profession summarycurr work with caterpillar a...,at sala obrien we tell our client that were en...,No Fit
2,summaryi start my construct career in june of ...,schweitzer engin laboratori sel infrastructur ...,No Fit
3,summarycertifi electr foremanwith thirteen yea...,mizick miller compani inc. is look for a dynam...,No Fit
4,summarywith extens experi in business/requir a...,life at capgemini capgemini support all aspect...,No Fit


In [81]:
# Combine text columns
df['combined_text'] = df['resume_text'] + " [SEP] " + df['job_description_text']

In [82]:
df.head()

Unnamed: 0,resume_text,job_description_text,label,combined_text
0,summaryhighli motiv sale associ with extens cu...,net2sourc inc. is an award-win total workforc ...,No Fit,summaryhighli motiv sale associ with extens cu...
1,profession summarycurr work with caterpillar a...,at sala obrien we tell our client that were en...,No Fit,profession summarycurr work with caterpillar a...
2,summaryi start my construct career in june of ...,schweitzer engin laboratori sel infrastructur ...,No Fit,summaryi start my construct career in june of ...
3,summarycertifi electr foremanwith thirteen yea...,mizick miller compani inc. is look for a dynam...,No Fit,summarycertifi electr foremanwith thirteen yea...
4,summarywith extens experi in business/requir a...,life at capgemini capgemini support all aspect...,No Fit,summarywith extens experi in business/requir a...


In [83]:
print(df['combined_text'][0])

summaryhighli motiv sale associ with extens custom servic and sale experience. outgo sale profession with track record of drive increas sale improv buy experi and elev compani profil with target market. highlights-soft skill public speak public relat team build project manag procedur write staff supervis and manag abil to interfac with profession on all levels. accomplish honor and activ -board of director member for the food bank of corpu christi from novemb 2010 to april 2013. -held life insur licens -basketbal offici refere high school varsiti level. experienceaccount 08/2014-05/2015aspiru owen wi perform daili and routin account function for two main compani and five small royalti companies. respons includ but are not limit to the follow account payabl account receiv manag and reconcil fund for multipl bank account payrol perform detail audit and adjust of balanc sheet and incom statement account audit and pay monthli sale tax inventori reconcili and budgeting.district administr ma

### Encode label columns

In [84]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

del df['label']

In [85]:
df.head()

Unnamed: 0,resume_text,job_description_text,combined_text,label_encoded
0,summaryhighli motiv sale associ with extens cu...,net2sourc inc. is an award-win total workforc ...,summaryhighli motiv sale associ with extens cu...,1
1,profession summarycurr work with caterpillar a...,at sala obrien we tell our client that were en...,profession summarycurr work with caterpillar a...,1
2,summaryi start my construct career in june of ...,schweitzer engin laboratori sel infrastructur ...,summaryi start my construct career in june of ...,1
3,summarycertifi electr foremanwith thirteen yea...,mizick miller compani inc. is look for a dynam...,summarycertifi electr foremanwith thirteen yea...,1
4,summarywith extens experi in business/requir a...,life at capgemini capgemini support all aspect...,summarywith extens experi in business/requir a...,1


In [86]:
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
label_mapping

{'Good Fit': np.int64(0), 'No Fit': np.int64(1), 'Potential Fit': np.int64(2)}

## Split the data

In [87]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.15, random_state=1, stratify=df['label_encoded'])
df_train, df_val = train_test_split(df_full_train,test_size=0.176, random_state=1, stratify=df_full_train['label_encoded'])

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.label_encoded.values
y_val = df_val.label_encoded.values
y_test = df_test.label_encoded.values

del df_train['label_encoded']
del df_val['label_encoded']
del df_test['label_encoded']

In [88]:
df_train

Unnamed: 0,resume_text,job_description_text,combined_text
0,summaryful stack softwar engin with 8+ year of...,pay rang 81k - 102k depend on experience.thi r...,summaryful stack softwar engin with 8+ year of...
1,profession summarytal leader and attorney by t...,who we are found in 2017 gatik is the leader i...,profession summarytal leader and attorney by t...
2,profession summarywith the attitud of learn i ...,hi repli request for below job opportunity.i k...,profession summarywith the attitud of learn i ...
3,summarydetail orient and team focus epidemiolo...,a littl about thi gig agil partner is seek a q...,summarydetail orient and team focus epidemiolo...
4,profession summaryqa test analyst/ develop tes...,primari locat melbourn florida v-soft consult ...,profession summaryqa test analyst/ develop tes...
...,...,...,...
5598,profession profileclair 6.1 csa from pegasyste...,descript what were look for at appfolio we pad...,profession profileclair 6.1 csa from pegasyste...
5599,profession summarywith the attitud of learn i ...,role detail we are seek a senior softwar engin...,profession summarywith the attitud of learn i ...
5600,summarycap fund account success at manag multi...,posit cost account report to presid the compan...,summarycap fund account success at manag multi...
5601,profilehighli motiv sale associ with extens cu...,posit busi analyst ivloc st. loui mo onsit fro...,profilehighli motiv sale associ with extens cu...


### Converting Resume Text and Job Description Text into Vectors and Computing Cosine Similarity as a New Feature

#### TF-IDF Vectorizer
- Converts text into numbers that reflect how important each word is in a document relative to the whole collection of documents.
- **Output:** A numeric vector for each document, where each element represents the TF-IDF score of a word from the vocabulary.

#### Cosine Similarity
- Computes the angle between the two TF-IDF vectors.
- `1` → texts are very similar
- `0` → texts are completely different

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorizer
# Create SHARED vocabulary for cosine similarity
shared_tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000
)

# Fit on BOTH resumes and JDs
all_train_text = pd.concat([
    df_train['resume_text'], 
    df_train['job_description_text']
])
shared_tfidf.fit(all_train_text)


X_train_resume = shared_tfidf.transform(df_train['resume_text'])
X_train_jd     = shared_tfidf.transform(df_train['job_description_text'])

# Cosine similarity 
cos_sim_train = cosine_similarity(X_train_resume, X_train_jd).diagonal()

# Same for val and test (transform using train TF-IDF)
X_val_resume = shared_tfidf.transform(df_val['resume_text'])
X_val_jd     = shared_tfidf.transform(df_val['job_description_text'])

cos_sim_val   = cosine_similarity(X_val_resume, X_val_jd).diagonal()

X_test_resume = shared_tfidf.transform(df_test['resume_text'])
X_test_jd     = shared_tfidf.transform(df_test['job_description_text'])

cos_sim_test = cosine_similarity(X_test_resume, X_test_jd).diagonal()

In [90]:
tfidf = TfidfVectorizer(
    stop_words='english',    # ← This helps
    max_features=5000        # ← This prevents memory issues
)

X_train_combined = tfidf.fit_transform(df_train['combined_text'])
X_val_combined   = tfidf.transform(df_val['combined_text'])
X_test_combined  = tfidf.transform(df_test['combined_text'])

In [91]:
from scipy.sparse import hstack

X_train_final = hstack([X_train_combined, cos_sim_train.reshape(-1, 1)])
X_val_final = hstack([X_val_combined, cos_sim_val.reshape(-1, 1)])
X_test_final = hstack([X_test_combined, cos_sim_test.reshape(-1, 1)])

### check features ranges

In [92]:
# Check TF-IDF range
print("TF-IDF Combined features range:")
print(f"  Min: {X_train_combined.min():.6f}")
print(f"  Max: {X_train_combined.max():.6f}")
print(f"  Mean: {X_train_combined.mean():.6f}")

# Check cosine similarity range  
print("\nCosine similarity range:")
print(f"  Min: {cos_sim_train.min():.3f}")
print(f"  Max: {cos_sim_train.max():.3f}")
print(f"  Mean: {cos_sim_train.mean():.3f}")

TF-IDF Combined features range:
  Min: 0.000000
  Max: 0.824428
  Mean: 0.002912

Cosine similarity range:
  Min: 0.000
  Max: 0.552
  Mean: 0.110


In [93]:
feature_names = list(tfidf.get_feature_names_out()) + ['cosine_similarity']
feature_names

['00',
 '000',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '0expect',
 '10',
 '100',
 '1000',
 '109',
 '1099',
 '10g',
 '10th',
 '10x',
 '11',
 '116',
 '11g',
 '12',
 '13',
 '130',
 '139',
 '14',
 '145',
 '15',
 '150',
 '16',
 '17',
 '18',
 '180',
 '19',
 '190',
 '1994',
 '1997',
 '1998',
 '1999',
 '1softwar',
 '1st',
 '1to01',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2002to01',
 '2003',
 '2004',
 '2004to01',
 '2005',
 '2006',
 '2007',
 '2007to01',
 '2007to09',
 '2008',
 '2008m',
 '2008r2',
 '2008to04',
 '2008to05',
 '2008to12',
 '2008totobachelor',
 '2009',
 '2009to01',
 '2009to07',
 '2010',
 '2011',
 '2011to04',
 '2011to05',
 '2011to11',
 '2012',
 '2012to01',
 '2012to03',
 '2012to06',
 '2012to07',
 '2012to12',
 '2013',
 '2013senior',
 '2013softwar',
 '2013to01',
 '2013to04',
 '2013to07',
 '2013to08',
 '2013to10',
 '2013to11',
 '2013totobachelor',
 '2014',
 '2014softwar',
 '2014to01',
 '2014to02',
 '2014to05',
 '2014to06',
 '2014to07',
 '2014to08',
 '2014to11',
