## Import libraries, dataset and read it

In [2]:
import pandas as pd
import numpy as np

In [3]:
from datasets import load_dataset

dataset = load_dataset("cnamuangtoun/resume-job-description-fit")

train.csv:   0%|          | 0.00/53.4M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/15.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1759 [00:00<?, ? examples/s]

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['resume_text', 'job_description_text', 'label'],
        num_rows: 6241
    })
    test: Dataset({
        features: ['resume_text', 'job_description_text', 'label'],
        num_rows: 1759
    })
})

In [184]:
df = pd.concat(
    [
        dataset["train"].to_pandas(),
        dataset["test"].to_pandas()
    ],
    ignore_index=True
)
df

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit
...,...,...,...
7995,SummaryHighly motivated Sales Associate with e...,"Position Type: Full-Time, W2 Direct Hire. (Mus...",Good Fit
7996,SummaryWireless communications engineer with e...,"Location: Tampa, FL\nExp: 7-10 Yrs\nSPOC: Tush...",Good Fit
7997,Professional ProfileCapable International Tax ...,"Backed by a leading growth equity firm, an LA ...",Good Fit
7998,SummaryData Engineeringwith experience in Desi...,Allergan Data Labs is on a mission to transfor...,Good Fit


## EDA and preprocessing

In [185]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   resume_text           8000 non-null   object
 1   job_description_text  8000 non-null   object
 2   label                 8000 non-null   object
dtypes: object(3)
memory usage: 187.6+ KB


In [186]:
df.head()

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit


In [187]:
df.describe()

Unnamed: 0,resume_text,job_description_text,label
count,8000,8000,8000
unique,643,351,3
top,SummaryFinancial Accountant specializing in fi...,Calling all innovators find your future at Fi...,No Fit
freq,82,111,4000


In [188]:
# Check label distribution
print(df['label'].value_counts())

label
No Fit           4000
Potential Fit    2000
Good Fit         2000
Name: count, dtype: int64


In [189]:
# Check text length

df_text = pd.DataFrame()

df_text['resume_len'] = df['resume_text'].apply(len)
df_text['job_len'] = df['job_description_text'].apply(len)

print(df_text[['resume_len', 'job_len']].describe())


         resume_len      job_len
count   8000.000000  8000.000000
mean    5773.369000  2777.030875
std     2958.109675  1777.249968
min      897.000000    72.000000
25%     4234.000000  1309.000000
50%     5123.000000  2401.000000
75%     6603.000000  3985.000000
max    25364.000000  8171.000000


## Text preprocessing

In [190]:
#Lowercasing

df['resume_text'] = df['resume_text'].str.lower()
df['job_description_text'] = df['job_description_text'].str.lower()

In [191]:
#Remove punctuation and special characters

import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # Normalize spaces
    text = re.sub(r'\S+@\S+', ' email ', text)  # replace emails
    text = re.sub(r'\+?\d[\d\s\-\(\)]{7,}\d', ' phone ', text)  # replace phone numbers
    text = re.sub(r'www\.\S+|https?://\S+|\S+\.com', ' url ', text)  # Mask URLs (simple patterns)
    text = re.sub(r'\d+\+', r'\g<0>', text)  # optional, keep '6+ years' intact

    # Replace punctuation with space to prevent merging words
    text = re.sub(r'[^\w\s\[\]]', ' ', text)

    text = ' '.join([word for word in text.split() if word not in ENGLISH_STOP_WORDS]) # Remove stopwords

    # Normalize spaces again after punctuation removal
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df['resume_text'] = df['resume_text'].apply(preprocess_text)
df['job_description_text'] = df['job_description_text'].apply(preprocess_text)

In [192]:
# Check text length

df_text = pd.DataFrame()

df_text['resume_len'] = df['resume_text'].apply(len)
df_text['job_len'] = df['job_description_text'].apply(len)

print(df_text[['resume_len', 'job_len']].describe())


         resume_len      job_len
count   8000.000000  8000.000000
mean    4899.676250  2187.921750
std     2491.289549  1364.326364
min      798.000000    53.000000
25%     3511.000000   985.000000
50%     4386.000000  1966.000000
75%     5623.500000  3216.000000
max    22520.000000  6222.000000


In [193]:
df.head()

Unnamed: 0,resume_text,job_description_text,label
0,summaryhighly motivated sales associate extens...,net2source award winning total workforce solut...,No Fit
1,professional summarycurrently working caterpil...,salas obrien tell clients engineered impact pa...,No Fit
2,summaryi started construction career june 2017...,schweitzer engineering laboratories sel infras...,No Fit
3,summarycertified electrical foremanwith thirte...,mizick miller company looking dynamic individu...,No Fit
4,summarywith extensive experience business requ...,life capgemini capgemini supports aspects chan...,No Fit


In [194]:
# Combine text columns
df['combined_text'] = df['resume_text'] + " [SEP] " + df['job_description_text']

In [195]:
df.head()

Unnamed: 0,resume_text,job_description_text,label,combined_text
0,summaryhighly motivated sales associate extens...,net2source award winning total workforce solut...,No Fit,summaryhighly motivated sales associate extens...
1,professional summarycurrently working caterpil...,salas obrien tell clients engineered impact pa...,No Fit,professional summarycurrently working caterpil...
2,summaryi started construction career june 2017...,schweitzer engineering laboratories sel infras...,No Fit,summaryi started construction career june 2017...
3,summarycertified electrical foremanwith thirte...,mizick miller company looking dynamic individu...,No Fit,summarycertified electrical foremanwith thirte...
4,summarywith extensive experience business requ...,life capgemini capgemini supports aspects chan...,No Fit,summarywith extensive experience business requ...


In [196]:
print(df['combined_text'][0])

summaryhighly motivated sales associate extensive customer service sales experience outgoing sales professional track record driving increased sales improving buying experience elevating company profile target market highlights soft skills public speaking public relations team building project management procedure writing staff supervision management ability interface professionals levels accomplishments honors activities board directors member food bank corpus christi november 2010 april 2013 held life insurance license basketball official referee high school varsity level experienceaccountant 08 2014 05 2015aspirus owen wi perform daily routine accounting functions main companies small royalty companies responsibilities include limited following accounts payable accounts receivable manage reconcile funds multiple banks accounts payroll perform audits adjustments balance sheet income statement accounts audit pay monthly sales tax inventory reconciliations budgeting district administra