In [8]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import re 
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [6]:
df = pd.read_csv('data/train.csv')

In [7]:
df.head()

Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit


In [9]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [10]:
def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"")
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)

In [13]:
processed_df = df.copy()

In [16]:
processed_df['resume_text'] = processed_df['resume_text'].map(lambda s: preprocess(s))
processed_df['job_description_text'] = processed_df['job_description_text'].map(lambda s: preprocess(s))

In [17]:
processed_df.head()

Unnamed: 0,resume_text,job_description_text,label
0,summaryhighly motivated sales associate extens...,netsource inc award winning total workforce so...,No Fit
1,professional summarycurrently working caterpil...,salas obrien tell clients engineered impact pa...,No Fit
2,summaryi started construction career june jack...,schweitzer engineering laboratories sel infras...,No Fit
3,summarycertified electrical foremanwith thirte...,mizick miller company inc looking dynamic indi...,No Fit
4,summarywith extensive experience business requ...,life capgemini capgemini supports aspects well...,No Fit


In [19]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
processed_df['label'] = le.fit_transform(processed_df['label'])

In [20]:
processed_df.head()

Unnamed: 0,resume_text,job_description_text,label
0,summaryhighly motivated sales associate extens...,netsource inc award winning total workforce so...,1
1,professional summarycurrently working caterpil...,salas obrien tell clients engineered impact pa...,1
2,summaryi started construction career june jack...,schweitzer engineering laboratories sel infras...,1
3,summarycertified electrical foremanwith thirte...,mizick miller company inc looking dynamic indi...,1
4,summarywith extensive experience business requ...,life capgemini capgemini supports aspects well...,1


In [21]:
processed_df['label'].unique()

array([1, 2, 0])

In [29]:
processed_df['combined_text'] = processed_df['resume_text'] + ' ' + processed_df['job_description_text']

## TF-IDF

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [44]:
X = processed_df['combined_text']
y = processed_df['label']

In [45]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),  
    ('clf', LogisticRegression(max_iter=500))
])

In [46]:
param_grid = {
    'tfidf__max_features': [1000, 3000, 5000, 10000],
    'clf__C': [0.1, 1, 10]
}

In [47]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X, y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ...............clf__C=0.1, tfidf__max_features=1000; total time=   2.8s
[CV] END ...............clf__C=0.1, tfidf__max_features=1000; total time=   2.8s
[CV] END ...............clf__C=0.1, tfidf__max_features=1000; total time=   3.0s
[CV] END ...............clf__C=0.1, tfidf__max_features=1000; total time=   2.9s
[CV] END ...............clf__C=0.1, tfidf__max_features=1000; total time=   2.9s
[CV] END ...............clf__C=0.1, tfidf__max_features=3000; total time=   3.0s
[CV] END ...............clf__C=0.1, tfidf__max_features=3000; total time=   2.9s
[CV] END ...............clf__C=0.1, tfidf__max_features=3000; total time=   3.0s
[CV] END ...............clf__C=0.1, tfidf__max_features=3000; total time=   2.9s
[CV] END ...............clf__C=0.1, tfidf__max_features=3000; total time=   2.8s
[CV] END ...............clf__C=0.1, tfidf__max_features=5000; total time=   3.1s
[CV] END ...............clf__C=0.1, tfidf__max_f

In [48]:
# Get the best parameters and model
print("Best parameters found: ", grid_search.best_params_)
best_model = grid_search.best_estimator_

Best parameters found:  {'clf__C': 10, 'tfidf__max_features': 10000}


In [49]:
test_df = pd.read_csv('data/test.csv')
processed_test_df = test_df.copy()

In [50]:
processed_test_df['resume_text'] = processed_test_df['resume_text'].map(lambda s: preprocess(s))
processed_test_df['job_description_text'] = processed_test_df['job_description_text'].map(lambda s: preprocess(s))
processed_test_df['combined_text'] = processed_test_df['resume_text'] + ' ' + processed_test_df['job_description_text']
processed_test_df['label'] = le.transform(processed_test_df['label'])

In [51]:
predictions = best_model.predict(processed_test_df['combined_text'])
accuracy = accuracy_score(processed_test_df['label'], predictions)
print("Accuracy: ", accuracy)

Accuracy:  0.4991472427515634
