In [1]:
import numpy as np
import pandas as pd
import nltk
# nltk.download('stopwords')
import re
import pdfplumber
import tensorflow as tf
from tqdm.notebook import tqdm
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from PIL import Image
from pytesseract import pytesseract
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso 
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('ResumeScore.csv')
df.head()

Unnamed: 0,Resumes,Score
0,CYBER_SECURITY_RESUME_0,65
1,CYBER_SECURITY_RESUME_1,75
2,CYBER_SECURITY_RESUME_10,75
3,CYBER_SECURITY_RESUME_11,67
4,CYBER_SECURITY_RESUME_12,58


### PDF TO Images

In [3]:
from pdf2image import convert_from_path
for j in tqdm(range(0,len(df))):
    images = convert_from_path(str("dataset" + "/" + df['Resumes'][j]+".pdf"))
    for i in range(len(images)):
        images[i].save(f"ResumeImages/{df.Resumes[j]}"+'.jpg', "JPEG")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))




### Extract Text from Images

In [4]:
pdf_text = list()
path_to_tesseract = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

for i in tqdm(range(0,len(df))):
    image_path = str("ResumeImages" + "/" + df['Resumes'][i]+".jpg")
    img = Image.open(image_path)
    pytesseract.tesseract_cmd = path_to_tesseract
    text = pytesseract.image_to_string(img)
    pdf_text.append(text[:-1])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=118.0), HTML(value='')))




In [5]:
#create a dataframe
df_new = pd.DataFrame(pdf_text,columns=['text'])
df = pd.concat([df,df_new],axis=1)
df.head()

Unnamed: 0,Resumes,Score,text
0,CYBER_SECURITY_RESUME_0,65,"KHADIJAH KOSS\n\n51513 FRITSCH WALKS, CHICAGO,..."
1,CYBER_SECURITY_RESUME_1,75,"= Enforce IT processes to ensure consistent, w..."
2,CYBER_SECURITY_RESUME_10,75,"FIRST LAST\n\nNew York, NY | P: +44 123456789 ..."
3,CYBER_SECURITY_RESUME_11,67,DON EVANS\nCyber Security Analyst\n\n\ 70954-6...
4,CYBER_SECURITY_RESUME_12,58,Robert Smith nese (123) 456 78 99\n\nCyber Sec...


### Data Preprocessing

In [6]:
ps = PorterStemmer()
corpus = []
for i in range(0,len(df['text'])):
    text = re.sub('[^a-zA-Z]',' ',df['text'][i])
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text if not word in stopwords.words('english')]
    text = [word for word in text if word.isalpha()]
    
    text = " ".join(text)
    corpus.append(text)

In [7]:
#create dataframe for the new corpus
preprocess_df = pd.DataFrame(corpus,columns=['preprocessed_text'])
preprocess_df

Unnamed: 0,preprocessed_text
0,khadijah koss fritsch walk chicago il phone l ...
1,enforc process ensur consist well integr appli...
2,first last new york ny p first last resumeword...
3,evan cyber secur analyst wreewexamplacom donev...
4,robert smith nese cyber secur engin hidniinyim...
...,...
113,place de miremont pari franc profil creativ we...
114,elza experi nader walker low windler houston t...
115,sylvest rath alysa ridg dalla tx phone experi ...
116,karli berkley front end web develop contact wo...


In [8]:
#concat the data with the new preprocess text
data = pd.concat([df,preprocess_df],axis=1)
data.head()

Unnamed: 0,Resumes,Score,text,preprocessed_text
0,CYBER_SECURITY_RESUME_0,65,"KHADIJAH KOSS\n\n51513 FRITSCH WALKS, CHICAGO,...",khadijah koss fritsch walk chicago il phone l ...
1,CYBER_SECURITY_RESUME_1,75,"= Enforce IT processes to ensure consistent, w...",enforc process ensur consist well integr appli...
2,CYBER_SECURITY_RESUME_10,75,"FIRST LAST\n\nNew York, NY | P: +44 123456789 ...",first last new york ny p first last resumeword...
3,CYBER_SECURITY_RESUME_11,67,DON EVANS\nCyber Security Analyst\n\n\ 70954-6...,evan cyber secur analyst wreewexamplacom donev...
4,CYBER_SECURITY_RESUME_12,58,Robert Smith nese (123) 456 78 99\n\nCyber Sec...,robert smith nese cyber secur engin hidniinyim...


#### Convert Text data into vectors

In [9]:
cv= CountVectorizer()
X = cv.fit_transform(data['preprocessed_text'].values)
X.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
y = data['Score'].values
y

array([65, 75, 75, 67, 58, 75, 57, 56, 50, 68, 55, 55, 55, 57, 65, 75, 85,
       55, 57, 77, 15, 60, 58, 70, 65, 78, 70, 57, 60, 65, 80, 82, 78, 75,
       60, 72, 78, 75, 80, 60, 82, 69, 78, 75, 79, 82, 90, 85, 85, 80, 83,
       50, 78, 63, 95, 75, 20, 60, 78, 65, 70, 65, 63, 85, 80, 72, 60, 62,
       78, 70, 70, 68, 58, 78, 77, 80, 80, 70, 69, 57, 65, 66, 60, 80, 75,
       78, 78, 65, 79, 88, 80, 68, 60, 78, 75, 80, 82, 77, 55, 55, 58, 60,
       55, 78, 50, 85, 88, 78, 60, 70, 56, 75, 78, 85, 60, 65, 80, 79],
      dtype=int64)

### Model Planning and Building

In [11]:
def select_model(model):
    model.fit(X,y)
    y_pred=model.predict(X)
    mse = mean_squared_error(y,y_pred)
    rmse = np.sqrt(mse)
    score = model.score(X,y)
    
    print("MSE : ", mse)
    print("RMSE:", rmse)
    print("SCORE:", score)

In [12]:
#Create a object for each algorithm
lr = LinearRegression()
ls = Lasso()
rd = Ridge()
xgb = XGBRegressor()
rf = RandomForestRegressor()
svr = SVR()

In [13]:
model_list = [lr,ls,rd,xgb,rf,svr]
for model in model_list:
    print("Model: ", model)
    select_model(model)
    print("\n")

Model:  LinearRegression()
MSE :  3.0268361581923435
RMSE: 1.7397804913816983
SCORE: 0.9801055544646167


Model:  Lasso()
MSE :  89.72174761968775
RMSE: 9.472156439781163
SCORE: 0.4102870693783174


Model:  Ridge()
MSE :  3.098844070663034
RMSE: 1.7603533936863456
SCORE: 0.9796322690213701


Model:  XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)
MSE :  3.0268390854408853
RMSE: 1.7397813326509988
SCORE: 0.9801055352247294

### Predicition On Different Resume

In [14]:
#save the linear regression model and vectorizer model

joblib.dump(lr,"Web App/model.joblib")
joblib.dump(cv, 'Web App/vectorizer.pkl')

['Web App/vectorizer.pkl']