In [1]:
import pandas as pd

In [3]:
df = pd.read_csv("Resume Screening.csv")

In [4]:
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [18]:
df.columns

Index(['Category', 'Resume'], dtype='object')

In [17]:
df.shape

(962, 2)

In [9]:
df['Category'].value_counts()

Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: count, dtype: int64

Preprocessing

In [10]:
import re

In [11]:
def clean_text(text):
    #lowercase
    text = text.lower()
    #remove url and change it with white space
    text = re.sub(r"http\S+", " ", text)
    #remove special characters
    text = re.sub(r"\W", " ", text)
    #remove numbers
    text = re.sub(r"\d+", " ", text)
    #remove extra space
    text = re. sub(r"\s+", " ", text)
    return text

In [23]:
#remove stop words and vectorize(TF-IDF)
df['clean_resume'] = df['Resume'].apply(clean_text)

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    #removes words like- is, the, etc
    stop_words = 'english',
    #keeps top 3000 most important words as too many words slows the model down.
    max_features = 3000
)

In [24]:
#x = cleaned resume data
x = tfidf.fit_transform(df['clean_resume'])
#y = category
y = df['Category']

In [25]:
df[['Resume', 'clean_resume']].head()

Unnamed: 0,Resume,clean_resume
0,Skills * Programming Languages: Python (pandas...,skills programming languages python pandas num...
1,Education Details \r\nMay 2013 to May 2017 B.E...,education details may to may b e uit rgpv data...
2,"Areas of Interest Deep Learning, Control Syste...",areas of interest deep learning control system...
3,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skills â r â python â sap hana â tableau â sap...
4,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...


In [26]:
print(df['clean_resume'].iloc[0])

skills programming languages python pandas numpy scipy scikit learn matplotlib sql java javascript jquery machine learning regression svm naã ve bayes knn random forest decision trees boosting techniques cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural nets database visualizations mysql sqlserver cassandra hbase elasticsearch d js dc js plotly kibana matplotlib ggplot tableau others regular expression html css angular logstash kafka python flask git docker computer vision open cv and understanding of deep learning education details data science assurance associate data science assurance associate ernst young llp skill details javascript exprience months jquery exprience months python exprience monthscompany details company ernst young llp description fraud investigations and dispute services assurance technology assisted review tar technology assisted review assists in accelerating the review proc

train test split

In [27]:
#input = x_train and x_test, output = y_train and y_test.
# x_train means- resumes for learning, y_train means- correct answers for learning.
# here test_size=0.2 means 20% resumes arre for testing and 80% are for training.
# we are using random state as we want the randomness of the model to be fixed, if we use a particular value for randomness, the accuracy does not change on every run and the split stays the same for training and test data, it makes it easier to debug and compare models.
#as for SEED we can pick any value.
from sklearn.model_selection import train_test_split
SEED = 20
x_train, x_test, y_train, y_test = train_test_split(  
    x, y, test_size= 0.2, random_state = SEED
)

train the model

In [31]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


evaluate the model

In [33]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9792746113989638
                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00        10
       Automation Testing       1.00      0.57      0.73         7
               Blockchain       1.00      1.00      1.00         6
         Business Analyst       1.00      1.00      1.00         6
           Civil Engineer       1.00      1.00      1.00         6
             Data Science       1.00      1.00      1.00        11
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.92      0.96        12
         DotNet Developer       1.00      1.00      1.00         6
            ETL Developer       1.00      1.00      1.00        10
   Electrical Engineering       0.75      1.00      0.86         3
                       HR       1.00      1.00      1.00         7
                   Hadoop       

test with new resume(demo model)

In [34]:
sample_resume = """
Skills: Python, SQL, Machine Learning, Pandas, NumPy
Experience: Data analysis, model training, visualization
"""

cleaned = clean_text(sample_resume)
vector = tfidf.transform([cleaned])
prediction = model.predict(vector)

print("Predicted Category:", prediction[0])


Predicted Category: Data Science
