# importing libaries

In [8]:
# Importing important packages
import os
import sys

# Data Wrangling and manipulation
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# from wordcloud import WordCloud
%matplotlib inline

from sklearn.model_selection import train_test_split

# Importing from the NLP tools
import spacy # for lemmatization
nlp = spacy.load('en_core_web_sm')

import re   # regex
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt') # for word_tokenize
nltk.download('stopwords') # for stopwords


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wish6\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wish6\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Loading the data
data = pd.read_csv('Resume.csv')

# drop unused column
data.drop(columns=['ID', 'Resume_html'], inplace=True)

# renaming columns to make easier access
data.rename(columns={'Resume_str':'resume', 'Category':'role'}, inplace=True)

# Checking out a sample format of the loaded data
data.head(3)

Unnamed: 0,resume,role
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,HR DIRECTOR Summary Over 2...,HR


In [10]:
data['resume'] = data['resume'].astype(str)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   resume  2484 non-null   object
 1   role    2484 non-null   object
dtypes: object(2)
memory usage: 38.9+ KB


# Data preprocessing

In [11]:
# removing this entry because resume is a empty string
data.drop(656, inplace=True)

In [12]:
# checking duplicated
data.drop_duplicates(inplace=True)
data.duplicated().sum()

0

In [13]:
data.shape

(2481, 2)

In [14]:
data.sample(5)

Unnamed: 0,resume,role
1457,FOOD SERVER Professional Summar...,CHEF
1221,CONSULTANT ACCOUNT Summary T...,CONSULTANT
760,SENIOR SPECIALTY SALES REPRESENTATIVE...,HEALTHCARE
11,HR MANAGER Summary Human...,HR
2083,BILLING SERVICES ASSOCIATE ...,PUBLIC-RELATIONS


function for cleaning text,
- change text to lower case
- remove email, URLs, html tags, non-english characters
- remove extra space
- lemmatizing or converting words into their base forms eg. apples -> apple, flying -> fly
- remove stopwords eg. I, you, am, etc.

In [15]:
def clean(text):
    text = text.lower()

    text = re.sub('^[\w\-\.]+@([\w-]+\.)+[\w-]{2,4}', ' ', text)  # remove email
    text = re.sub('[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#\?&//=]*)', ' ', text)  # remove URLs
    text = re.sub('<.+?>', ' ', text)  # html tags
    text = re.sub('[^a-z]', ' ', text) # remove non-characters
    text = re.sub('\s+', ' ', text)  # remove extra whitespace

    doc = nlp(text) # change text into DOC object
    text = ' '.join([token.lemma_ for token in doc]) # lemmatization
     
    text = text.lower() # we need to change 
     
    text = word_tokenize(text) # tokenize word in list
    
    stop_words = set(stopwords.words('english')) # get stopwords
    text = [word for word in text if word not in stop_words] # remove stop words
    
    return ' '.join(text)

In [16]:
clean("I am sly king of the fly.")

'sly king fly'

In [17]:
data['resume'] = data['resume'].apply(lambda x : clean(x))

In [82]:
data.sample(10)

Unnamed: 0,resume,role,role_id
2227,employee relation consultant summary human res...,BANKING,7
781,health system analyst summary healthcare busin...,HEALTHCARE,18
1716,engineering technician summary work engineer t...,ENGINEERING,15
2232,financial analyst intern skill financial plann...,BANKING,7
916,adult education teacher accomplishment cal pol...,AGRICULTURE,2
1022,sale associate golf sale associate executive s...,SALES,22
111,designer summary establish well rounded design...,DESIGNER,13
1742,software engineering manager summary multiface...,ENGINEERING,15
1526,finance manager summary ability communicate ef...,FINANCE,16
828,care coordinator professional summary position...,FITNESS,17


# Label Encoding

In [19]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# label encoding -> role
data['role_id'] = le.fit_transform(data['role'])

# printing labels
for i,v in enumerate(le.classes_):
    print(f'{i} : {v}')

0 : ACCOUNTANT
1 : ADVOCATE
2 : AGRICULTURE
3 : APPAREL
4 : ARTS
5 : AUTOMOBILE
6 : AVIATION
7 : BANKING
8 : BPO
9 : BUSINESS-DEVELOPMENT
10 : CHEF
11 : CONSTRUCTION
12 : CONSULTANT
13 : DESIGNER
14 : DIGITAL-MEDIA
15 : ENGINEERING
16 : FINANCE
17 : FITNESS
18 : HEALTHCARE
19 : HR
20 : INFORMATION-TECHNOLOGY
21 : PUBLIC-RELATIONS
22 : SALES
23 : TEACHER


In [85]:
data.head()

Unnamed: 0,resume,role,role_id
0,hr administrator marketing associate hr admini...,HR,19
1,hr specialist hr operation summary versatile m...,HR,19
2,hr director summary year experience recruiting...,HR,19
3,hr specialist summary dedicate driven dynamic ...,HR,19
4,hr manager skill highlight hr skill hr departm...,HR,19


In [83]:
data.to_csv('cleaned_Resume.csv', index=False)

# Exploratory Data Analysis : check word frequency

In [21]:
all_records = ''

# combining records into one string
for record in data['resume']:
    all_records += record
    
# creating wordcloud
# wordcloud = WordCloud(background_color ='black',min_font_size = 10, colormap='Blues').generate(all_records)
   
# # plotting wordcloud 
# plt.figure(figsize = (8, 8), facecolor = 'black')
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# plt.tight_layout(pad = 0)
 
# plt.show()

In [22]:
all_tokens = word_tokenize(all_records) # tokenize word in list
word_freq = nltk.FreqDist(all_tokens)
mostcommon = word_freq.most_common(50)
print(mostcommon)


[('state', 16256), ('company', 15714), ('city', 15100), ('management', 12110), ('name', 11776), ('customer', 11301), ('service', 9055), ('work', 8686), ('sale', 8369), ('skill', 8023), ('business', 7990), ('project', 7924), ('team', 7134), ('system', 7085), ('manage', 6612), ('client', 6489), ('new', 6420), ('experience', 6239), ('include', 6117), ('process', 6070), ('maintain', 5962), ('develop', 5947), ('development', 5797), ('provide', 5650), ('manager', 5526), ('training', 5239), ('report', 5194), ('support', 5063), ('program', 5023), ('design', 5005), ('account', 4983), ('product', 4847), ('information', 4439), ('use', 4430), ('office', 4413), ('plan', 4402), ('employee', 4386), ('financial', 4327), ('marketing', 4295), ('staff', 4186), ('professional', 4183), ('education', 4095), ('create', 3961), ('ensure', 3933), ('need', 3705), ('year', 3687), ('operation', 3683), ('communication', 3631), ('assist', 3630), ('university', 3507)]


# feature extraction : BoW, TF-IDF

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

texts = data/

tfidf = TfidfVectorizer(min_df = 2, max_df = 0.5, ngram_range = (1, 2))
features = tfidf.fit_transform(texts)
 
pd.Dataframe(features.todense(), columns = tfidf.get_feature_names())

SyntaxError: invalid syntax (2518192608.py, line 3)