In [1]:
import csv
import numpy as np  
from sklearn.model_selection import train_test_split
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
import re
import pandas as pd
import string
from sklearn import metrics
from sklearn.metrics import classification_report
import itertools 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SHNAVI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#load and pre-process dataset
df = pd.read_csv("train_data.csv")
#drop unnecessary columns
df.drop(['Unnamed: 0','Sentence_id'],axis=1,inplace=True)
#drop rows with nan values
df = df.dropna()
# rename columns
df.columns = ['Text', 'Type']

In [3]:
# Function to remove punctuation and convert to lowercase and return a list of individual words 
def preprocess(sentence):
    #remove punctuations
    sentence = re.sub('[^\\w\\s]',' ', sentence)
    sentence = re.sub('[0-9]',' ', sentence)
    #convert to lower case
    sentence = sentence.lower()
    #split sentence into a list of words 
    w=sentence.split()
    
    sw = stopwords.words('english')
    i=0
    while i<len(w):
        if w[i] in sw:     #removing the stop words, finding them and poping them out from the words splitted list
            w.pop(i)
            i=0
        i+=1
    
    return w   

In [4]:
# Perform preprocessing on each row of the text data 
ls_words = []
for index, row in df.iterrows():
    ls_words.append(preprocess(row['Text']))
df['preprocessed_Text'] = ls_words
# visualize dataframe after adding 'preprocessed_Text' column
df.head(10)

Unnamed: 0,Text,Type,preprocessed_Text
0,Author and/or Review architecture/design and o...,Responsibility,"[author, review, architecture, design, technic..."
1,Should be able to develop custom dynamic shape...,Responsibility,"[be, able, develop, custom, dynamic, shape, ob..."
2,Experience in working crosslly with a larger ...,Requirement,"[experience, working, crosslly, larger, engine..."
3,"Previous business experience, including but no...",Skill,"[previous, business, experience, including, li..."
4,Delivering fast and right the first time.,SoftSkill,"[delivering, fast, right, first, time]"
5,Track department expenses and capital budget .,Responsibility,"[track, department, expenses, capital, budget]"
6,Meet performance metrics.,Responsibility,"[meet, performance, metrics]"
7,YOU MUST HAVEBachelors degree in Engineering.,Requirement,"[must, havebachelors, degree, engineering]"
8,After COVID-19: Ability to travel to manufactu...,Requirement,"[covid, ability, travel, manufacturing, site, ..."
9,Six Sigma qualification to at least Green Bel...,Education,"[six, sigma, qualification, least, green, belt..."


In [5]:
# Split Data in to Train (70%), Validation (15%) Test(15%)

X_train, X_test, y_train, y_test = train_test_split(df,df['Type'] ,test_size=0.3, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(X_test,X_test['Type'] ,test_size=0.5, random_state=1)

 

In [6]:
# get unique words in the dataset
word_set=set()
for index, row in X_train.iterrows():           #unique collections of words in a set to avoid repetitions
    word_set.update(row['preprocessed_Text'])

In [7]:
#set of words in each type which would be used for creating dict of word frequencies
Education=set()                 
Experience=set()
Requirement=set()
Responsibility =set()
Skill = set()
SoftSkill = set()

for index, row in X_train.iterrows(): 
    if row['Type']=='Education':
        words=row['preprocessed_Text']
        Education.update(words)


    elif row['Type']=='Experience':
        words=row['preprocessed_Text']
        Experience.update(words)

    elif row['Type']=='Requirement':
        words=row['preprocessed_Text']
        Requirement.update(words)
    
    elif row['Type']=='Responsibility':
        words=row['preprocessed_Text']
        Responsibility.update(words)
    
    elif row['Type']=='Skill':
        words=row['preprocessed_Text']
        Skill.update(words)

    elif row['Type']=='SoftSkill':
        words=row['preprocessed_Text']
        SoftSkill.update(words)

In [8]:
# Create dict of words and their frequencies for each type
dict_Education={}
dict_Experience={}           
dict_Requirement={}
dict_Responsibility={}
dict_Skill={}
dict_SoftSkill={}

# Initialize the frequencies of each word to zero
for word in Education:
    dict_Education[word]=0
for word in Experience:
    dict_Experience[word]=0        
for word in Requirement:
    dict_Requirement[word]=0
for word in Responsibility:
    dict_Responsibility[word]=0
for word in Skill:
    dict_Skill[word]=0
for word in SoftSkill:
    dict_SoftSkill[word]=0

# create dict of words and frequencies for each type
for index, row in X_train.iterrows(): 
    if row['Type']=='Education':
        words=row['preprocessed_Text']          
        for wd in words:
              dict_Education[wd]+=1

    elif row['Type']=='Experience':
        words=row['preprocessed_Text']          
        for wd in words:
              dict_Experience[wd]+=1
                
    elif row['Type']=='Requirement':
        words=row['preprocessed_Text']          
        for wd in words:
              dict_Requirement[wd]+=1
    
    elif row['Type']=='Responsibility':
        words=row['preprocessed_Text']          
        for wd in words:
              dict_Responsibility[wd]+=1
    elif row['Type']=='Skill':
        words=row['preprocessed_Text']          
        for wd in words:
              dict_Skill[wd]+=1
                
    elif row['Type']=='SoftSkill':
        words=row['preprocessed_Text']          
        for wd in words:
              dict_SoftSkill[wd]+=1



In [9]:
Type_labels=set()
for index,row in X_train.iterrows():
    Type_labels.add(row['Type'])

In [10]:
def NBC(add_one_smoothing=False, a=0.001, X= X_test, Y = y_test):
    
#-------------------------------- Compute prior probabilities of each class------------------------
    Type_count={}   # Empty dictionary to store number of class occurances for every class
    for  types in Type_labels:
        count=0
        for index,row in X_train.iterrows():
            if types==row['Type']:
                count+=1
        Type_count[types]=count

    print('the prior distributions of classes are:')
    if add_one_smoothing:
        a=a
    else:
        a=0
    
    prior_Education= (Type_count['Education'])/len(X_train)
    prior_Experience= (Type_count['Experience'])/len(X_train)
    prior_Requirement= (Type_count['Requirement'])/len(X_train)
    prior_Responsibility= (Type_count['Responsibility'])/len(X_train)
    prior_Skill= (Type_count['Skill'])/len(X_train)
    prior_SoftSkill= (Type_count['SoftSkill'])/len(X_train)

    print("Education=",prior_Education)
    print("Experience=", prior_Experience )
    print("Requirement=", prior_Requirement )
    print("Responsibility=", prior_Responsibility )
    print("Skill=", prior_Skill )
    print("SoftSkill=", prior_SoftSkill )
    
#-------------------------------- Compute conditional probabilities of word given class------------------------
    
    #summing up the total number of words in each Type
    total_Education = 0
    total_Experience = 0
    total_Requirement = 0
    total_Responsibility = 0
    total_Skill = 0
    total_SoftSkill = 0
    for key,values in dict_Education.items():
        total_Education = total_Education+values
    for key,values in dict_Experience.items():
        total_Experience = total_Experience+values
    for key,values in dict_Requirement.items():
        total_Requirement = total_Requirement+values
    for key,values in dict_Responsibility.items():
        total_Responsibility = total_Responsibility+values
    for key,values in dict_Skill.items():
        total_Skill = total_Skill+values
    for key,values in dict_SoftSkill.items():
        total_SoftSkill = total_SoftSkill+values
   

    # Initiate class conditional probabilites of each word
    ccp_Education = {}
    ccp_Experience = {}
    ccp_Requirement = {}
    ccp_Responsibility = {}
    ccp_Skill ={}
    ccp_SoftSkill ={}

    for word in dict_Education:
        ccp_Education[word] = dict_Education[word]/total_Education
    for word in dict_Experience:
        ccp_Experience[word] = dict_Experience[word]/total_Experience
    for word in dict_Requirement:
        ccp_Requirement[word] = dict_Requirement[word]/total_Requirement
    for word in dict_Responsibility:
        ccp_Responsibility[word] = dict_Responsibility[word]/total_Responsibility
    for word in dict_Skill:
        ccp_Skill[word] = dict_Skill[word]/total_Skill
    for word in dict_SoftSkill:
        ccp_SoftSkill[word] = dict_SoftSkill[word]/total_SoftSkill

    #if the word is not in the sentence, we use Laplace smoothing
    k = len(word_set)
    for word in word_set:
        if word not in Education:
            ccp_Education[word]=a/(total_Education+a*k)
        if word not in Experience:                   
            ccp_Experience[word]=a/(total_Experience+a*k)
        if word not in Requirement:
            ccp_Requirement[word]=a/(total_Requirement+a*k)
        if word not in Responsibility:
            ccp_Responsibility[word]=a/(total_Responsibility+a*k)
        if word not in Skill:
            ccp_Skill[word]=a/(total_Skill+a*k)
        if word not in SoftSkill:
            ccp_SoftSkill[word]=a/(total_SoftSkill+a*k)

#-------------------------------------Compute posterior distribution of class for given sentence ---------------
    
    pred_test_label = []
    for index,row in X.iterrows():
        test_words=row['preprocessed_Text']
        # Add-one (Laplace) smoothing when words in test are not present in training data
        k = len(test_words)
        for word in test_words:
            if word not in word_set:
                ccp_Education[word]=a/(total_Education+a*k)
                ccp_Experience[word]=a/(total_Experience+a*k)
                ccp_Requirement[word]=a/(total_Requirement+a*k)
                ccp_Responsibility[word]=a/(total_Responsibility+a*k)
                ccp_Skill[word]=a/(total_Skill+a*k)
                ccp_SoftSkill[word]=a/(total_SoftSkill+a*k)
        # initialization for product of conditional probabilities
        posterior_Education=1
        posterior_Experience=1
        posterior_Requirement=1
        posterior_Responsibility=1
        posterior_Skill=1
        posterior_SoftSkill=1

        #product of probabilities of individual words given class assuming independence
        for word in test_words:
            posterior_Education*=ccp_Education[word]
            posterior_Experience*=ccp_Experience[word]
            posterior_Requirement*=ccp_Requirement[word]
            posterior_Responsibility*=ccp_Responsibility[word]
            posterior_Skill*=ccp_Skill[word]
            posterior_SoftSkill*=ccp_SoftSkill[word]
        # multiplicating likelihood probabilities with prior
        posterior_Education=posterior_Education*prior_Education/(1)
        posterior_Experience=posterior_Experience*prior_Experience/(1)
        posterior_Requirement=posterior_Requirement*prior_Requirement/(1)
        posterior_Responsibility=posterior_Responsibility*prior_Responsibility/(1)
        posterior_Skill=posterior_Skill*prior_Skill/(1)
        posterior_SoftSkill=posterior_SoftSkill*prior_SoftSkill/(1)

        label_dict = []
        label_dict.append(posterior_Education)
        label_dict.append(posterior_Experience)
        label_dict.append(posterior_Requirement)
        label_dict.append(posterior_Responsibility)
        label_dict.append(posterior_Skill)
        label_dict.append(posterior_SoftSkill)

        ind = label_dict.index(max(label_dict))
        ls_lables = ['Education','Experience','Requirement','Responsibility','Skill','SoftSkill']
        pred_test_label.append(ls_lables[ind])
        
#-----------------------------------------classification report----------------------------------------
    print(classification_report(Y, pred_test_label))

    

In [11]:
# Hyper parameter alpha 'a' tuning for a gird of values [1000,100,10,1,0.1,0.01,0.001,0.0001]
NBC(add_one_smoothing=False,X = X_val, Y = y_val)
NBC(add_one_smoothing=True, a=1000, X = X_val, Y = y_val)
NBC(add_one_smoothing=True, a=100, X = X_val, Y = y_val)
NBC(add_one_smoothing=True, a=10, X = X_val, Y = y_val)
NBC(add_one_smoothing=True, a=1, X = X_val, Y = y_val)
NBC(add_one_smoothing=True, a=0.1, X = X_val, Y = y_val)
NBC(add_one_smoothing=True, a=0.01, X = X_val, Y = y_val)
NBC(add_one_smoothing=True, a=0.001, X = X_val, Y = y_val)
NBC(add_one_smoothing=True, a=0.0001, X = X_val, Y = y_val)

the prior distributions of classes are:
Education= 0.07721362678869761
Experience= 0.15302292922689523
Requirement= 0.23512747875354106
Responsibility= 0.25817776809278226
Skill= 0.11590518389385245
SoftSkill= 0.16055301324423138
                precision    recall  f1-score   support

     Education       0.29      0.85      0.43       680
    Experience       0.68      0.75      0.71      1356
   Requirement       0.57      0.28      0.38      2058
Responsibility       0.78      0.67      0.72      2348
         Skill       0.45      0.37      0.41      1025
     SoftSkill       0.61      0.66      0.63      1384

      accuracy                           0.57      8851
     macro avg       0.56      0.60      0.55      8851
  weighted avg       0.61      0.57      0.57      8851

the prior distributions of classes are:
Education= 0.07721362678869761
Experience= 0.15302292922689523
Requirement= 0.23512747875354106
Responsibility= 0.25817776809278226
Skill= 0.11590518389385245
SoftSkil

In [16]:
# Highest weighted accuracy on validation data is observed for hypreparamaetr 'alpha'(a) is observed at a =0.1
# Using best 'a' value on val data to perform inference on test data
NBC(add_one_smoothing=True, a=0.1, X = X_test, Y = y_test)

the prior distributions of classes are:
Education= 0.07721362678869761
Experience= 0.15302292922689523
Requirement= 0.23512747875354106
Responsibility= 0.25817776809278226
Skill= 0.11590518389385245
SoftSkill= 0.16055301324423138
                precision    recall  f1-score   support

     Education       0.67      0.83      0.74       671
    Experience       0.73      0.85      0.78      1402
   Requirement       0.64      0.36      0.46      2109
Responsibility       0.82      0.85      0.83      2246
         Skill       0.48      0.55      0.51      1021
     SoftSkill       0.61      0.74      0.67      1401

      accuracy                           0.68      8850
     macro avg       0.66      0.70      0.67      8850
  weighted avg       0.68      0.68      0.67      8850



In [20]:
#Experiment2
# Identify top words that are contributing to class prediction
# create list of dictionaries for all classes
dicts = [dict_Education, dict_Experience, dict_Requirement, dict_Responsibility, dict_Skill, dict_SoftSkill]
# find common keys in all classes
common_keys = set(dict_Education.keys())
for d in dicts[1:]:
    common_keys.intersection_update(set(d.keys()))

common_dict = {}
common_keys = list(common_keys)
# find the highest frequency of common keys accross all classes
for word in common_keys:
    maxi = 0
    for d in dicts:
        if(d[word]>maxi):
            common_dict[word] = d[word]
# Create a dictionary of common keys and highest frequency of that key across classes
common_dict = {}
common_keys = list(common_keys)
for word in common_keys:
    ls_val = []
    ls_val.append(dicts[0][word])
    ls_val.append(dicts[1][word])
    ls_val.append(dicts[2][word])
    ls_val.append(dicts[3][word])
    ls_val.append(dicts[4][word])
    ls_val.append(dicts[5][word])
    common_dict[word] = max(ls_val)

# sort dictionary based on frequency of key word
top_words = dict(sorted(common_dict.items(), key=lambda item: item[1], reverse=True))
# print top 10 words
top_10_words = dict(itertools.islice(top_words.items(), 10)) 
print(top_10_words)

{'experience': 5967, 'years': 5589, 'skills': 2270, 'degree': 1753, 'ability': 1709, 'engineering': 1347, 'business': 1326, 'team': 1132, 'must': 1067, 'knowledge': 1057}
