In [2]:
# set up

import pandas as pd

# needed to convert strings to vectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer


# import classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier

# to create random train and test subsets
from sklearn.model_selection import train_test_split

# metrics are used to find accuracy or error
from sklearn import metrics

# for tokenizing and stemming 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


In [3]:
# Job titles with their functional-group and management-level codes, indexed by title
df_jobs = pd.read_excel(
    './job_titles.xlsx',
    sheet_name='jobs',
    index_col='job_title',
)

df_jobs.head()

Unnamed: 0_level_0,functional_group_code,management_level_code
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Director IT,2,0
Graphic Designer,3,1
Manager Bus Systems/Operations,1,2
Manager IT,2,2
Product Specialist,3,1


In [4]:
# Lookup table: functional_group_code (index) -> functional_group name
df_functions = pd.read_excel(
    './job_titles.xlsx',
    sheet_name='functional_groups',
    index_col='functional_group_code',
)

df_functions.head()

Unnamed: 0_level_0,functional_group
functional_group_code,Unnamed: 1_level_1
0,Architecture
1,Business Analytics
2,Engineering
3,Product
4,Project Management


In [5]:
# Lookup table: management_level_code (index) -> management_level name
df_levels = pd.read_excel(
    './job_titles.xlsx',
    sheet_name='management_levels',
    index_col='management_level_code',
)

df_levels.head()

Unnamed: 0_level_0,management_level
management_level_code,Unnamed: 1_level_1
0,Director
1,IC
2,Manager
3,VP (and above)


In [6]:
# Pull the model inputs out of the frame as plain lists:
# the titles come from the index, the two label sets from the code columns.
job_titles = list(df_jobs.index)
job_functions, job_levels = (
    list(df_jobs[column])
    for column in ('functional_group_code', 'management_level_code')
)


In [7]:
# Optional preprocessing, currently disabled: Porter-stem every token of each job
# title and collect the re-joined titles in stemmed_jobs.
# NOTE(review): none of the cells visible here read stemmed_jobs -- confirm whether
# stemming is still wanted before re-enabling or removing this cell.
# stemmed_jobs=[]

# for job in job_titles:
#     stemmed_job=""
#     words = word_tokenize(job)
#     for word in words:
#         word = PorterStemmer().stem(word)
#         stemmed_job = stemmed_job + " " + word
#     stemmed_jobs.append(stemmed_job.strip())
    

In [8]:
# Hold out part of the job titles for evaluation; the labels are the
# functional-group codes. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    job_titles,
    job_functions,
    random_state=42,
)


In [30]:
# Tokens passed to the vectorizer as stop words so they are dropped from the vocabulary
stop_list = [
    # seniority markers
    'sr', 'senior', 'staff',
    # roman-numeral grade suffixes
    'i', 'ii', 'iii', 'iv', 'v', 'vi',
    # other stop word
    'exempt',
]

In [28]:
# Bag-of-words encoder over unigrams and bigrams, ignoring the tokens in stop_list.
# The vocabulary is learned from the training titles only (fit returns the fitted
# vectorizer, so construction and fitting are chained).
vectorizer = CountVectorizer(stop_words=stop_list, ngram_range=(1,2)).fit(X_train)

# Debug aid -- show the learned vocabulary with its column indices:
# print("Vocabulary: ", vectorizer.vocabulary_)

# Encode both splits with the vocabulary learned above
X_train_vector = vectorizer.transform(X_train)
X_test_vector = vectorizer.transform(X_test)

# Debug aid -- dump the encoded test documents as a dense array:
# print("Encoded Document is:")
# print(X_test_vector.toarray())



In [29]:
# Fit a random forest (1000 trees, fixed seed) on the vectorized training titles
clf = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_train_vector, y_train)

# Predict a functional-group code for every held-out title
y_pred = clf.predict(X_test_vector)

# Report accuracy of the predictions against the held-out labels
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_test, y_pred))

ACCURACY OF THE MODEL:  0.9337931034482758


In [23]:
# One row per held-out title: the title, the predicted code and the true code
df_results = pd.DataFrame(
    {
        'job': X_test,
        'predicted': y_pred,
        'actual': y_test,
    }
)



In [73]:
# Attach readable functional-group names to the predicted and actual codes.
# Each code is looked up in df_functions['functional_group'] by its index; a code
# that is absent from the lookup yields NaN, as the earlier left merge did.
#
# assign + map is used instead of merge + in-place rename so the cell is
# idempotent: re-running it overwrites the two label columns rather than
# appending duplicate 'function_predicted' / 'function_actual' columns.
#
# NOTE(review): map requires the df_functions index to be unique -- it looks like
# one row per functional_group_code, but confirm against the spreadsheet.
df_results = df_results.assign(
    function_predicted=df_results['predicted'].map(df_functions['functional_group']),
    function_actual=df_results['actual'].map(df_functions['functional_group']),
)

df_results.head()

Unnamed: 0,job,predicted,actual,function_predicted,function_actual
0,"Business Development Specialist, Consultant",6,6,Other,Other
1,"Systems Engineer, Staff",2,2,Engineering,Engineering
2,"VP, Operations, Open Solutions",6,6,Other,Other
3,PM Implementation Project Management III (Non-...,4,4,Project Management,Project Management
4,"Project Manager, Team Lead",4,4,Project Management,Project Management
