In [1]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib as plt
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler

#from sklearn import _______
%matplotlib inline

In [2]:
jobs = pd.read_csv('out3.csv', delimiter='\t')
jobs = jobs.drop(jobs.columns[[0]], axis=1)

jobs['high_low'] = jobs.salary_mid.apply(lambda x: x>6500)
#jobs['intercept'] = 1
#
jobs = jobs.drop(columns = ['min_experience'], axis=1)

jobs = jobs.drop(columns = ['link','company', 'address', 'district'], axis=1)
#Similar to target variable
jobs = jobs.drop(columns = ['salary_low','salary_mid', 'salary_high', 'salary_time', ], axis=1)
#Created dummies for these
jobs = jobs.drop(columns = ['job_title', 'employment_type', 'job_category', 'seniority', 'skills'], axis=1)
jobs.head()

Unnamed: 0,account manager,account manager it solutions,analyst,application developer,architect,assistant manager,assistant professor,big data,big data developer,big data engineer,...,working knowledge,written communication,written communication skills,written verbal,years experience,years relevant,years relevant experience,years working,years working experience,high_low
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21871,0.233659,False
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21871,0.233659,True
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.20651,0.0,0.0,0.0,0.0,False
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.121946,0.0,0.0,0.0,0.0,0.0,False


In [3]:
jobs.shape

(1110, 890)

In [4]:
y = jobs.high_low
X = jobs.drop(columns = 'high_low')

In [5]:
#Baseline is 0.509 with just over half of salary_mid being under $6500 a month
baseline = max(1-sum(y)/len(y), sum(y)/len(y))
print(baseline)

0.509009009009009


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
lr = LogisticRegression(solver='newton-cg').fit(X_train, y_train)
cross_val_score(lr, X_train, y_train, cv=4, scoring="accuracy")

array([0.79372197, 0.8018018 , 0.71171171, 0.81900452])

In [8]:
y_train_pred = cross_val_predict(lr, X_train, y_train, cv=4)
print('confusion matrix\n', confusion_matrix(y_train, y_train_pred))
print('precision score', precision_score(y_train, y_train_pred))
print('recall score', recall_score(y_train, y_train_pred))
print('f1 score', f1_score(y_train, y_train_pred))
print('aoc score', roc_auc_score(y_train, y_train_pred))

confusion matrix
 [[357 100]
 [ 94 337]]
precision score 0.7711670480549199
recall score 0.7819025522041764
f1 score 0.7764976958525346
aoc score 0.7815420857300969


In [9]:
y_test_pred = cross_val_predict(lr, X_test, y_test, cv=4)
print('confusion matrix\n', confusion_matrix(y_test, y_test_pred))
print('precision score', precision_score(y_test, y_test_pred))
print('recall score', recall_score(y_test, y_test_pred))
print('f1 score', f1_score(y_test, y_test_pred))
print('aoc score', roc_auc_score(y_test, y_test_pred))

confusion matrix
 [[76 32]
 [34 80]]
precision score 0.7142857142857143
recall score 0.7017543859649122
f1 score 0.7079646017699114
aoc score 0.702729044834308


In [10]:
rnd_clf = RandomForestClassifier(n_estimators=400, max_leaf_nodes=100, n_jobs=-2)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [11]:
print('confusion matrix\n', confusion_matrix(y_test, y_pred_rf))
print('precision score', precision_score(y_test, y_pred_rf))
print('recall score', recall_score(y_test, y_pred_rf))
print('f1 score', f1_score(y_test, y_pred_rf))
print('aoc score', roc_auc_score(y_test, y_pred_rf))

confusion matrix
 [[84 24]
 [28 86]]
precision score 0.7818181818181819
recall score 0.7543859649122807
f1 score 0.7678571428571429
aoc score 0.7660818713450293


In [12]:
best_feat = pd.DataFrame(rnd_clf.feature_importances_).sort_values(by=[0], ascending = False).head(8).index
pd.DataFrame(rnd_clf.feature_importances_).sort_values(by=[0], ascending = False).head(8)

Unnamed: 0,0
480,0.022533
884,0.01779
482,0.013002
689,0.012962
481,0.012118
594,0.012096
483,0.011787
719,0.010914


In [13]:
X_test.columns[best_feat]

Index(['Executive', 'years experience', 'Junior Executive', '10 years',
       'Fresh/entry level', 'Microsoft Office', 'Manager', 'computer science'],
      dtype='object')

In [14]:
svmclf = SVC(kernel='linear')  
svmclf.fit(X_train, y_train)
y_pred_svm = svmclf.predict(X_test)

In [15]:
print('confusion matrix\n', confusion_matrix(y_test, y_pred_svm))
print('precision score', precision_score(y_test, y_pred_svm))
print('recall score', recall_score(y_test, y_pred_svm))
print('f1 score', f1_score(y_test, y_pred_svm))
print('aoc score', roc_auc_score(y_test, y_pred_svm))

confusion matrix
 [[81 27]
 [27 87]]
precision score 0.7631578947368421
recall score 0.7631578947368421
f1 score 0.7631578947368421
aoc score 0.756578947368421


In [16]:
mnvclf = MultinomialNB()  
mnvclf.fit(X_train, y_train)
y_pred_mnv = mnvclf.predict(X_test)

In [17]:
print('confusion matrix\n', confusion_matrix(y_test, y_pred_mnv))
print('precision score', precision_score(y_test, y_pred_mnv))
print('recall score', recall_score(y_test, y_pred_mnv))
print('f1 score', f1_score(y_test, y_pred_mnv))
print('aoc score', roc_auc_score(y_test, y_pred_mnv))

confusion matrix
 [[84 24]
 [39 75]]
precision score 0.7575757575757576
recall score 0.6578947368421053
f1 score 0.704225352112676
aoc score 0.7178362573099415
