In [1]:
import os
import pandas as pd
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import random
from category_encoders import LeaveOneOutEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
%matplotlib inline

In [2]:
print(os.getcwd())
os.chdir('/Users/liuyang/Desktop/Springboard_Capstone/data') 
print(os.getcwd())

/Users/liuyang/Desktop/Springboard_Capstone/notebooks
/Users/liuyang/Desktop/Springboard_Capstone/data


In [3]:
df = pd.read_csv('preprocessed_train.csv')

In [4]:
df.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,city_149,0.689,1,1,0,1,1,1,2,4,1,106,0
1,city_83,0.923,1,1,0,1,1,3,1,2,1,69,0
2,city_16,0.91,1,1,0,1,1,2,1,1,1,4,0
3,city_64,0.666,1,1,0,1,1,3,1,4,1,26,0
4,city_100,0.887,1,0,0,1,1,2,1,4,1,88,1


In [5]:
X = df.drop(['target'],axis=1)
y = df['target']

In [6]:
# Use LeaveOneOutEncoder to encode 'city'
enc = LeaveOneOutEncoder(cols=['city'])

In [7]:
X_enc = enc.fit_transform(X,y)

In [8]:
X_enc.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,0.125,0.689,1,1,0,1,1,1,2,4,1,106
1,0.099291,0.923,1,1,0,1,1,3,1,2,1,69
2,0.093164,0.91,1,1,0,1,1,2,1,1,1,4
3,0.101695,0.666,1,1,0,1,1,3,1,4,1,26
4,0.159259,0.887,1,0,0,1,1,2,1,4,1,88


In [9]:
X_scaled = StandardScaler().fit_transform(X_enc)

In [10]:
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=0.40,random_state=3)

In [11]:
y_test.shape

(7344,)

In [12]:
logreg=LogisticRegression()

In [13]:
logreg.fit(X_train,y_train)

LogisticRegression()

In [14]:
y_pred=logreg.predict(X_test)

In [15]:
metrics.confusion_matrix(y_test, y_pred,labels=[1,0])

array([[   0,  944],
       [   0, 6400]])

### The model end up predicting no positive class, as the target data is very imbalanced(6:1)
### Therefore I tried to change the threshold from default to 0.2

In [16]:
# try to change the cutoff value from 0.5 to 0.2
y_pred_adj = np.where(logreg.predict_proba(X_test)[:,1] > 0.2, 1, 0)

In [17]:
metrics.confusion_matrix(y_test, y_pred_adj,labels=[1,0])

array([[ 259,  685],
       [ 685, 5715]])

In [18]:
y_pred_adj.sum()

944

In [19]:
y_test.sum()

944

In [20]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_adj))
print("Precision:",metrics.precision_score(y_test, y_pred_adj))
print("Recall:",metrics.recall_score(y_test, y_pred_adj))

Accuracy: 0.8134531590413944
Precision: 0.274364406779661
Recall: 0.274364406779661


### As you can see the accuracy looks okay, but Recall and Precision is very low. In this case Recall = Precision

In [21]:
df_normalized=(df.drop('city',axis=1)-df.drop('city',axis=1).mean())/df.drop('city',axis=1).std()
cov_matrix=df_normalized.cov()
cov_matrix

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
city_development_index,1.0,0.031236,0.063763,-0.165145,-0.018126,-0.095845,0.28246,0.106869,-0.055009,0.171022,-0.006193,-0.1357
gender,0.031236,1.0,-0.030756,0.002148,0.025354,-0.071997,-0.047715,0.01413,-0.042085,-0.010614,-0.000299,0.015446
relevent_experience,0.063763,-0.030756,1.0,-0.321089,0.306221,0.016964,0.329443,0.161266,0.006071,0.304668,0.006397,-0.074088
enrolled_university,-0.165145,0.002148,-0.321089,1.0,-0.176097,0.087867,-0.318575,-0.106847,-0.025572,-0.200541,0.005139,0.077647
education_level,-0.018126,0.025354,0.306221,-0.176097,1.0,-0.125811,0.22221,0.158417,-0.069167,0.250441,-0.00165,0.021681
major_discipline,-0.095845,-0.071997,0.016964,0.087867,-0.125811,1.0,-0.005319,0.002061,0.021276,-0.050299,0.001695,0.011687
experience,0.28246,-0.047715,0.329443,-0.318575,0.22221,-0.005319,1.0,0.14426,-0.010271,0.427211,0.000596,-0.08249
company_size,0.106869,0.01413,0.161266,-0.106847,0.158417,0.002061,0.14426,1.0,-0.045344,0.186941,-0.022222,-0.040886
company_type,-0.055009,-0.042085,0.006071,-0.025572,-0.069167,0.021276,-0.010271,-0.045344,1.0,-0.044452,0.006705,0.004348
last_new_job,0.171022,-0.010614,0.304668,-0.200541,0.250441,-0.050299,0.427211,0.186941,-0.044452,1.0,-0.007452,-0.029902
