In [1]:
## INSTRUCTIONS ##

use_dir = 'C:/Users/Anna/Documents/Github/analysis-exercise/data/' # change me!
## note - the above filepath should be the only item you will have to change in order to run the notebook
## from here on, run everything sequentially and it should work
## as long as scikit-learn, pandas, scipy and statsmodels are installed
## this is using python 2.7

In [2]:
## INITIAL SETUP

## importing packages
import pandas as pd
import scipy.stats as ss
import statsmodels.stats as sts
import statsmodels.api
import os
from __future__ import division

## modeling packages (scikit-learn)
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

##changing directory and adjusting display settings
os.chdir(use_dir)
pd.options.display.max_rows = 1000

## read in data
hires = pd.read_csv('data_exercise_hires.csv')
applicants = pd.read_csv('data_exercise_applicants.csv')

## Part 1: Data Exploration

In [3]:
## Which organizations have the greatest applicant-to-hire ratio?
raw_hires = hires.groupby('client').count()['user_id']
raw_apps = applicants.groupby('client_name').count()['user_id']
app_to_hire = pd.concat([raw_hires, raw_apps], axis = 1)
app_to_hire.columns = ['raw_hires','raw_apps']
app_to_hire.pct = app_to_hire.raw_hires / app_to_hire.raw_apps
app_to_hire.pct.sort_values(ascending=False)

client9     1.536424
client10    0.452661
client4     0.276627
client11    0.249813
client1          NaN
client2          NaN
client3          NaN
client5          NaN
client6          NaN
client7          NaN
client8          NaN
dtype: float64

In [4]:
## check distinct clients in applicant data
applicants.groupby('client_name').count()['user_id']

client_name
client10    2123
client11    6701
client4      676
client9      453
Name: user_id, dtype: int64

In [5]:
## Which organizations have the greatest challenge with attrition in the first 3 months of employment?

## looking at tenure <= 90 days for employees who are no longer employed (assuming this is roughly 3 months)
first_three_mos = hires[(hires.tenure_length <= 90) & (hires.currently_employed == 'N')].groupby('client').count()['tenure_length']
total_hires = hires[hires.currently_employed == 'N'].groupby('client').count()['tenure_length']

# get the percent of no-longer-employed individuals who turned over in three months
first3mos = pd.concat([first_three_mos, total_hires], axis=1)
first3mos.columns = ['tenure_less_3mos', 'all_tenure']
first3mos['pct'] = first3mos.tenure_less_3mos / first3mos.all_tenure

first3mos['pct'].sort_values(ascending = False)

client
client1     0.472527
client2     0.221698
client4     0.194969
client5     0.143089
client7     0.132275
client6     0.116279
client11    0.100709
client3     0.060787
client10    0.059113
client9     0.050167
client8     0.025532
Name: pct, dtype: float64

## Part 2: Data Analysis

In [6]:
# Do 6 month attrition rates vary significantly across different job categories?
for job in hires.hire_job_category.unique():
    less_60 = hires[(hires.tenure_length <= 60) & (hires.hire_job_category == job)].count()['user_id']
    total = hires[hires.hire_job_category==job].count()['user_id']
    pct = less_60 / total
    print job+'\n\ttotal hires: '+str(total)+'\n\tpct 6 mo attrition: '+str(round(pct,3))

nursing
	total hires: 3102
	pct 6 mo attrition: 0.057
administrative
	total hires: 300
	pct 6 mo attrition: 0.047
patient_care
	total hires: 1694
	pct 6 mo attrition: 0.069
housekeeping
	total hires: 317
	pct 6 mo attrition: 0.107
aide
	total hires: 292
	pct 6 mo attrition: 0.116
dietary
	total hires: 432
	pct 6 mo attrition: 0.12
sales
	total hires: 75
	pct 6 mo attrition: 0.067
director
	total hires: 28
	pct 6 mo attrition: 0.0
other
	total hires: 130
	pct 6 mo attrition: 0.077
nursing_assistant
	total hires: 610
	pct 6 mo attrition: 0.107
lab
	total hires: 136
	pct 6 mo attrition: 0.029


In [7]:
## z-test for significance, 1 vs rest
hires['attrition'] = 0
hires.ix[hires.tenure_length<=60, 'attrition'] = 1

for job in hires.hire_job_category.unique():
    job_attr = hires[hires.hire_job_category == job]['attrition']
    other_attr = hires[hires.hire_job_category != job]['attrition']
    attr_ztest = sts.weightstats.ztest(job_attr, x2=other_attr, value=0,alternative='two-sided')
    print job,attr_ztest

nursing (-4.2781491306386332, 1.8845375269145253e-05)
administrative (-1.731750083528518, 0.083318069143836854)
patient_care (-0.52604172469593202, 0.59885921691845856)
housekeeping (2.4893535875524004, 0.01279756176434018)
aide (3.0056818003483796, 0.0026498613648733853)
dietary (4.0224870619380075, 5.7586818450984734e-05)
sales (-0.17800028154061429, 0.85872275463398706)
director (-1.4762912803229575, 0.13986571395700917)
other (0.22140721994439619, 0.82477537122322142)
nursing_assistant (3.4617021714479463, 0.00053677079666752123)
lab (-1.9386334805047032, 0.052545983734292312)


In [8]:
## for next part, combine data sets
combined_data = applicants.join(hires, on='user_id', how='outer', rsuffix = '_hires')

## check quality of join
len(combined_data[(combined_data.user_id_hires.notnull()) & (combined_data.user_id.notnull())])

7116

In [9]:
## How does the type of device used to take the questionnaire impact an applicant’s tenure length, if at all?
for d in combined_data.device.unique():
    unique_device = combined_data[combined_data.device == d]['tenure_length']
    other_devices = combined_data[(combined_data.device.notnull())&(combined_data.device != d)]['tenure_length']
    device_ttest = ss.ttest_ind(unique_device, other_devices)
    print d,device_ttest

nan Ttest_indResult(statistic=nan, pvalue=nan)
iPhone Ttest_indResult(statistic=nan, pvalue=nan)
Samsung SM-N900T Ttest_indResult(statistic=nan, pvalue=nan)
ZTE Z987 Ttest_indResult(statistic=nan, pvalue=nan)
Samsung SPH-L720 Ttest_indResult(statistic=nan, pvalue=nan)
Other Ttest_indResult(statistic=nan, pvalue=nan)
Samsung SM-G900T Ttest_indResult(statistic=nan, pvalue=nan)
LG-D415 Ttest_indResult(statistic=nan, pvalue=nan)
iPad Ttest_indResult(statistic=nan, pvalue=nan)
HTC Desire 510 Ttest_indResult(statistic=nan, pvalue=nan)
Samsung SM-G900V Ttest_indResult(statistic=nan, pvalue=nan)
Samsung SCH-I415 Ttest_indResult(statistic=nan, pvalue=nan)
XT1254 Ttest_indResult(statistic=nan, pvalue=nan)
Samsung SM-T320 Ttest_indResult(statistic=nan, pvalue=nan)
Samsung SM-N910P Ttest_indResult(statistic=nan, pvalue=nan)
N9520 Ttest_indResult(statistic=nan, pvalue=nan)
Samsung SPH-L710 Ttest_indResult(statistic=nan, pvalue=nan)
Alcatel One Touch Fierce Ttest_indResult(statistic=nan, pvalue=nan)



In [10]:
## check to see if we actually have any data to work with
len(combined_data[(combined_data['device'].notnull())&(combined_data['tenure_length'].notnull())])

0

## Part 3: Predictive Modeling

In [11]:
## PRE-PROCESSING ##

## create flags for categorical variables: job category, client, current employ
for job in combined_data[combined_data.hire_job_category.notnull()].hire_job_category.unique():
    combined_data['category_'+job] = combined_data.hire_job_category.map(lambda x: x == job).astype(int)

for client in combined_data[combined_data.client.notnull()].client.unique():
    combined_data[client] = combined_data.client.map(lambda x: x == client).astype(int)
                         
combined_data['current_employ_flag'] = combined_data.currently_employed.map(lambda x: x == 'Y').astype(int)
                         
## create outcome(y) variable
combined_data['tenure'] = 'none'
combined_data.ix[combined_data.tenure_length<182, 'tenure'] = 'less6'
combined_data.ix[(combined_data.tenure_length>=182)&(combined_data.tenure_length<365), 'tenure'] = '6_12'                         
combined_data.ix[combined_data.tenure_length>=365, 'tenure'] = '12plus'
                         
## variables to not include as model inputs
drop_columns = ['device','tenure_length','client_name','client','user_id','user_id_hires','currently_employed']                         

#separate out into train/test and X,y 
#drop undesired variables, exclude non-numeric features for redundancy in case any were missed above
data = dict()
data['X'] = combined_data[combined_data.tenure != 'none'].drop(drop_columns, axis=1).fillna(0).select_dtypes(exclude=[object]).copy()
data['y'] = combined_data[combined_data.tenure != 'none']['tenure'].copy()

Xtrain, Xtest, ytrain, ytest = train_test_split(data['X'], data['y'], test_size=0.30, random_state=21)

In [12]:
## FEATURE SELECTION ##

# setting up models + hyperparameters
rf = RandomForestClassifier(n_estimators=200, random_state=21)
lm = LogisticRegression(fit_intercept=True, random_state=21)

# feature selection
model_types = [rf, lm]
names = ['random forest', 'logistic regression']
x_train = dict()
x_test = dict()
for mtype, name in zip(model_types,names):
    select = SelectFromModel(estimator=mtype,threshold=0.02, prefit=False)
    select.fit_transform(Xtrain,ytrain)
    features = select.get_support()
    x_train[mtype] = Xtrain[Xtrain.columns[features]]
    x_test[mtype] = Xtest[Xtest.columns[features]]
    print name, Xtrain.columns[features]
    print 'number of features: {}'.format(len(Xtrain.columns[features]))

random forest Index([u'attrition', u'category_nursing', u'client3', u'client8', u'client1',
       u'client2', u'current_employ_flag'],
      dtype='object')
number of features: 7
logistic regression Index([u'answer1', u'answer2', u'answer3', u'answer4', u'answer5', u'answer6',
       u'answer7', u'answer8', u'answer9', u'answer10', u'answer11',
       u'answer12', u'answer13', u'answer14', u'answer15', u'answer16',
       u'answer17', u'answer18', u'answer19', u'answer20', u'answer21',
       u'answer22', u'answer23', u'answer24', u'answer25', u'log_total_time',
       u'attrition', u'category_other', u'category_nursing',
       u'category_dietary', u'category_patient_care',
       u'category_nursing_assistant', u'category_aide',
       u'category_housekeeping', u'category_lab', u'category_administrative',
       u'category_sales', u'category_director', u'client3', u'client4',
       u'client5', u'client7', u'client8', u'client9', u'client10',
       u'client11', u'client1', u'client2

In [13]:
## MODELING ## 

# fit the models
models = dict()
for mtype in model_types:
    X_train = x_train[mtype]
    models[mtype] = mtype.fit(X_train, ytrain)

In [14]:
# get predictions based on the models
preds = dict()
for mtype in model_types:
    X_test = x_test[mtype]
    preds[mtype] = models[mtype].predict(X_test)

In [15]:
# get labels for outcomes
labels = ytest.sort_values().unique()

In [16]:
## MODEL EVALUATION ##

# precision, recall, F-beta scores and print a confusion matrix
for mtype, name in zip(model_types, names):
    scores = metrics.precision_recall_fscore_support(ytest, preds[mtype])
    print '\n'+name+':'
    print labels[0],scores[0]
    print labels[1],scores[1]
    print labels[2],scores[2]
    print pd.DataFrame(metrics.confusion_matrix(ytest, preds[mtype]),index=labels,columns=labels)


random forest:
12plus [ 0.70845481  0.47337278  0.83544304]
6_12 [ 0.9559402   0.17857143  0.49253731]
less6 [ 0.81379772  0.25931929  0.61971831]
        12plus  6_12  less6
12plus    1215    39     17
6_12       346    80     22
less6      154    50    198

logistic regression:
12plus [ 0.71085044  0.46491228  0.7384106 ]
6_12 [ 0.95357986  0.11830357  0.55472637]
less6 [ 0.81451613  0.1886121   0.63352273]
        12plus  6_12  less6
12plus    1212    33     26
6_12       342    53     53
less6      151    28    223
