In [88]:
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as ss
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pickle
import os
os.chdir('C:/Users/Anna/Documents/lab_at_dc_part_a/')

In [178]:
# reading in the data from csv
stops1 = pd.read_csv('stopsData1.csv', index_col='stop_id')
stops2 = pd.read_csv('stopsData2.csv', index_col='stop_id')
courts = pd.read_csv('courtData.csv')

In [179]:
# clean courts data up a bit
courts['stop_id'] = courts['stop_id'].str.replace(r'O', r'0')
courts['ticket_amount'] = courts['ticket_amount'].str.replace(r'$', r'')
courts['ticket_amount'] = courts['ticket_amount'].str.replace(r',', r'.')
courts['ticket_amount'] = courts['ticket_amount'].str.replace(r'O', r'0')
courts = courts.set_index('stop_id')
courts.index = pd.to_numeric(courts.index)
courts.ticket_amount = pd.to_numeric(courts.ticket_amount)

In [180]:
# concatting the files
stops = stops1.join(stops2, how='left')
stops = stops.join(courts, how='left')

In [181]:
#check to make sure it joined properly
stops[stops.status.notnull()].count()

officer_id       705
driver_id        705
driver_age       705
driver_sex       705
ticket           705
ward             705
ticket_amount    705
status           705
dtype: int64

In [182]:
ladies = stops[(stops.status.notnull()) & (stops.driver_sex == 'f') & (stops.ticket==1)]['ticket_amount']
gents = stops[(stops.status.notnull()) & (stops.driver_sex == 'm') & (stops.ticket==1)]['ticket_amount']

print "Average for Ladies: {}\nAverage for Gents: {}".format(np.average(ladies).round(0), np.average(gents).round(0))

Average for Ladies: 86.0
Average for Gents: 121.0


In [183]:
# test for difference
ss.ttest_ind(ladies, gents)

Ttest_indResult(statistic=-7.2260515430355765, pvalue=1.2981445070864148e-12)

In [184]:
print (np.average(gents) - np.average(ladies)) / np.average(ladies)

0.413728495997


In [185]:
##looking at ticket amount by ward
for i in np.arange(1,9,1):
    wards = stops[(stops.status.notnull()) & (stops.ward == i) & (stops.ticket==1)]['ticket_amount']
    other_wards = stops[(stops.status.notnull()) & (stops.ward != i) & (stops.ticket==1)]['ticket_amount']
    ward_ttests = ss.ttest_ind(wards, other_wards)
    print i, np.average(wards).round(0),ward_ttests      

1 117.0 Ttest_indResult(statistic=0.95760110749350813, pvalue=0.33859313879876785)
2 100.0 Ttest_indResult(statistic=-1.8047606653972181, pvalue=0.071539942609154805)
3 104.0 Ttest_indResult(statistic=-1.1953692994458311, pvalue=0.2323458521943387)
4 112.0 Ttest_indResult(statistic=0.10056120041265304, pvalue=0.91992746965271455)
5 122.0 Ttest_indResult(statistic=1.8610447248815549, pvalue=0.063155100641677522)
6 115.0 Ttest_indResult(statistic=0.51119878326485835, pvalue=0.60937237936937605)
7 113.0 Ttest_indResult(statistic=0.28939039786860504, pvalue=0.77236796572956212)
8 108.0 Ttest_indResult(statistic=-0.6223052141742057, pvalue=0.53394297693027115)


In [186]:
# looking at ticket amount by MPD officer
for x in stops['officer_id'].unique():
    officers = stops[(stops.status.notnull()) & (stops.officer_id == x) & (stops.ticket==1)]['ticket_amount']
    other_officers = stops[(stops.status.notnull()) & (stops.officer_id != x) & (stops.ticket==1)]['ticket_amount']
    officer_ttests = ss.ttest_ind(officers, other_officers)
    print x,np.average(officers).round(0),officer_ttests  

B 106.0 Ttest_indResult(statistic=-1.223970492998939, pvalue=0.22137340337137251)
A 114.0 Ttest_indResult(statistic=0.47544084880033077, pvalue=0.63462047393980503)
D 116.0 Ttest_indResult(statistic=1.055615143776597, pvalue=0.29150671442463538)
E 108.0 Ttest_indResult(statistic=-0.79462804550772614, pvalue=0.42709807206309669)
C 113.0 Ttest_indResult(statistic=0.4959975944058555, pvalue=0.62005113633472519)


In [187]:
# create dummy var for male
stops['sex_m'] = 0
stops.ix[stops.driver_sex == 'm', 'sex_m'] = 1

# create dummy vars for officer 
for j in stops['officer_id'].unique():
    stops['officer_'+j] = stops.officer_id.map(lambda x: x == j).astype(int)

In [192]:
#separate out into train/test and X,y
data = dict()
data['X'] = stops[stops.ticket == 1].drop(['driver_id','ticket'], axis=1).fillna(0).select_dtypes(exclude=[object]).copy()
data['y'] = stops[stops.ticket == 1]['status'].copy()

from sklearn.cross_validation import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(data['X'], data['y'], test_size=0.30, random_state=21)

In [193]:
Xtrain.head().T

stop_id,97,5814,3369,3170,4820
driver_age,43.0,48.0,48.0,59.0,67.0
ward,1.0,7.0,3.0,7.0,8.0
ticket_amount,120.0,145.0,190.0,105.0,50.0
sex_m,1.0,1.0,1.0,1.0,1.0
officer_B,0.0,0.0,0.0,0.0,1.0
officer_A,0.0,0.0,0.0,0.0,0.0
officer_D,1.0,0.0,0.0,0.0,0.0
officer_E,0.0,1.0,0.0,1.0,0.0
officer_C,0.0,0.0,1.0,0.0,0.0


In [212]:
# setting up for models and feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=21)
lm = LogisticRegression(fit_intercept=True, random_state=21)

model_types = [rf, lm]
x_train = dict()
for mtype in model_types:
    select = SelectFromModel(estimator=mtype,threshold=0.04, prefit=False)
    select.fit_transform(Xtrain,ytrain)
    features = select.get_support()
    x_train[mtype] = Xtrain[Xtrain.columns[features]]
    print mtype, Xtrain.columns[features]
    print 'number of features: {}'.format(len(Xtrain.columns[features]))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=21, verbose=0, warm_start=False) Index([u'driver_age', u'ward', u'ticket_amount', u'sex_m'], dtype='object')
number of features: 4
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=21, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) Index([u'driver_age', u'ward', u'sex_m', u'officer_B', u'officer_A',
       u'officer_D', u'officer_E', u'officer_C'],
      dtype='object')
number of features: 8


In [232]:
# fit the models
models = dict()
for mtype in model_types:
    models[mtype] = mtype.fit(X_train, ytrain)

In [234]:
# get predictions based on the models
preds = dict()
for mtype in model_types:
    preds[mtype] = models[mtype].predict(X_test)

In [252]:
# get labels for outcomes
labels = y_test.sort_values().unique()

In [276]:
#evaluate model performance
from sklearn import metrics
for mtype in model_types:
    scores = metrics.precision_recall_fscore_support(y_test, preds[mtype])
    print labels[0],scores[0]
    print labels[1],scores[1]
    print labels[2],scores[2]
    print pd.DataFrame(metrics.confusion_matrix(y_test, preds[mtype]),index = labels,columns=labels)

challenged [ 0.56179775  0.125       0.09090909  0.25      ]
overdue [ 0.84033613  0.03333333  0.04761905  0.04761905]
paid [ 0.67340067  0.05263158  0.0625      0.08      ]
            challenged  overdue  paid  pending
challenged         100        4    14        1
overdue             23        1     5        1
paid                36        3     2        1
pending             19        0     1        1
challenged [ 0.56132075  0.          0.          0.        ]
overdue [ 1.  0.  0.  0.]
paid [ 0.71903323  0.          0.          0.        ]
            challenged  overdue  paid  pending
challenged         119        0     0        0
overdue             30        0     0        0
paid                42        0     0        0
pending             21        0     0        0
