In [29]:
###### import commands
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.utils import resample

In [30]:
#### read model
df=pd.read_csv('train.csv', encoding = 'ISO-8859-1', low_memory=False)
df=df.set_index('ticket_id')
dfOG=df.copy()

In [31]:
#### create new feature - time difference between ticket and hearing dates
tdiff=pd.to_datetime(dfOG['hearing_date'])-pd.to_datetime(dfOG['ticket_issued_date'])
dfOG['TimeDiff']=tdiff.dt.total_seconds()/86400

In [32]:
#### add month feature for issue and hearing date, also day and weekday
dfOG['hearing_month']=pd.to_datetime(dfOG['hearing_date']).dt.month
dfOG['issue_month']=pd.to_datetime(dfOG['ticket_issued_date']).dt.month

dfOG['hearing_day']=pd.to_datetime(dfOG['hearing_date']).dt.day
dfOG['issue_day']=pd.to_datetime(dfOG['ticket_issued_date']).dt.day

dfOG['hearing_dayofweek']=pd.to_datetime(dfOG['hearing_date']).dt.weekday
dfOG['issue_dayofweek']=pd.to_datetime(dfOG['ticket_issued_date']).dt.weekday


In [33]:
#### read latlon, addresses
latlon_df=pd.read_csv('latlons.csv')
add_df=pd.read_csv('addresses.csv')
add_df['address'] = add_df['address'].str.upper()

In [34]:
#### merge latlon, addresses on addresses column

# latlon_df['address']
latlon_df['address'] = latlon_df['address'].str.upper()
latlon_df.head()

df_info=add_df.merge(latlon_df,left_on='address',right_on='address')
df_info=df_info.set_index('ticket_id')
print('add={:d} + ll={:d} = {:d}'.format(len(add_df),len(latlon_df),len(df_info)))

# there should be 8 rows with nan lat/lon
# df_info['lon'].isna().sum() # =8
# df_info.head()


add=311307 + ll=121769 = 311307


In [35]:
#### merge latlon info into main df
df2=dfOG.merge(df_info,left_index=True,right_index=True, how='left')
print('og={:d} + info={:d} = {:d}'.format(len(dfOG),len(df_info),len(df2)))
dfOG=df2.copy()
df=df2.copy()

og=250306 + info=311307 = 250306


In [36]:
## what types of data are in here? the 'compliance' column is the target value (0 or 1)
y=df['compliance']
X=df.drop(['compliance'],axis=1)
X.iloc[0]

agency_name                      Buildings, Safety Engineering & Env Department
inspector_name                                                  Sims, Martinzie
violator_name                                 INVESTMENT INC., MIDWEST MORTGAGE
violation_street_number                                                    2900
violation_street_name                                                     TYLER
violation_zip_code                                                          NaN
mailing_address_str_number                                                    3
mailing_address_str_name                                              S. WICKER
city                                                                    CHICAGO
state                                                                        IL
zip_code                                                                  60606
non_us_str_code                                                             NaN
country                                 

In [37]:
## what is the cardinality of each column?
for col in X.columns:
    print('{} - {}'.format(col,len(X[col].value_counts())))
    

agency_name - 5
inspector_name - 173
violator_name - 119992
violation_street_number - 19175
violation_street_name - 1791
violation_zip_code - 0
mailing_address_str_number - 15826
mailing_address_str_name - 37896
city - 5184
state - 59
zip_code - 4251
non_us_str_code - 2
country - 5
ticket_issued_date - 86979
hearing_date - 6222
violation_code - 235
violation_description - 258
disposition - 9
fine_amount - 43
admin_fee - 2
state_fee - 2
late_fee - 37
discount_amount - 13
clean_up_cost - 1
judgment_amount - 57
payment_amount - 533
balance_due - 606
payment_date - 2307
payment_status - 3
collection_status - 1
grafitti_status - 1
compliance_detail - 10
TimeDiff - 33217
hearing_month - 12
issue_month - 12
hearing_day - 31
issue_day - 31
hearing_dayofweek - 6
issue_dayofweek - 7
address - 95598
lat - 78694
lon - 87249


In [249]:
# good ones - ['discount_amount', 'disposition', ]

var='agency_name'

ctTot=pd.crosstab(X[var], y, margins=True)
ctNorm=pd.crosstab(X[var], y, normalize='index', margins=True)
ctNorm
# X.issue_day.value_counts()
ctNorm['N']=X[var].value_counts()
ctNorm

compliance,0.0,1.0,N
agency_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Buildings, Safety Engineering & Env Department",0.939257,0.060743,157784.0
Department of Public Works,0.910039,0.089961,74717.0
Detroit Police Department,0.86828,0.13172,8900.0
Health Department,0.934149,0.065851,8903.0
Neighborhood City Halls,1.0,0.0,2.0
All,0.927464,0.072536,


In [180]:
## Get ready to plot bar graphs comparing compliant and non-compliant groups for different columns
# first split into compliant and non-compliant (this leaves out ~90k that are null - not responsible)
Xall=X.loc[(y==1) | (y==0)] # not null
yall=y[(y==1) | (y==0)]
X1=X.loc[y==1]
X0=X.loc[y==0]
y1=y[y==1]
y0=y[y==0]
# X0=X.loc[y==0] # non-compliant

print("Compliant: {}, non-compliant: {}, all: {}".format(len(X1),len(X0),len(Xall)))

# define column name
var='discount_amount'

# what are the unique values of this column
# vals=Xall[var].unique()
v1=X1[var].value_counts()
v0=X0[var].value_counts()

vcomb=pd.DataFrame(dict(noncomp=v0,comp=v1))
vcomb=vcomb.fillna(0)

vals=vcomb.index
vcomb


Compliant: 11597, non-compliant: 148283, all: 159880


Unnamed: 0,noncomp,comp
0.0,148226.0,10474
3.0,0.0,19
5.0,8.0,159
10.0,14.0,141
13.0,0.0,1
20.0,13.0,122
25.0,18.0,587
30.0,0.0,17
40.0,0.0,1
50.0,3.0,40


In [176]:
# bar plot comparing two sets
%matplotlib notebook
import matplotlib as mpl
import matplotlib.pyplot as plt

xpos=np.arange(len(vals))

# scaled by max value
# fig, (ax1, ax2) = plt.subplots(2,1,sharex=True,sharey=True)

# ax1.bar(xpos,(vcomb['noncomp']/np.max(vcomb['noncomp'])))
# ax1.set_title('Non-compliant')

# ax2.bar(xpos,(vcomb['comp']/np.max(vcomb['comp'])))
# ax2.set_title('Compliant')

# log10 of values, not very accurate for comparison
fig, (ax1, ax2) = plt.subplots(2,1,sharex=True)

ax1.bar(xpos,np.log10(vcomb['noncomp']))
ax1.set_title('Non-compliant')

ax2.bar(xpos,np.log10(vcomb['comp']))
ax2.set_title('Compliant')
ax2.set_xticks(xpos)
ax2.set_xticklabels(vals)
ax2.xaxis.set_tick_params(rotation=45)

vals

<IPython.core.display.Javascript object>



Float64Index([0.0, 3.0, 5.0, 10.0, 13.0, 20.0, 25.0, 30.0, 40.0, 50.0, 100.0,
              250.0, 350.0],
             dtype='float64')

In [170]:
pd.DataFrame(dict(noncomp=v0,comp=v1))


Unnamed: 0,noncomp,comp
0.0,148226.0,10474
3.0,,19
5.0,8.0,159
10.0,14.0,141
13.0,,1
20.0,13.0,122
25.0,18.0,587
30.0,,17
40.0,,1
50.0,3.0,40


In [97]:
# X['violator_name'].value_counts()
Xtmp=X.loc[X['fine_amount']==10000]

## find mode of this subset
from statistics import mode 
for col in Xtmp.columns:
    # first get mode
    x1=Xtmp[col].value_counts()
    x2=X[col].value_counts()
    xall=pd.DataFrame(dict(sml=x1, big=x2)).reset_index()
    
    print('************** ' + col + ' **************')
    print(x1)
#     print('{} {}'.format(col,xx))


************** agency_name **************
Department of Public Works    305
Health Department              36
Detroit Police Department      15
Neighborhood City Halls         1
Name: agency_name, dtype: int64
************** inspector_name **************
Hayes, Billy J         85
Zizi, Josue            45
Tidwell, Rhonda        22
Langston, Yolanda      18
Davis, Darlene         16
Frazier, Willie        14
Cato, Valesta          13
Williamson, Lillett    12
Havard, Jacqueline     12
Brooks, Eric           11
Buchanan, Daryl        10
Jones, Derron M         9
Johnson, Lois           9
Houston, Doris          9
Moore, David            8
Gibson, Christopher     6
Funchess, Mitchell      5
McCants, Angela         4
Traylor, Alva           4
McClain, Melvin         4
Carver, Gharian         4
OBannon, James          3
Jones, Leah             3
Lusk, Gertrina          3
Gray, Paul              3
Fountain, Michael       2
Mathis, Marlena         2
Forte, Laurie           2
Talbert, Reginald

************** state **************
MI    297
CA     12
TX     10
FL      9
SC      5
PA      3
IL      3
NY      2
VA      2
NJ      2
UT      2
OK      2
GA      1
MN      1
IN      1
AR      1
NV      1
SD      1
OH      1
OR      1
Name: state, dtype: int64
************** zip_code **************
48227    25
48235    20
48228    16
48219    15
48205    13
48037    12
48224    12
48209    10
48223     9
48204     8
48076     7
48075     6
48213     6
48034     6
48238     6
48234     6
48221     5
76137     5
48202     5
48206     5
48322     5
48217     4
48207     4
48215     4
48126     4
48212     3
48170     3
48226     3
48210     3
48098     3
         ..
48216     1
15230     1
57701     1
48105     1
92618     1
48823     1
48114     1
34205     1
72956     1
48324     1
34116     1
91107     1
75093     1
48048     1
90043     1
46256     1
48067     1
48080     1
19054     1
48141     1
29054     1
48010     1
97005     1
48201     1
15272     1
29210     1
84414     1
840

************** collection_status **************
IN COLLECTION    80
Name: collection_status, dtype: int64
************** grafitti_status **************
Series([], Name: grafitti_status, dtype: int64)
************** compliance_detail **************
non-compliant by no payment                        187
not responsible by disposition                     162
non-compliant by late payment more than 1 month      7
compliant by late payment within 1 month             1
Name: compliance_detail, dtype: int64
************** TimeDiff **************
20.895833     3
22.111111     2
17.895833     2
33.895833     2
27.031250     2
87.895833     2
234.802083    2
35.965278     2
21.937500     2
122.840278    2
29.027778     2
40.020833     2
110.975694    2
102.815972    1
77.906250     1
27.020833     1
218.104167    1
31.052083     1
146.958333    1
101.913194    1
40.795139     1
10.017361     1
39.836806     1
112.895833    1
331.958333    1
29.104167     1
35.166667     1
103.958333    1
115.986

0    GRAND RIVER
dtype: object

In [185]:
Xall[colnames[5]].value_counts()

305.0      79621
85.0       17655
140.0      12516
250.0      11355
280.0       7177
580.0       6545
1130.0      4803
3880.0      3785
360.0       3680
130.0       2972
80.0        2760
2780.0      1506
230.0       1356
57.5        1223
167.5        723
530.0        373
1680.0       251
855.0        217
0.0          195
11030.0      195
1030.0       162
55.0         155
415.0        126
330.0         88
5530.0        85
3530.0        74
155.0         70
1350.0        40
2530.0        39
470.0         36
2230.0        14
7730.0        13
1530.0        13
780.0         12
690.0         10
3330.0         4
430.0          3
272.0          3
206.0          2
2030.0         2
134.5          2
1955.0         2
380.0          2
1405.0         2
217.0          1
1060.0         1
300.0          1
8830.0         1
1097.0         1
31.1           1
50.0           1
180.0          1
525.0          1
2994.5         1
750.5          1
774.7          1
195.0          1
Name: judgment_amount, dtype: i

In [220]:
# scatterplots showing percentage non-compliant for different features
# only do numeric ones
colnames=['fine_amount','discount_amount','late_fee','judgment_amount']

%matplotlib notebook
import matplotlib as mpl
import matplotlib.pyplot as plt

cmap=mpl.cm.jet
fig, axes = plt.subplots(len(colnames),1)

for i, col in enumerate(colnames):
    ctTot=pd.crosstab(Xall[col], y, margins=True)
    ctNorm=pd.crosstab(Xall[col], y, normalize='index')
    ctNorm['N']=Xall[col].value_counts()
    
    ax=axes[i]
    sc=ax.scatter(ctNorm.index,ctNorm[0.0],c=np.log10(ctNorm['N']),cmap=cmap)
    ax.set_title(col)
    plt.colorbar(sc,ax=ax)
    
plt.tight_layout()

<IPython.core.display.Javascript object>

In [246]:
Xall.columns

Index(['agency_name', 'inspector_name', 'violator_name',
       'violation_street_number', 'violation_street_name',
       'violation_zip_code', 'mailing_address_str_number',
       'mailing_address_str_name', 'city', 'state', 'zip_code',
       'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date',
       'violation_code', 'violation_description', 'disposition', 'fine_amount',
       'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'payment_amount', 'balance_due',
       'payment_date', 'payment_status', 'collection_status',
       'grafitti_status', 'compliance_detail', 'TimeDiff', 'hearing_month',
       'issue_month', 'hearing_day', 'issue_day', 'hearing_dayofweek',
       'issue_dayofweek', 'address', 'lat', 'lon'],
      dtype='object')

In [248]:
# scatterplots comparing the different numeric features
colnames=['fine_amount','discount_amount','late_fee','judgment_amount','TimeDiff']

%matplotlib notebook
import matplotlib as mpl
import matplotlib.pyplot as plt


fig, axes = plt.subplots(len(colnames),len(colnames),sharex='col',sharey='row')

# plot everything against eachother
for i, icol in enumerate(colnames):     # loop over the rows
    for j, jcol in enumerate(colnames): # loop over the columns
        ax=axes[i,j]
        print("Plotting {}:{} in ({},{})".format(icol,jcol,i,j))
        ax.plot(X1[jcol],X1[icol],'b.')
        ax.plot(X0[jcol],X0[icol],'r.')


<IPython.core.display.Javascript object>

Plotting fine_amount:fine_amount in (0,0)
Plotting fine_amount:discount_amount in (0,1)
Plotting fine_amount:late_fee in (0,2)
Plotting fine_amount:judgment_amount in (0,3)
Plotting fine_amount:TimeDiff in (0,4)
Plotting discount_amount:fine_amount in (1,0)
Plotting discount_amount:discount_amount in (1,1)
Plotting discount_amount:late_fee in (1,2)
Plotting discount_amount:judgment_amount in (1,3)
Plotting discount_amount:TimeDiff in (1,4)
Plotting late_fee:fine_amount in (2,0)
Plotting late_fee:discount_amount in (2,1)
Plotting late_fee:late_fee in (2,2)
Plotting late_fee:judgment_amount in (2,3)
Plotting late_fee:TimeDiff in (2,4)
Plotting judgment_amount:fine_amount in (3,0)
Plotting judgment_amount:discount_amount in (3,1)
Plotting judgment_amount:late_fee in (3,2)
Plotting judgment_amount:judgment_amount in (3,3)
Plotting judgment_amount:TimeDiff in (3,4)
Plotting TimeDiff:fine_amount in (4,0)
Plotting TimeDiff:discount_amount in (4,1)
Plotting TimeDiff:late_fee in (4,2)
Plotting 

In [227]:
X1.head()

Unnamed: 0_level_0,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,violation_zip_code,mailing_address_str_number,mailing_address_str_name,city,state,...,TimeDiff,hearing_month,issue_month,hearing_day,issue_day,hearing_dayofweek,issue_dayofweek,address,lat,lon
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Michigan, Covenant House",4311.0,CENTRAL,,2959.0,Martin Luther King,Detroit,MI,...,378.041667,5.0,4,6.0,23,4.0,4,"4311 CENTRAL, DETROIT MI",42.326937,-83.135118
18746,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Klenk, Dean",14715.0,KLENK,,14715.0,Klenk,Detroit,MI,...,138.125,2.0,10,22.0,7,1.0,3,"14715 KLENK, DETROIT MI",42.360836,-82.930958
18744,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Kuhn, Dean",14715.0,KLENK,,141715.0,Klenk,Detroit,MI,...,138.125,2.0,10,22.0,7,1.0,3,"14715 KLENK, DETROIT MI",42.360836,-82.930958
18657,"Buildings, Safety Engineering & Env Department","Schuman, Randy",", APB LAUNDRY",15439.0,MACK,,15439.0,MACK,DETROIT,MI,...,35.208333,2.0,1,18.0,14,4.0,4,"15439 MACK, DETROIT MI",42.388282,-82.942
18653,"Buildings, Safety Engineering & Env Department","Gailes, Orbie J","Garden Court Apt.s, na",2900.0,JEFFERSON,,29.0,E. Jefferson,Det,MI,...,39.739583,2.0,1,23.0,14,2.0,4,"2900 JEFFERSON, DETROIT MI",42.339391,-83.019373


In [221]:
# list(ctNorm.index)

In [192]:
ctTot=pd.crosstab(Xall[col], y, margins=True)
ctNorm=pd.crosstab(Xall[col], y, normalize='index', margins=True)
ctNorm['N']=Xall[col].value_counts()
ctNorm

compliance,0.0,1.0,N
judgment_amount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.0,1.0,195.0
31.1,0.0,1.0,1.0
50.0,0.0,1.0,1.0
55.0,0.516129,0.483871,155.0
57.5,0.973835,0.026165,1223.0
80.0,0.696739,0.303261,2760.0
85.0,0.942622,0.057378,17655.0
130.0,0.682705,0.317295,2972.0
134.5,1.0,0.0,2.0
140.0,0.930409,0.069591,12516.0


In [70]:
## read test model
dftest=pd.read_csv('test.csv')
dftest=dftest.set_index('ticket_id')

In [72]:
dftest.iloc[0]

agency_name                                          Department of Public Works
inspector_name                                               Granberry, Aisha B
violator_name                                                  FLUELLEN, JOHN A
violation_street_number                                                   10041
violation_street_name                                                 ROSEBERRY
violation_zip_code                                                          NaN
mailing_address_str_number                                                  141
mailing_address_str_name                                              ROSEBERRY
city                                                                    DETROIT
state                                                                        MI
zip_code                                                                  48213
non_us_str_code                                                             NaN
country                                 

In [243]:

dftest['disposition'].value_counts()


Responsible by Default                51602
Responsible by Admission               4484
Responsible by Determination           4124
Responsible (Fine Waived) by Deter      781
Responsible - Compl/Adj by Default        6
Responsible - Compl/Adj by Determi        2
Responsible by Dismissal                  1
Responsible (Fine Waived) by Admis        1
Name: disposition, dtype: int64