In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import numpy as np
# Sentinel string for missing categorical values. The spelling now matches the
# '*_dummy_replacement_value' column suffixes that appear in label_analysis.csv;
# the previous value, 'dummy_repalcement_value', was a typo.
replacement_value = 'dummy_replacement_value'
# Directory that holds tickets.csv. This is a machine-specific absolute path:
# adjust it before running the notebook elsewhere.
path = '/home/td/Documents'

# Problem structure

One way to set up the problem is to treat it as a binary classification task. This has the advantage of simplicity, but it assumes that the large makes share characteristics, which is not always true. In this pass I solved the problem this way.

A more thorough approach would be to predict the make itself (or a group of makes sharing similar characteristics) and then derive from those predictions the probability that the make is in the top 25.

# Data problems

In [5]:
# Label problems
path = '/home/td/Documents'
tickets_csv = '{path}/tickets.csv'.format(path=path)
df = pd.read_csv(tickets_csv, low_memory=False)

# Count tickets per make once and reuse the result for both the list of the
# 25 most frequent makes and the displayed table.
make_counts = df['Make'].value_counts()
top_25_makes = make_counts.index[:25].tolist()
make_counts.iloc[:25]

TOYT    721411
HOND    491961
FORD    382695
NISS    311324
CHEV    297076
BMW     199221
MERZ    177307
VOLK    149501
HYUN    133864
DODG    127764
LEXS    124508
KIA     101746
JEEP    100909
AUDI     84229
MAZD     79853
OTHR     72411
GMC      62391
CHRY     57317
INFI     56809
ACUR     52703
SUBA     46898
VOLV     42330
TOYO     40064
MITS     37842
CADI     34080
Name: Make, dtype: int64

Curation will be required: there is at least one duplicate (Toyota appears as both TOYT and TOYO), and there are some invalid makes such as OTHR.

In [6]:
# 99999.0 is treated as a "missing" placeholder in both coordinate columns and
# is replaced with NaN.
for coord_col in ('Latitude', 'Longitude'):
    df.loc[:, coord_col] = df.loc[:, coord_col].replace(99999.0, np.nan)


In [7]:


# Share of null entries per column, expressed as a percentage of all rows.
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df

Unnamed: 0,column_name,percent_missing
Ticket number,Ticket number,0.0
Issue Date,Issue Date,0.006143
Issue time,Issue time,0.029601
Meter Id,Meter Id,73.991538
Marked Time,Marked Time,96.669739
RP State Plate,RP State Plate,0.008767
Plate Expiry Date,Plate Expiry Date,9.108706
VIN,VIN,99.813099
Make,Make,50.062606
Body Style,Body Style,0.101879


There are problems with the latitude and longitude values: they are not in standard coordinate format. For example, the record with ticket id 1109139006 should have a latitude and longitude of 34.156940, -118.435250, but it has 6467477, 1880027 instead.


In [33]:
# Peek at the first rows of the two coordinate columns.
df.loc[:, ['Latitude', 'Longitude']].head(5)

Unnamed: 0,Latitude,Longitude
0,,
1,,
2,6439997.9,1802686.4
3,6440041.1,1802686.2
4,,


# Label analysis

In [10]:
# Per-feature statistics, read from the working directory.
df_features = pd.read_csv(filepath_or_buffer='feature_analysis.csv')

In [11]:
# The 50 features with the highest model importance, most important first.
df_features.sort_values(by='model_feature_importance', ascending=False).head(50)


Unnamed: 0,columns,intercept,model_feature_importance,p_value,r_value,slope,std_err
119,Fine amount,0.498704,0.021993,0.509448,0.001181499,1.849256e-05,2.803185e-05
120,plate_expiration_diff_ts,0.49956,0.020995,0.257164,-0.002029384,-3.013163e-22,2.6591690000000003e-22
89,ticket_dow_0.0,0.49971,0.017412,0.464613,0.001309687,0.001770736,0.002421444
77,ticket_month_1.0,0.50033,0.017367,0.241517,-0.002097592,-0.003667722,0.003131575
73,ticket_year_2016,0.500465,0.017072,0.370883,-0.001602597,-0.001844952,0.002061809
74,ticket_year_2017,0.499632,0.01691,0.486821,0.001245395,0.001422203,0.002045231
12,color_WT,0.500188,0.016517,0.681058,-0.0007361242,-0.0009082704,0.002209797
7,color_GY,0.500446,0.016225,0.310632,-0.001815858,-0.002294626,0.002263172
72,ticket_year_2015,0.500049,0.016197,0.924127,-0.0001705662,-0.000197176,0.002070377
86,ticket_month_10.0,0.49957,0.016158,0.109555,0.002865898,0.005204541,0.003252432


In [33]:
from sklearn.preprocessing import StandardScaler

# Keep only tickets whose make is one of the 25 most frequent. The explicit
# .copy() makes df_labels an independent frame, so the in-place scaling below
# writes to it directly rather than to a slice of the unfiltered data
# (this avoids pandas' SettingWithCopyWarning).
df_labels = pd.read_csv('label_analysis.csv')
df_labels = df_labels[df_labels['Make'].isin(top_25_makes)].copy()
df_counts = df_labels['Make'].value_counts()

# The 8 features with the highest model importance are used to compare makes.
relevant_features = df_features.sort_values('model_feature_importance', ascending = False)['columns'][:8].tolist()

# Standardise each selected feature on its own (a fresh scaler per column).
for i in relevant_features:
    scaler = StandardScaler()
    # fit_transform needs a 2-D input and returns a 2-D array; ravel() flattens
    # the single output column back to 1-D for the column assignment.
    df_labels.loc[:, i] = scaler.fit_transform(df_labels[i].values.reshape(-1, 1)).ravel()

# Mean of every standardised feature per make.
df_group = df_labels[['Make'] + relevant_features].groupby('Make').mean()



In [34]:
# First five rows of the filtered label frame.
df_labels.head(n=5)

Unnamed: 0.1,Unnamed: 0,rp_state_plate_CA,rp_state_plate_dummy_replacement_value,color_BK,color_BL,color_BN,color_GN,color_GO,color_GY,color_MR,...,ticket_hour_of_day_18,ticket_hour_of_day_19,ticket_hour_of_day_20,ticket_hour_of_day_21,ticket_hour_of_day_22,ticket_hour_of_day_23,ticket_hour_of_day_dummy_replacement_value,Fine amount,plate_expiration_diff_ts,Make
4,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.49063,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.222184,0.435373,CHEV
10,10,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.49063,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.712441,-2.296883,FORD
11,11,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.49063,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.089357,0.435373,CHRY
14,14,1.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.49063,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.406042,0.435373,TOYO
18,18,1.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.49063,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.222184,0.435373,CHEV


In [35]:
from sklearn.cluster import KMeans

# Group the makes into 5 clusters using the per-make feature means in df_group.
# random_state is fixed because k-means starts from randomly chosen centroids:
# without a seed the cluster ids (and possibly the memberships) change between
# runs, so the tables shown below could not be reproduced.
# n_init is passed explicitly because its default differs between
# scikit-learn releases.
cluster_alg = KMeans(n_clusters = 5, n_init = 10, random_state = 0)
df_group['cluster'] = cluster_alg.fit_predict(df_group[relevant_features])
df_group.sort_values('cluster')


Unnamed: 0_level_0,Fine amount,plate_expiration_diff_ts,ticket_dow_0.0,ticket_month_1.0,ticket_year_2016,ticket_year_2017,color_WT,color_GY,cluster
Make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TOYO,-0.020102,-0.00245,0.000628,0.009889,0.00329,0.002969,0.000856,-0.001646,0
MITS,-0.011268,-0.007986,-0.018444,0.011507,-0.004987,-0.002699,-0.00151,0.001727,0
ACUR,0.001659,-0.003236,-0.010861,-0.004134,0.003901,-0.004398,0.009383,-0.001858,1
TOYT,0.002887,0.002136,0.000788,0.000799,0.001455,-0.000368,0.001328,-0.000804,1
NISS,-0.001246,-0.000193,-0.003184,0.002242,0.003781,-0.004878,-0.000195,-0.006371,1
MERZ,0.005097,-0.000975,0.001249,0.001863,-0.003431,0.005561,-0.002376,0.005469,1
MAZD,-0.003245,0.010208,0.006148,0.001513,0.003316,0.000632,0.003205,0.002774,1
LEXS,0.001537,-0.001849,0.002649,-0.006326,-0.001285,-0.001155,-0.000836,0.008504,1
KIA,-0.00315,0.002128,-0.005024,-0.001963,-0.005705,0.003634,0.002786,-0.002084,1
VOLK,0.000736,-0.004793,-5.5e-05,0.0042,0.000859,0.003533,0.004724,-0.001782,1


In [36]:
# Average feature profile of each cluster.
df_group.groupby(by='cluster').agg('mean')
        

Unnamed: 0_level_0,Fine amount,plate_expiration_diff_ts,ticket_dow_0.0,ticket_month_1.0,ticket_year_2016,ticket_year_2017,color_WT,color_GY
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,-0.015685,-0.005218,-0.008908,0.010698,-0.000849,0.000135,-0.000327,4.1e-05
1,0.000205,6.6e-05,-0.000784,-7.8e-05,-0.000782,0.000962,0.001427,0.000162
2,-0.003291,0.000912,0.001227,-0.004777,0.004296,-0.00571,-0.005998,0.000924
3,-0.004921,-0.005114,0.009486,-0.001328,-0.014128,0.003013,-0.006962,0.009214
4,0.009442,-0.006968,0.00677,-0.007485,-0.007445,-0.001617,0.003217,0.002346


# Features

There are three general types of features: location, car characteristics, and ticket/violation characteristics.

The best location features are the Route and ticket location density (lat_long_outlier_score). 

The best car-characteristic features are the body style and the car color.

The best violation features are the fine amount and the date/time information about the violation. Fine amount is important because it captures other violation information and is correlated with the violation description features.

In [11]:
# Work on a copy of the raw tickets: drop rows with no make, then add the
# binary label (1 when the make is one of the 25 most frequent, else 0).
df_copy = (
    df.copy()
    .dropna(subset=['Make'])
    .assign(target=lambda frame: frame['Make'].isin(top_25_makes).astype(int))
)

def pad_int(num, l = 4):
    """Format ``num`` as an integer string left-padded with '0' to width ``l``.

    ``num`` is converted with ``int(float(num))``, so floats and numeric
    strings are accepted and any fractional part is truncated. Results that
    are already ``l`` characters or longer are returned unpadded. Missing
    values (as judged by ``pd.isna``) yield ``None``.
    """
    if pd.isna(num):
        return None
    digits = str(int(float(num)))
    return digits.rjust(l, '0')

# Calendar features derived from the issue date. errors='coerce' turns
# unparseable dates into NaT, so the derived columns are NaN for those rows.
df_copy['ticket_dt'] = pd.to_datetime(df_copy['Issue Date'], errors='coerce')
df_copy['ticket_year'] = df_copy['ticket_dt'].dt.year
df_copy['ticket_month'] = df_copy['ticket_dt'].dt.month
df_copy['ticket_dow'] = df_copy['ticket_dt'].dt.dayofweek

# Hour of day = first two characters of the zero-padded issue time
# (assumes 'Issue time' is encoded as HHMM -- TODO confirm).
# pad_int returns None for missing times; slicing through .str keeps those
# rows missing. The earlier .astype(str) converted None to the string 'None',
# which the slice then turned into a bogus 'No' hour category.
df_copy['ticket_hour_of_day'] = df_copy['Issue time'].apply(pad_int).str[:2]


In [12]:
def get_cat_info_about_column(df, col_name, min_perc = .01):
    """Summarise how the ``target`` column varies across values of ``col_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name`` and a ``target`` column.
    col_name : str
        Name of the categorical column to analyse.
    min_perc : float, default .01
        Minimum share of the non-null rows of ``df`` that a value must
        account for to be included in the summary.

    Returns
    -------
    pandas.DataFrame
        One row per retained value with the columns ``col_name``,
        ``mean_target`` (mean of ``target`` for that value) and
        ``perc_of_values`` (share of the value among the retained rows),
        sorted by ``perc_of_values`` in descending order.
    """
    # Operate on the frame that was passed in. The earlier version ignored
    # ``df`` and always read the notebook-global ``df_copy``.
    shares = df[col_name].value_counts(normalize = True)
    valid_values = shares[shares >= min_perc].index
    df_valid = df[df[col_name].isin(valid_values)]

    mean_df = (
        df_valid.groupby(col_name)['target']
        .mean()
        .rename('mean_target')
        .reset_index()
    )
    # rename_axis + reset_index(name=...) yields the same two column names
    # regardless of how the installed pandas labels value_counts() output.
    count_df = (
        df_valid[col_name]
        .value_counts(normalize = True)
        .rename_axis(col_name)
        .reset_index(name = 'perc_of_values')
    )

    output = mean_df.merge(count_df, on = col_name)
    return output.sort_values('perc_of_values', ascending = False)
    

In [13]:
# A lower threshold (0.1%) is used here than in the other category summaries.
get_cat_info_about_column(df=df_copy, col_name='RP State Plate', min_perc=0.001)



Unnamed: 0,RP State Plate,mean_target,perc_of_values
1,CA,0.914914,0.94341
0,AZ,0.939867,0.009045
19,TX,0.923509,0.006372
12,NV,0.938159,0.006059
3,FL,0.908506,0.004441
22,WA,0.936398,0.00402
5,IL,0.904017,0.002899
16,OR,0.926637,0.002807
2,CO,0.939125,0.002566
13,NY,0.909765,0.001911


In [14]:
# Target rate per route, keeping routes with at least 1% of the rows.
get_cat_info_about_column(df=df_copy, col_name='Route', min_perc=0.01)


Unnamed: 0,Route,mean_target,perc_of_values
8,600,0.906531,0.292103
7,500,0.915932,0.205643
5,402,0.931876,0.099345
4,401,0.924408,0.09667
6,403,0.935765,0.074387
3,315,0.917117,0.063738
1,111,0.904189,0.061456
0,107,0.91493,0.056316
2,309,0.922919,0.050343


In [15]:
# Target rate per body style, keeping styles with at least 1% of the rows.
get_cat_info_about_column(df=df_copy, col_name='Body Style', min_perc=0.01)


Unnamed: 0,Body Style,mean_target,perc_of_values
1,PA,0.927128,0.896611
2,PU,0.992731,0.034937
4,VN,0.925558,0.027223
3,TK,0.686678,0.024821
0,CM,0.651436,0.016408


In [16]:
# Target rate per hour of day, keeping hours with at least 1% of the rows.
get_cat_info_about_column(df=df_copy, col_name='ticket_hour_of_day', min_perc=0.01)

Unnamed: 0,ticket_hour_of_day,mean_target,perc_of_values
10,12,0.916145,0.129633
6,8,0.92877,0.126963
8,10,0.917084,0.123134
9,11,0.904551,0.08758
7,9,0.906974,0.063702
11,13,0.904565,0.063257
14,16,0.904276,0.053326
12,14,0.900727,0.048345
16,18,0.922039,0.038526
15,17,0.911528,0.037841


In [17]:
# get_cat_info_about_column(df_copy, 'ticket_month', min_perc = .01)


In [18]:
# Target rate per day of week, keeping days with at least 1% of the rows.
get_cat_info_about_column(df=df_copy, col_name='ticket_dow', min_perc=0.01)


Unnamed: 0,ticket_dow,mean_target,perc_of_values
1,1,0.912119,0.194048
3,3,0.915282,0.187689
2,2,0.915017,0.185382
0,0,0.912202,0.163238
4,4,0.914962,0.162515
5,5,0.920945,0.056258
6,6,0.923178,0.05087


In [19]:
# Pairwise correlations between the numeric/boolean columns of df_labels.
# Non-numeric columns (such as the string 'Make' label) are dropped explicitly:
# older pandas discarded them silently inside corr(), whereas pandas >= 2.0
# raises when it meets them.
df_corr = df_labels.select_dtypes(include = ['number', 'bool']).corr()


In [20]:
# Features whose correlation with 'Fine amount' exceeds 0.1 in absolute value.
fine_corr = df_corr['Fine amount']
df_corr[fine_corr.abs() > .1].sort_values(by='Fine amount')


Unnamed: 0.1,Unnamed: 0,rp_state_plate_CA,rp_state_plate_dummy_replacement_value,color_BK,color_BL,color_BN,color_GN,color_GO,color_GY,color_MR,...,ticket_hour_of_day_17,ticket_hour_of_day_18,ticket_hour_of_day_19,ticket_hour_of_day_20,ticket_hour_of_day_21,ticket_hour_of_day_22,ticket_hour_of_day_23,ticket_hour_of_day_dummy_replacement_value,Fine amount,plate_expiration_diff_ts
violation_code_5204A-,0.00085,0.067625,-0.067625,0.001434,0.005625,-0.020215,0.019858,0.010108,-0.005932,0.00986,...,0.011208,-0.007639,-0.009475,-0.008594,-0.00506,-0.002042,-0.002146,0.013127,-0.359311,0.03376
violation_desc_DISPLAY OF TABS,0.00085,0.067625,-0.067625,0.001434,0.005625,-0.020215,0.019858,0.010108,-0.005932,0.00986,...,0.011208,-0.007639,-0.009475,-0.008594,-0.00506,-0.002042,-0.002146,0.013127,-0.359311,0.03376
violation_code_5200,0.001174,0.043663,-0.043663,0.032761,-0.003209,-0.015704,-0.009995,-0.006869,-0.000933,-0.006351,...,0.020013,0.014016,0.017879,0.003898,0.001219,0.003794,0.004466,0.009727,-0.23536,0.013699
violation_desc_DISPLAY OF PLATES,0.001174,0.043663,-0.043663,0.032761,-0.003209,-0.015704,-0.009995,-0.006869,-0.000933,-0.006351,...,0.020013,0.014016,0.017879,0.003898,0.001219,0.003794,0.004466,0.009727,-0.23536,0.013699
violation_code_88.13B+,0.000272,-0.017319,0.017319,0.037732,-0.006868,-0.026866,-0.021376,-0.01522,0.007799,-0.013957,...,0.126604,0.047035,0.115894,-0.047497,-0.046755,-0.051226,-0.042407,-0.05938,-0.102341,-0.001926
violation_desc_METER EXP.,0.000272,-0.017319,0.017319,0.037732,-0.006868,-0.026866,-0.021376,-0.01522,0.007799,-0.013957,...,0.126604,0.047035,0.115894,-0.047497,-0.046755,-0.051226,-0.042407,-0.05938,-0.102341,-0.001926
violation_code_dummy_replacement_value,0.001037,-0.000592,0.000592,-0.019691,0.00064,-0.004275,-0.017782,0.003598,-0.018815,-0.0097,...,0.01835,-0.001696,-0.000986,0.024745,0.032396,0.028403,0.02989,0.048624,0.17095,0.016831
violation_code_80.56E4+,-0.000532,0.007586,-0.007586,-0.029748,-0.017079,0.192555,-0.001236,-0.003901,-0.027349,-0.002045,...,-0.007078,-0.020202,-0.008541,0.005106,0.012033,0.013871,0.012323,0.050498,0.194697,-0.081224
violation_desc_RED ZONE,-0.000319,0.007672,-0.007672,-0.029292,-0.016353,0.18474,-0.004084,-0.003688,-0.026498,-0.003813,...,-0.007292,-0.019758,-0.008557,0.006652,0.014417,0.014794,0.013833,0.055329,0.201302,-0.075618
violation_desc_dummy_replacement_value,0.001081,-0.001228,0.001228,-0.019449,-0.000366,-0.003133,-0.013458,0.002873,-0.017922,-0.006376,...,0.026528,0.000843,-0.006071,0.018253,0.025385,0.01573,0.020026,0.035758,0.253882,0.012552


In [21]:
# Features whose correlation with 'plate_expiration_diff_ts' exceeds 0.1 in
# absolute value.
plate_corr = df_corr['plate_expiration_diff_ts']
df_corr[plate_corr.abs() > .1].sort_values(by='plate_expiration_diff_ts')

Unnamed: 0.1,Unnamed: 0,rp_state_plate_CA,rp_state_plate_dummy_replacement_value,color_BK,color_BL,color_BN,color_GN,color_GO,color_GY,color_MR,...,ticket_hour_of_day_17,ticket_hour_of_day_18,ticket_hour_of_day_19,ticket_hour_of_day_20,ticket_hour_of_day_21,ticket_hour_of_day_22,ticket_hour_of_day_23,ticket_hour_of_day_dummy_replacement_value,Fine amount,plate_expiration_diff_ts
color_BN,0.000476,0.022457,-0.022457,-0.074064,-0.041042,1.0,-0.024349,-0.016344,-0.069358,-0.015561,...,-0.002933,-0.015942,-0.017472,-0.014116,-0.012526,-0.014147,-0.011199,-0.012671,0.045617,-0.162739
body_style_CM,0.001341,0.011131,-0.011131,-0.055906,-0.023681,0.277294,-0.005991,-0.011964,-0.054591,-0.009842,...,0.005405,-0.010166,-0.010894,-0.005762,-0.004348,-0.001482,-0.004111,-0.008826,0.047884,-0.123776
body_style_PA,0.000359,-0.010378,0.010378,0.09472,0.02383,-0.215208,-0.005252,0.011019,0.083599,0.003878,...,0.016368,0.032061,0.03474,0.020061,0.013056,0.011692,0.00816,-0.007686,-0.051081,0.111827
plate_expiration_diff_ts,-0.000687,0.076915,-0.076915,0.003964,0.017584,-0.162739,0.012109,0.014463,0.025608,0.008962,...,0.011284,0.013252,0.012103,0.013343,0.008386,0.01188,0.010003,0.010999,-0.031614,1.0


In [None]:
# 

# Evaluation

In [30]:
# Confusion-matrix counts (true/false negatives and positives). They are
# entered by hand; the code that produced them is not part of this notebook.
tn = 19580
fp = 17765
fn = 216823
tp = 181587
total_num = sum((tn, fp, fn, tp))

# Precision: share of predicted positives that are correct.
# Recall: share of actual positives that were found.
predicted_positive = tp + fp
actual_positive = tp + fn
precision = tp / predicted_positive
recall = tp / actual_positive

# F1 is the harmonic mean of precision and recall.
f1 = (2 * (precision * recall)) / (precision + recall)


In [31]:
# Confusion matrix expressed as a share of all evaluated rows
# (rows = actual class, columns = predicted class).
confusion_counts = pd.DataFrame(
    [[tn, fp], [fn, tp]],
    index=['actual 0', 'actual 1'],
    columns=['predicted 0', 'predicted 1'],
)
eval_df = confusion_counts / total_num
eval_df

Unnamed: 0,predicted 0,predicted 1
actual 0,0.044934,0.040768
actual 1,0.49758,0.416718


In [32]:
# Display the three summary metrics together as a tuple.
(precision, recall, f1)

(0.9108862715197239, 0.4557792224090761, 0.6075561845684404)

There is some predictive power to this model; however, it will frequently predict incorrectly that a top make is not a top make (recall is only about 0.46).