In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import numpy as np
replacement_value = 'dummy_repalcement_value'
path = '/home/td/Documents'

# Problem structure

One way to set up the problem is to make it a binary classification task. This has the advantages of simplicity but it makes the assumption that large Makes share characteristics which is not always true. In this pass I solved the problem this way. 

A more thorough way to set up the problem would be to predict the make or a group of makes sharing similar characteristics and back into the probability. 

# Data problems

In [2]:
# Label problems
path = '/home/td/Documents'
df = pd.read_csv('{path}/tickets.csv'.format(path=path), low_memory=False)

top_25_makes = df['Make'].value_counts()[:25].index.tolist()
df['Make'].value_counts()[:25]

TOYT    721411
HOND    491961
FORD    382695
NISS    311324
CHEV    297076
BMW     199221
MERZ    177307
VOLK    149501
HYUN    133864
DODG    127764
LEXS    124508
KIA     101746
JEEP    100909
AUDI     84229
MAZD     79853
OTHR     72411
GMC      62391
CHRY     57317
INFI     56809
ACUR     52703
SUBA     46898
VOLV     42330
TOYO     40064
MITS     37842
CADI     34080
Name: Make, dtype: int64

Curation will be required, there is at least 1 duplicate with toyota and there are some invalid makes such as OTHR

In [3]:
df.loc[:,'Latitude'] = df.loc[:,'Latitude'].replace(99999.0, np.nan)
df.loc[:,'Longitude'] = df.loc[:,'Longitude'].replace(99999.0, np.nan)


In [4]:


percent_missing = df.isnull().sum()*100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
missing_value_df

Unnamed: 0,column_name,percent_missing
Ticket number,Ticket number,0.0
Issue Date,Issue Date,0.006143
Issue time,Issue time,0.029601
Meter Id,Meter Id,73.991538
Marked Time,Marked Time,96.669739
RP State Plate,RP State Plate,0.008767
Plate Expiry Date,Plate Expiry Date,9.108706
VIN,VIN,99.813099
Make,Make,50.062606
Body Style,Body Style,0.101879


There are problems with the latitudes and longitudes, they don't make to standard coordinates. For example: Record with ticket id 1109139006 should have latitude and longitude of 34.156940, -118.435250. It has 6467477, 1880027 instead. Provided lat and long don't seem to map to standard formats.


In [5]:
#TODO: check for differences in test and train set

# Label analysis

In [6]:
df_features = pd.read_csv('feature_analysis.csv')

In [7]:
df_features.sort_values('model_feature_importance', ascending = False)[:20]


Unnamed: 0,columns,intercept,model_feature_importance,p_value,r_value,slope,std_err
119,Fine amount,0.498704,0.027279,0.509448,0.001181,1.849256e-05,2.803185e-05
120,plate_expiration_diff_ts,0.49956,0.021914,0.257164,-0.002029,-3.013163e-22,2.6591690000000003e-22
74,ticket_year_2017,0.499632,0.021214,0.486821,0.001245,0.001422203,0.002045231
7,color_GY,0.500446,0.020335,0.310632,-0.001816,-0.002294626,0.002263172
72,ticket_year_2015,0.500049,0.019994,0.924127,-0.000171,-0.000197176,0.002070377
90,ticket_dow_1,0.499558,0.019343,0.31479,0.0018,0.002275849,0.002264023
93,ticket_dow_4,0.500748,0.018496,0.057241,-0.003405,-0.004623488,0.002431534
73,ticket_year_2016,0.500465,0.018306,0.370883,-0.001603,-0.001844952,0.002061809
89,ticket_dow_0,0.49971,0.018268,0.464613,0.00131,0.001770736,0.002421444
12,color_WT,0.500188,0.017575,0.681058,-0.000736,-0.0009082704,0.002209797


In [8]:
df_labels = pd.read_csv('label_analysis.csv')
df_labels = df_labels[df_labels['Make'].isin(top_25_makes)]
df_counts= df_labels['Make'].value_counts()
from sklearn.preprocessing import StandardScaler

relevant_features = ['Fine amount', 
                     'plate_expiration_diff_ts', 
                     'route_dummy_replacement_value',
                     'color_GY',
                     'color_BL',
                     'color_BK',
                     'color_WT',
                     'color_BL',
                     'color_SL',
                     'route_00600',
                     'route_00500']


for i in relevant_features:
    scaler = StandardScaler()
    df_labels.loc[:, i] =  scaler.fit_transform(df_labels[i].values.reshape(-1, 1))
df_labels


df_group = df_labels[['Make'] + relevant_features].groupby('Make').mean()



KeyError: 'lat_long_outlier_score'

In [None]:
from sklearn.cluster import KMeans


cluster_alg = KMeans(n_clusters = 2)
df_group['cluster'] = cluster_alg.fit_predict(df_group[relevant_features])
df_group.sort_values('cluster')


In [None]:
df_group.groupby('cluster').mean()
        

# Features

There are 3 general type of features: location, car characteristics and ticket/violation characteristics. 

The best location features are the Route and ticket location density (lat_long_outlier_score). 

The best car characteristics features are the Body type and the car color. 

The best violation features are the fine amount and datetime information about the violation. Fine amount is important as it captures other violation information and is correlated to violation description features.

In [None]:
df_copy = df.copy()
df_copy = df_copy.dropna(subset = ['Make'])
df_copy['target'] = df_copy['Make'].isin(top_25_makes).astype(int)

def pad_int(num, l = 4):
    if not pd.isna(num):
        num = int(float(num))
        num_str = str(num)
        while len(num_str) < l:
            num_str  = '0' + num_str
        return num_str

df_copy['ticket_dt'] = pd.to_datetime(df_copy.loc[:,'Issue Date'], errors='coerce')
df_copy['ticket_year'] = df_copy['ticket_dt'].dt.year
df_copy['ticket_month'] = df_copy['ticket_dt'].dt.month
df_copy['ticket_dow'] = df_copy['ticket_dt'].dt.dayofweek
df_copy['ticket_hour_of_day'] = df_copy['Issue time'].apply(lambda x: pad_int(x)).astype(str).str[:2]


In [None]:
def get_cat_info_about_column(df, col_name, min_perc = .01):
    valid_values = [i for  i, j in dict(df_copy[col_name].value_counts(normalize = True)).items() if j >= min_perc]
    df_valid = df_copy[df_copy[col_name].isin(valid_values)]
    mean_df =  df_valid.groupby([col_name])['target'].mean().to_frame()
    count_df = df_valid[col_name].value_counts(normalize = True).to_frame()
    count_df =  count_df.reset_index()
    count_df.columns = [col_name, 'perc_of_values']
    mean_df.columns = ['mean_target']
    mean_df =  mean_df.reset_index()
    output = mean_df.merge(count_df)
    return output.sort_values('perc_of_values', ascending = False)
    

In [None]:
get_cat_info_about_column(df_copy, 'RP State Plate', min_perc = .001)


In [None]:
get_cat_info_about_column(df_copy, 'Route', min_perc = .01)


In [None]:
get_cat_info_about_column(df_copy, 'Body Style', min_perc = .01)


In [None]:
get_cat_info_about_column(df_copy, 'ticket_hour_of_day', min_perc = .01)

In [None]:
# get_cat_info_about_column(df_copy, 'ticket_month', min_perc = .01)


In [None]:
get_cat_info_about_column(df_copy, 'ticket_dow', min_perc = .01)


In [None]:
df_corr = df_labels.corr()



In [None]:

df_corr[(df_corr['Fine amount'] > .1)|(df_corr['Fine amount'] < -.1)].sort_values('Fine amount')

In [None]:
df_corr[(df_corr['plate_expiration_diff_ts'] > .1)|(df_corr['plate_expiration_diff_ts'] < -.1)].sort_values('plate_expiration_diff_ts')

# Evaluation

In [13]:
tn, fp, fn, tp = 24072, 269106, 13273, 129304
total_num = tn + fp + fn + tp
precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1_score = 2 * (precision*recall)/(precision + recall)

In [14]:
eval_df = pd.DataFrame(data = [[tp/total_num, fp/total_num], [tn/total_num, fn/total_num]],
                      columns = [1, 0],
                      index = [1, 0])
eval_df

Unnamed: 0,1,0
1,0.296736,0.617563
0,0.055242,0.03046
