In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training tickets. The file is decoded as windows-1252, and
# low_memory=False makes pandas parse each column in one pass rather than in chunks.
train_df = pd.read_csv(
    'train.csv',
    encoding='windows-1252',
    low_memory=False,
)
# Load the test tickets with pandas' default reader settings.
test_df = pd.read_csv('test.csv')

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,ticket_id,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,violation_zip_code,mailing_address_str_number,mailing_address_str_name,city,...,clean_up_cost,judgment_amount,payment_amount,balance_due,payment_date,payment_status,collection_status,grafitti_status,compliance_detail,compliance
0,22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INVESTMENT INC., MIDWEST MORTGAGE",2900.0,TYLER,,3.0,S. WICKER,CHICAGO,...,0.0,305.0,0.0,305.0,,NO PAYMENT APPLIED,,,non-compliant by no payment,0.0
1,27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Michigan, Covenant House",4311.0,CENTRAL,,2959.0,Martin Luther King,Detroit,...,0.0,855.0,780.0,75.0,2005-06-02 00:00:00,PAID IN FULL,,,compliant by late payment within 1 month,1.0
2,22062,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","SANDERS, DERRON",1449.0,LONGFELLOW,,23658.0,P.O. BOX,DETROIT,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,
3,22084,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","MOROSI, MIKE",1441.0,LONGFELLOW,,5.0,ST. CLAIR,DETROIT,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,
4,22093,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","NATHANIEL, NEAL",2449.0,CHURCHILL,,7449.0,CHURCHILL,DETROIT,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,


In [3]:
# Compare the (rows, columns) dimensions of the train and test frames.
(train_df.shape, test_df.shape)

((250306, 34), (61001, 27))

<h2>Train/test ticket IDs</h2>

In [4]:
# Keep the ticket identifiers of both frames; 'ticket_id' is listed among the
# columns dropped from the frames in the cleaning step.
train_ticket_id = train_df['ticket_id']
test_ticket_id = test_df['ticket_id']

<h2>Initial Cleaning</h2>
<br>
For the assignment, all violators that were found not responsible are not being considered in this evaluation.
    <ul><li>Violators that are found not responsible will have NaN in compliance</li></ul>
<br>
Extra columns were also provided in the training dataframe for reference, but they should not be considered in the evaluation either.

In [5]:
# Columns dropped from the training frame only.
extra_cols = [
    'payment_amount',
    'payment_date',
    'payment_status',
    'balance_due',
    'collection_status',
    'compliance_detail',
]
# Columns dropped from both the train and the test frame.
useless_cols = [
    'ticket_id',
    'violation_street_name',
    'mailing_address_str_name',
    'country',
    'violation_description',
    'inspector_name',
    'violator_name',
    'grafitti_status',
    'non_us_str_code',
]

train_df = (
    train_df
    .drop(columns=extra_cols)
    .drop(columns=useless_cols)
    # Rows without a compliance label are removed.
    .dropna(subset=['compliance'])
)

In [6]:
# dropping useless columns in test dataframe as well
# Remove the same set of unused columns from the test frame.
test_df = test_df.drop(columns=useless_cols)

<h3>Counting distribution of target</h3>

In [9]:
# Number of rows per value of the target column.
train_df.loc[:, 'compliance'].value_counts()

0.0    148283
1.0     11597
Name: compliance, dtype: int64

<h3>Counting Nulls</h3>

In [13]:
# Missing values per column, largest count first.
# Many more hearing_date nulls were removed when the rows with a null target were dropped.
(
    train_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

violation_zip_code            159880
mailing_address_str_number      2558
hearing_date                     227
state                             84
zip_code                           1
compliance                         0
violation_street_number            0
city                               0
ticket_issued_date                 0
violation_code                     0
judgment_amount                    0
disposition                        0
fine_amount                        0
admin_fee                          0
state_fee                          0
late_fee                           0
discount_amount                    0
clean_up_cost                      0
agency_name                        0
dtype: int64

<h3>Column Types</h3>

In [15]:
# How many columns there are of each dtype.
(
    train_df
    .dtypes
    .value_counts()
)

float64    11
object      8
dtype: int64

<h3>Counts of unique values for categorical columns</h3>

In [17]:
# Distinct non-null values in every object-typed column, highest cardinality first.
(
    train_df
    .select_dtypes(include='object')
    .nunique()
    .sort_values(ascending=False)
)

ticket_issued_date    68097
hearing_date           5970
city                   4093
zip_code               3498
violation_code          189
state                    59
agency_name               5
disposition               4
dtype: int64

<h3>Dealing with dates</h3>
<br>Extract the date part of the datetime columns.
<br>Get the difference between ticket_issued_date and hearing_date (use this for transformation).

In [58]:
from datetime import datetime  # NOTE(review): unused in this cell; kept in case later cells rely on it

# Whole days between the ticket being issued and its hearing.
# pd.to_datetime is used instead of astype('datetime64'): the unit-less
# 'datetime64' cast raises a TypeError on pandas 2.x, while to_datetime parses
# the string columns on old and new pandas alike (missing values become NaT).
train_df['hearing_issue_date_diff'] = (
    pd.to_datetime(train_df['hearing_date'])
    - pd.to_datetime(train_df['ticket_issued_date'])
).dt.days

# Keep only rows whose difference is non-negative. Rows with a missing
# difference (e.g. no hearing_date) compare as False and are dropped too.
train_df = train_df[train_df['hearing_issue_date_diff'] >= 0]

<h3>Dropping location columns for the sake of simplicity.
Dropping violation_code for now and get baseline results. May add it back into columns if needed.</h3>

For violation_code, maybe only include violations whose number of occurrences is above the mean count.

In [64]:
# Drop the raw date columns, the location columns and violation_code
# before building the baseline feature set.
train_df = train_df.drop(
    columns=[
        'ticket_issued_date',
        'hearing_date',
        'city',
        'zip_code',
        'violation_code',
        'state',
    ]
)

In [75]:
# One-hot encode the remaining categorical columns; numeric columns pass through unchanged.
train_df = pd.get_dummies(data=train_df)

# Preview the encoded frame.
train_df.head(n=5)

Unnamed: 0,violation_street_number,violation_zip_code,mailing_address_str_number,fine_amount,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,...,hearing_issue_date_diff,"agency_name_Buildings, Safety Engineering & Env Department",agency_name_Department of Public Works,agency_name_Detroit Police Department,agency_name_Health Department,agency_name_Neighborhood City Halls,disposition_Responsible (Fine Waived) by Deter,disposition_Responsible by Admission,disposition_Responsible by Default,disposition_Responsible by Determination
0,2900.0,,3.0,250.0,20.0,10.0,25.0,0.0,0.0,305.0,...,369,1,0,0,0,0,0,0,1,0
1,4311.0,,2959.0,750.0,20.0,10.0,75.0,0.0,0.0,855.0,...,378,1,0,0,0,0,0,0,0,1
5,6478.0,,2755.0,250.0,20.0,10.0,25.0,0.0,0.0,305.0,...,323,1,0,0,0,0,0,0,1,0
6,8027.0,,476.0,750.0,20.0,10.0,75.0,0.0,0.0,855.0,...,253,1,0,0,0,0,0,0,1,0
7,8228.0,,8228.0,100.0,20.0,10.0,10.0,0.0,0.0,140.0,...,251,1,0,0,0,0,0,0,1,0


In [None]:
# TODO: apply the same date-difference, column-drop and one-hot-encoding steps to test_df so its columns match train_df