# Final Checks for model

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(f"D:/Docs/train_1.csv", encoding='mac_roman')
#tickets = pd.read_csv(f"D:/SYED/Docs/train.csv", encoding='mac_roman')

## 1. Use ONLY compliance available columns

In [4]:
df = df[df['compliance'].notna()]

In [5]:
df.shape

(159880, 11)

In [8]:
#Make paid in full also as compliant
df.loc[df['payment_status'] == 'PAID IN FULL', 'compliance'] = 1
df['payment_status'].value_counts()

NO PAYMENT APPLIED         120269
PAID IN FULL                30429
PARTIAL PAYMENT APPLIED      9182
Name: payment_status, dtype: int64

## 2. Encode payment status

In [9]:
# Need to encode the text values as numbers as well
df['payment_status'].replace(['NO PAYMENT APPLIED', 'PAID IN FULL', 'PARTIAL PAYMENT APPLIED'], [0, 1, 2], inplace=True)
df['payment_status'].value_counts()


0    120269
1     30429
2      9182
Name: payment_status, dtype: int64

In [10]:
df.head(3)

Unnamed: 0,ticket_id,city,ticket_date,disposition,fine_amount,late_fee,judgment_amount,balance_due,payment_status,compliance,date
0,22056,CHICAGO,16-03-04,1,250.0,25.0,305.0,305.0,0,0.0,NaT
1,27586,Detroit,23-04-04,1,750.0,75.0,855.0,75.0,1,1.0,NaT
5,22046,LOG BEACH,01-05-04,1,250.0,25.0,305.0,305.0,0,0.0,NaT


## 4. Encode city values > 300

In [11]:
df = df.groupby('city').filter(lambda x : len(x) > 300)
#df['city'].value_counts()
#counts = df.city.value_counts()
#counts.to_csv('city100plus.csv')

In [12]:
df['fine_amount'] = df['fine_amount'].fillna(0)
df.shape

(130320, 11)

In [13]:
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder()
df["city_code"] = ord_enc.fit_transform(df[["city"]])
df[["city", "city_code"]].head(11)

Unnamed: 0,city,city_code
0,CHICAGO,4.0
1,Detroit,6.0
7,Detroit,6.0
8,Detroit,6.0
9,Detroit,6.0
12,Detroit,6.0
13,Detroit,6.0
14,Detroit,6.0
18,Detroit,6.0
19,Detroit,6.0


In [14]:
df.head(5)

Unnamed: 0,ticket_id,city,ticket_date,disposition,fine_amount,late_fee,judgment_amount,balance_due,payment_status,compliance,date,city_code
0,22056,CHICAGO,16-03-04,1,250.0,25.0,305.0,305.0,0,0.0,NaT,4.0
1,27586,Detroit,23-04-04,1,750.0,75.0,855.0,75.0,1,1.0,NaT,6.0
7,18735,Detroit,16-06-04,1,100.0,10.0,140.0,140.0,0,0.0,NaT,6.0
8,18733,Detroit,16-06-04,1,100.0,10.0,140.0,140.0,0,0.0,NaT,6.0
9,28204,Detroit,12-07-04,1,750.0,75.0,855.0,855.0,0,0.0,NaT,6.0


## 5. Drop useless columns

In [16]:
df = df.drop(labels=['date', 'ticket_date', 'disposition'], axis=1)
df.head(5)

Unnamed: 0,ticket_id,city,fine_amount,late_fee,judgment_amount,balance_due,payment_status,compliance,city_code
0,22056,CHICAGO,250.0,25.0,305.0,305.0,0,0.0,4.0
1,27586,Detroit,750.0,75.0,855.0,75.0,1,1.0,6.0
7,18735,Detroit,100.0,10.0,140.0,140.0,0,0.0,6.0
8,18733,Detroit,100.0,10.0,140.0,140.0,0,0.0,6.0
9,28204,Detroit,750.0,75.0,855.0,855.0,0,0.0,6.0


## 6. Build the actual model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#Feature selection is very interesting phase here where by own intuition I have to select the features
#which might give best fit for data at hand

feature_names_tickets = ['ticket_id', 'fine_amount', 'city_code']
X_tickets = df[feature_names_tickets]
#Our target variable is compliance i.e., ticket chances of getting PAID
y_tickets = df['compliance']
#X_train, X_test, y_train, y_test = train_test_split(X_tickets, y_tickets, random_state = 0)
clf = LogisticRegression(C=100).fit(X_tickets, y_tickets)

## 7. Check ROC / AUC

In [24]:
# First we need to load our test dataset
df1 = pd.read_csv(f"D:/Docs/test_1.csv", encoding='mac_roman')

In [25]:
#Filter for cities encoding above 300 count
df1 = df1.groupby('city').filter(lambda x : len(x) > 300)
counts = df1.city.value_counts()
counts.to_csv('city_test_300plus.csv')
#Remove any NAN from fine_amount
df1['fine_amount'] = df1['fine_amount'].fillna(0)

#Encode cities
from sklearn.preprocessing import OrdinalEncoder
ord_enc = OrdinalEncoder()
df1["city_code"] = ord_enc.fit_transform(df1[["city"]])
df1[["city", "city_code"]].head(4)

Unnamed: 0,city,city_code
0,Detroit,4.0
1,Detroit,4.0
2,Detroit,4.0
4,LIVONIA,6.0


In [26]:
df1.head(5)

Unnamed: 0,ticket_id,city,fine_amount,city_code
0,284932,Detroit,200,4.0
1,285362,Detroit,1000,4.0
2,285361,Detroit,100,4.0
4,285346,LIVONIA,100,6.0
5,285345,LIVONIA,200,6.0


In [27]:
feature_names_test = ['ticket_id', 'fine_amount', 'city_code']
X_test = df[feature_names_test]

In [28]:
from sklearn.metrics import roc_curve, auc
y_score_lr = clf.decision_function(X_test)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr)
roc_auc_lr = auc(fpr_lr, tpr_lr)

plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr_lr, tpr_lr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc_lr))
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve (1-of-10 digits classifier)', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
plt.axes().set_aspect('equal')
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [32580, 130320]