# Classify case disposition

We classify the subset of the data we have into two buckets, and predict whether a test case would end up in a disposition of conviction or aquittal

## Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [2]:
dataset1 = pd.read_csv("/kaggle/input/classif-data/final_case_1.csv")
dataset2 = pd.read_csv("/kaggle/input/classif-data/final_case_2.csv")
dataset3 = pd.read_csv("/kaggle/input/classif-data/final_case_3.csv")


dataset = pd.concat([dataset1,dataset2,dataset3])

In [3]:
dataset["case_time"] = (pd.to_datetime(dataset["date_of_decision"]) - pd.to_datetime(dataset["date_of_filing"])).dt.days

In [4]:
dataset["judge_experience"] = (pd.to_datetime(dataset["date_of_decision"]) - pd.to_datetime(dataset["start_date"])).dt.days

In [5]:
dataset

Unnamed: 0.1,Unnamed: 0,ddl_case_id,year,judge_position_x,female_defendant,female_adv_def,disp_name,date_of_filing,date_of_decision,disp_name_s,ddl_decision_judge_id,ddl_judge_id,female_judge,judge_position_y,start_date,end_date,case_time,judge_experience
0,0,01-01-01-203008000082010,2010,chief judicial magistrate,0 male,0,19,2010-01-12,2015-01-13,convicted,3.0,3,0 nonfemale,chief judicial magistrate,2014-02-21,2016-05-31,1827,326
1,1,01-01-01-203008002452010,2010,chief judicial magistrate,0 male,1,4,2010-08-18,2015-04-15,acquitted,3.0,3,0 nonfemale,chief judicial magistrate,2014-02-21,2016-05-31,1701,418
2,2,01-01-01-203008002972010,2010,chief judicial magistrate,0 male,0,4,2010-09-18,2015-03-25,acquitted,3.0,3,0 nonfemale,chief judicial magistrate,2014-02-21,2016-05-31,1649,397
3,3,01-01-01-203408000182010,2010,chief judicial magistrate,0 male,0,4,2010-01-06,2015-01-14,acquitted,3.0,3,0 nonfemale,chief judicial magistrate,2014-02-21,2016-05-31,1834,327
4,4,01-01-01-203408003302010,2010,chief judicial magistrate,0 male,0,4,2010-05-18,2015-09-14,acquitted,3.0,3,0 nonfemale,chief judicial magistrate,2014-02-21,2016-05-31,1945,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1590,1590,23-09-01-220600004292018,2018,district and sessions court,0 male,0,20,2018-05-14,2018-07-13,convicted,88654.0,88654,0 nonfemale,district and sessions court,2018-06-20,2018-10-13,60,23
1591,1591,23-09-01-220600006322018,2018,district and sessions court,0 male,0,20,2018-07-12,2018-08-14,convicted,88654.0,88654,0 nonfemale,district and sessions court,2018-06-20,2018-10-13,33,55
1592,1592,23-18-01-220600000242018,2018,district and sessions court,0 male,0,20,2018-01-29,2018-04-09,convicted,89475.0,89475,0 nonfemale,district and sessions court,2017-08-05,2018-05-22,70,247
1593,1593,23-29-01-220600000192018,2018,district and sessions court,0 male,0,20,2018-01-05,2018-04-12,convicted,90687.0,90687,0 nonfemale,district and sessions court,2018-02-01,2018-05-28,97,70


We now assign boolean values to the disposition values, so that they can be used for classification

In [6]:
def boolean_val(s):
    if s == "convicted":
         return 1
    else:
        return 0

In [7]:
dataset["disp_boolean"] = dataset.apply(lambda x : boolean_val(x["disp_name_s"]), axis = 1)

In [8]:
dataset.drop(["disp_name_s", "disp_name"], axis = 1, inplace = True)

dataset.drop(["Unnamed: 0"], axis=1, inplace = True)

# Training

We train with the random forest classification model because it provides additional randomess and uncorrelation in the model

In [9]:
numerical_cols = ["case_time", "judge_experience"]
categorical_cols = ["female_judge","female_defendant", "year", "female_adv_def", ]
label_cols = ["disp_boolean"]
categorical_preprocessor = OneHotEncoder(handle_unknown ='ignore')
numerical_preprocessor  = StandardScaler()

In [10]:
preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_cols),
    ('standard_scaler', numerical_preprocessor, numerical_cols),
])

In [11]:
model = make_pipeline(preprocessor, RandomForestClassifier(n_estimators = 100))

In [12]:
x = dataset[['female_judge', 'female_defendant', 'female_adv_def', 'year',
       'case_time', 'judge_experience']]
y = dataset['disp_boolean']

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [14]:
%%time
model.fit(x_train, y_train)

CPU times: user 22.3 s, sys: 190 ms, total: 22.5 s
Wall time: 22.5 s


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['female_judge',
                                                   'female_defendant', 'year',
                                                   'female_adv_def']),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  ['case_time',
                                                   'judge_experience'])])),
                ('randomforestclassifier', RandomForestClassifier())])

# Performance Metrics

## Score

In [15]:
model.score(x_test, y_test) * 100

82.22920958473382

As we can see. Our model scores a respectable 82% in accuracy. This can further be improved by increasing the number of parameters taken, and using more test data.

## Confusion Matrix

In [16]:
y_pred = model.predict(x_test)

In [17]:
confusion_matrix(y_test, y_pred)

array([[29593,  1272],
       [ 5284,   743]])

## Precision

In [18]:
def find_TP(y, y_hat):
   # counts the number of true positives (y = 1, y_hat = 1)
   return sum((y == 1) & (y_hat == 1))
def find_FN(y, y_hat):
   # counts the number of false negatives (y = 1, y_hat = 0) Type-II error
   return sum((y == 1) & (y_hat == 0))
def find_FP(y, y_hat):
   # counts the number of false positives (y = 0, y_hat = 1) Type-I error
   return sum((y == 0) & (y_hat == 1))
def find_TN(y, y_hat):
   # counts the number of true negatives (y = 0, y_hat = 0)
   return sum((y == 0) & (y_hat == 0))

In [19]:
TP = find_TP(y_test, y_pred)
FN = find_FN(y_test, y_pred)
FP = find_FP(y_test, y_pred)
TN = find_TN(y_test, y_pred)
print('TP:',TP)
print('FN:',FN)
print('FP:',FP)
print('TN:',TN)
precision = TP/(TP+FP)
print('Precision:',precision)

TP: 743
FN: 5284
FP: 1272
TN: 29593
Precision: 0.36873449131513647
