In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

# data processing

In [2]:
train_raw=pd.read_csv('train_auto.csv')
print(train_raw.info()) # first look at data
display(train_raw.head())
train_raw=train_raw.dropna() # very few NaN, so we drop all lines with NaN

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8161 entries, 0 to 8160
Data columns (total 26 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   INDEX        8161 non-null   int64  
 1   TARGET_FLAG  8161 non-null   int64  
 2   TARGET_AMT   8161 non-null   float64
 3   KIDSDRIV     8161 non-null   int64  
 4   AGE          8155 non-null   float64
 5   HOMEKIDS     8161 non-null   int64  
 6   YOJ          7707 non-null   float64
 7   INCOME       7716 non-null   object 
 8   PARENT1      8161 non-null   object 
 9   HOME_VAL     7697 non-null   object 
 10  MSTATUS      8161 non-null   object 
 11  SEX          8161 non-null   object 
 12  EDUCATION    8161 non-null   object 
 13  JOB          7635 non-null   object 
 14  TRAVTIME     8161 non-null   int64  
 15  CAR_USE      8161 non-null   object 
 16  BLUEBOOK     8161 non-null   object 
 17  TIF          8161 non-null   int64  
 18  CAR_TYPE     8161 non-null   object 
 19  RED_CA

Unnamed: 0,INDEX,TARGET_FLAG,TARGET_AMT,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,...,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CAR_AGE,URBANICITY
0,1,0,0.0,0,60.0,0,11.0,"$67,349",No,$0,...,"$14,230",11,Minivan,yes,"$4,461",2,No,3,18.0,Highly Urban/ Urban
1,2,0,0.0,0,43.0,0,11.0,"$91,449",No,"$257,252",...,"$14,940",1,Minivan,yes,$0,0,No,0,1.0,Highly Urban/ Urban
2,4,0,0.0,0,35.0,1,10.0,"$16,039",No,"$124,191",...,"$4,010",4,z_SUV,no,"$38,690",2,No,3,10.0,Highly Urban/ Urban
3,5,0,0.0,0,51.0,0,14.0,,No,"$306,251",...,"$15,440",7,Minivan,yes,$0,0,No,0,6.0,Highly Urban/ Urban
4,6,0,0.0,0,50.0,0,,"$114,986",No,"$243,925",...,"$18,000",1,z_SUV,no,"$19,217",2,Yes,3,17.0,Highly Urban/ Urban


Some quantitative variables seem to be strings, so we convert them into float

In [3]:
def dollar_to_float(x):
    ''' convert value into a float if it is a string '''
    if type(x) is str:
        dollar=x[1:].replace(',','') # delete the coma and dollar sign
        return float(dollar)
    else:
        return x

for column in ['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM']: # convert those columns into float
    train_raw[column]=train_raw[column].map(dollar_to_float)
train_raw.describe() # statistics of quantitative variables


Unnamed: 0,INDEX,TARGET_FLAG,TARGET_AMT,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,HOME_VAL,TRAVTIME,BLUEBOOK,TIF,OLDCLAIM,CLM_FREQ,MVR_PTS,CAR_AGE
count,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0,6045.0
mean,5090.02134,0.265012,1479.662715,0.173201,44.628453,0.743424,10.494624,58177.013234,150102.074607,33.694293,15235.609595,5.360298,4004.8756,0.784119,1.699752,7.920926
std,2980.105025,0.441376,4553.172055,0.515424,8.707805,1.13274,4.138508,43826.975075,123728.720923,15.892961,8040.962717,4.144664,8822.509329,1.153884,2.157899,5.58388
min,1.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,5.0,1500.0,1.0,0.0,0.0,0.0,-3.0
25%,2470.0,0.0,0.0,0.0,39.0,0.0,9.0,26748.0,0.0,23.0,9170.0,1.0,0.0,0.0,0.0,1.0
50%,5060.0,0.0,0.0,0.0,45.0,0.0,11.0,51624.0,159152.0,33.0,14080.0,4.0,0.0,0.0,1.0,8.0
75%,7667.0,1.0,1037.0,0.0,51.0,1.0,13.0,81287.0,233053.0,44.0,20120.0,7.0,4546.0,2.0,3.0,12.0
max,10302.0,1.0,85523.653347,4.0,81.0,5.0,23.0,367030.0,885282.0,142.0,65970.0,25.0,57037.0,5.0,13.0,28.0


We separate the data frame into a target and the input features

In [4]:
target= train_raw.TARGET_FLAG
data = pd.get_dummies(train_raw.drop(['TARGET_FLAG','TARGET_AMT'],axis=1))
target.value_counts(normalize=True)

0    0.734988
1    0.265012
Name: TARGET_FLAG, dtype: float64

the target value is slightly unbalanced,   
the variables a have wide range of values so I also scale them for better fitting after separating them between training and testing data

In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = .2, random_state = 1234)

from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

# Machine learning

I first make a function to quickly test different algorithms

In [6]:
from sklearn.metrics import classification_report

def training(clf):
    ''' 
    train a classifier on the data and prints the different metrics and correlation table
    return the trained classifier
    '''
    clf.fit(X_train, y_train) # fitting on train set
    y_pred = clf.predict(X_test) # prediction of test set
    print(classification_report(y_test, y_pred, digits=2)) # shows some metrics
    cm = pd.crosstab(y_test, y_pred, colnames=['Predictions']) # correlation matrix
    display(cm)
    return clf

## SVC

In [7]:
from sklearn.svm import SVC
svc=SVC()
svc=training(svc)

              precision    recall  f1-score   support

           0       0.84      0.91      0.88       929
           1       0.60      0.43      0.50       280

    accuracy                           0.80      1209
   macro avg       0.72      0.67      0.69      1209
weighted avg       0.79      0.80      0.79      1209



Predictions,0,1
TARGET_FLAG,Unnamed: 1_level_1,Unnamed: 2_level_1
0,849,80
1,160,120


The suport-vector algorithm gives a good accuracy of 80%, but we notice a lot of false negative: 
indeed the recall for 1 is low (43%)

## Random forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

forest=RandomForestClassifier()
forest=training(forest)

              precision    recall  f1-score   support

           0       0.84      0.94      0.88       929
           1       0.64      0.39      0.49       280

    accuracy                           0.81      1209
   macro avg       0.74      0.66      0.68      1209
weighted avg       0.79      0.81      0.79      1209



Predictions,0,1
TARGET_FLAG,Unnamed: 1_level_1,Unnamed: 2_level_1
0,869,60
1,171,109


The random forest classifier gives similar results. The low recall for 1 might be due to the imbalance between the two classes, so we use a **balanced random forest classifier** instead

## Balanced random forest

In [9]:
from imblearn.ensemble import BalancedRandomForestClassifier

balanced_forest=BalancedRandomForestClassifier()
balanced_forest=training(balanced_forest)

              precision    recall  f1-score   support

           0       0.92      0.71      0.80       929
           1       0.45      0.79      0.57       280

    accuracy                           0.73      1209
   macro avg       0.68      0.75      0.69      1209
weighted avg       0.81      0.73      0.75      1209



Predictions,0,1
TARGET_FLAG,Unnamed: 1_level_1,Unnamed: 2_level_1
0,656,273
1,58,222


The overall accuracy is lower by 7%, but the recall for 1 has greatly improved.  
We finally decide to focuse on the **average recall** to chose our model, because we want the classifier to make better prediction about the positive class. In this case it happens to be the **Balanced Random Forest** with 75%.

# Prediction of test data

In [13]:
test_raw=pd.read_csv('test_auto.csv') # reads the test data with no target

# apply the same processing than on the 'train_auto' dataset
for column in ['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM']: # convert those columns into float
    test_raw[column]=test_raw[column].map(dollar_to_float)

data_test = train_raw.drop(['TARGET_FLAG','TARGET_AMT'],axis=1) # drops the empty target columns
data_test=pd.get_dummies(data_test.dropna()) # drops the remaining NaN 
data_test=scaler.transform(data_test)

# make the predictions using the best classifier
clf=balanced_forest
prediction_test = pd.DataFrame(clf.predict(data_test))
print(prediction_test.value_counts(normalize=True)) # show the result

prediction_test.to_csv('prediction_test.csv') # create a csv file with the predictions

0    0.633416
1    0.366584
dtype: float64
