## Data Loading

In [56]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [57]:
train_df = pd.read_csv("train.csv")
train_df

Unnamed: 0,POLICY_ID,INSR_BEGIN,INSR_END,CUSTOMER_SENIORITY,SEX,INSR_TYPE,INSURED_VALUE,PREMIUM,VEHICLE_ID,USAGE,CLAIM_PAID,PROD_YEAR,SEATS_NUM,CARRYING_CAPACITY,TYPE_VEHICLE,CCM_TON,MAKE,CLAIMS
0,300226657,2014-07-01,2015-06-30,16,Female,1201,17000.00,163.20,5000546971,Private,,1980,1.0,,Motor-cycle,183.0,SUZUKI,False
1,300226656,2015-07-01,2016-06-30,29,Female,1201,17000.00,223.20,5000546971,Private,,1980,1.0,,Motor-cycle,183.0,SUZUKI,False
2,300231460,2014-07-01,2015-06-30,1,Female,1202,2600000.00,45761.90,5000578640,Own service,30981.20,2013,61.0,0.0,Bus,9700.0,BISHOFTU,True
3,300231459,2015-07-01,2016-06-30,0,Female,1202,2600000.00,45761.90,5000578640,Own service,127708.36,2013,61.0,0.0,Bus,9700.0,BISHOFTU,True
4,300231458,2016-07-01,2017-06-30,3,Female,1202,2600000.00,65515.00,5000578640,Own service,,2013,61.0,0.0,Bus,9700.0,BISHOFTU,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406118,300504321,2017-08-07,2018-08-06,5,Female,1201,1700000.00,4283.43,5001255424,Private,,2012,8.0,,Station Wagones,4164.0,TOYOTA,False
406119,300478457,2017-08-07,2018-08-06,14,Male,1201,0.00,608.40,5001107578,Private,,2002,4.0,,Automobile,1796.0,TOYOTA,False
406120,300506367,2017-08-07,2018-08-06,2,Female,1201,1500000.00,2143.32,5001272585,Private,,2014,7.0,,Station Wagones,4461.0,TOYOTA,False
406121,300505170,2017-08-07,2018-08-06,1,Female,1202,2691421.15,6166.49,5001249176,Own Goods,,2018,6.0,9.0,Pick-up,4164.0,TOYOTA,False


## Missing values & variables selection

##### Decidimos con que variables nos vamos a quedar y el pre processing a realizar. Removemos las dos variables con ID: Policy ID y Vehicle ID, si bien puede haber un patron en los vehiculos que renovaron su policy, podriamos tambien provocar data leakage. Removemos tambien Claim Paid que es la variable que queremos predecir, y ya reemplazamos con la variable Claims de tipo boolean.

In [58]:
train_df = train_df.drop(['POLICY_ID','INSR_BEGIN', 'INSR_END','VEHICLE_ID', 'CLAIM_PAID'], axis=1)

In [59]:
#Separo la variable dependiente de Claims del resto de las variables para el train df 
x=train_df.iloc[:,:-1]
y=train_df['CLAIMS']

In [60]:
#Utilizo  label binarizer para asignar clasificacion binaria a la variable sexo
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb.fit(x['SEX'])
x['SEX'] = lb.transform(x['SEX'])

In [61]:
#Reemplazo los null values de carrying capacity por el promedio  de la misma variable
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(np.array(x[['CARRYING_CAPACITY']]))
x['CARRYING_CAPACITY'] = imp.transform(np.array(x[['CARRYING_CAPACITY']]))

In [62]:
#Para el caso de las categoricas Usage, Type of Vehicle yMake utilizo one hot encoder o pd.get_dummies
encode= pd.get_dummies(x['USAGE'])
x=x.join(encode)
x=x.drop('USAGE', axis=1)

In [63]:
encode= pd.get_dummies(x['TYPE_VEHICLE'])
x=x.join(encode)
x=x.drop('TYPE_VEHICLE', axis=1)

In [64]:
#Make al tener originariamente mas de 680 categorias, y generaria demasiada dimensionalidad al df, se agruparon las categorias con menor count
#en OTHERS, dejando solo aquellos top 30 fabricantes de la base
encode= pd.get_dummies(x['MAKE'])
x=x.join(encode)
x=x.drop('MAKE', axis=1

In [65]:
#Se realiza estandarizacion del df por tener tantas diferencias entre variables y  algunos outliers menores que no fueron
#quitados del f,  en vez de tomar la media se utilizo robustscaler que utiliza la media y rango intercuartil
from sklearn.preprocessing import RobustScaler
x=RobustScaler().fit_transform(x)

In [None]:
x.shape


## Data Split

In [67]:
#se realiza data split para separar el df en test y train
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=22)
print(x_train)
print(x_test)
print(y_train)
print(y_test)

[[ 0.16666667  0.          0.         ...  1.          0.
   0.        ]
 [-0.16666667  0.          0.         ...  1.          0.
   0.        ]
 [ 0.33333333  1.          0.         ...  0.          0.
   0.        ]
 ...
 [-0.33333333  0.          0.         ...  0.          0.
   0.        ]
 [ 2.16666667  1.         -1.         ...  1.          0.
   0.        ]
 [ 0.          1.          0.         ...  0.          0.
   0.        ]]
[[-0.33333333  0.          0.         ...  1.          0.
   0.        ]
 [-0.16666667  1.         -1.         ...  1.          0.
   0.        ]
 [-0.33333333  1.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.16666667  0.         -1.         ...  0.          0.
   0.        ]
 [-0.16666667  0.          0.         ...  1.          0.
   0.        ]
 [-0.33333333  0.          0.         ...  0.          0.
   0.        ]]
90239     False
174541    False
167558    False
343327    False
177044    False
          ...  
120166    False


#### Se utilizaron dos modelos que son de los más recomendados para clasificación binaria: Regresión Logística y Random Forest. Me hubiese gustado sumar uno o dos más como SVM o XGBoost, pero por cuestion de tiempo me concentré en estos dos. 


In [73]:
LogReg = LogisticRegression(class_weight={0:1,1:2},max_iter=1000)
LogReg.fit(x_train, y_train)
LorgReg_pred =LogReg.predict(x_test)

In [75]:
print(classification_report(y_test, LorgReg_pred))

              precision    recall  f1-score   support

       False       0.92      0.99      0.96     74777
        True       0.25      0.03      0.05      6448

    accuracy                           0.92     81225
   macro avg       0.59      0.51      0.50     81225
weighted avg       0.87      0.92      0.88     81225



In [77]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(max_features=5, n_estimators=100)
RF.fit(x_train, y_train)
RF_pred= RF.predict(x_test)

In [83]:
print(classification_report(y_test, RF_pred))

              precision    recall  f1-score   support

       False       0.93      0.98      0.95     74777
        True       0.26      0.09      0.13      6448

    accuracy                           0.91     81225
   macro avg       0.59      0.53      0.54     81225
weighted avg       0.87      0.91      0.89     81225



#### Se utilizan metricas de evaluación del rendimiento de los modelos, como accuracy, precision, recall, f1. Ambos modelos con porcentajes muy alejados de lo deseado, pero sin ningún ajuste de hiperparametros aun asi Random Forest tiene mayor rendimiento que el modelo de regresion logistica.

## Optimizacion con hiperparámetros

In [None]:
LogReg = LogisticRegression(max_iter=1000)
LogReg_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
LogReg_grid_search = GridSearchCV(LogReg, LogReg_param_grid, cv=5, scoring='accuracy')
LogReg_grid_search.fit(x_train, y_train)

In [85]:
best_logreg_params = logreg_grid_search.best_params_
best_logreg_params

{'C': 0.1}

In [86]:
LorgReg_pred = logreg_grid_search.predict(x_test)


In [87]:
print("Logistic Regression Results:")
print(classification_report(y_test, LorgReg_pred))


Logistic Regression Results:
              precision    recall  f1-score   support

       False       0.92      1.00      0.96     74777
        True       0.07      0.00      0.00      6448

    accuracy                           0.92     81225
   macro avg       0.50      0.50      0.48     81225
weighted avg       0.85      0.92      0.88     81225



In [71]:
rf = RandomForestClassifier()
rf_param_grid = {'n_estimators': [50, 100, 150],
                 'max_depth': [None, 10, 20, 30],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}
rf_grid_search = GridSearchCV(rf, rf_param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(x_train, y_train) 

KeyboardInterrupt: 

In [None]:
best_rf_params = rf_grid_search.best_params_
best_rf_params 

In [None]:
RF_pred = rf_grid_search.predict(x_test)

In [None]:
print("Random Forest Results:")
print(classification_report(y_test, RF_pred))