# Inicialização

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os.path

In [2]:
ROOT = os.path.abspath('..')
DATA = os.path.join(ROOT, 'data', 'external')
df = pd.read_csv(os.path.join(DATA, 'train.csv'))

In [5]:
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


# Pré-Processamento

In [21]:
from sklearn.preprocessing import OrdinalEncoder

In [22]:
df.select_dtypes(include=['object'])

Unnamed: 0,Gender,Vehicle_Age,Vehicle_Damage
0,Male,> 2 Years,Yes
1,Male,1-2 Year,No
2,Male,> 2 Years,Yes
3,Male,< 1 Year,No
4,Female,< 1 Year,No
...,...,...,...
381104,Male,1-2 Year,No
381105,Male,< 1 Year,No
381106,Male,< 1 Year,No
381107,Female,> 2 Years,Yes


In [23]:
encoder = OrdinalEncoder()

In [31]:
df[['Gender', 'Vehicle_Age', 'Vehicle_Damage']] = encoder.fit_transform(df.select_dtypes(include=['object']))

In [35]:
encoder.categories_

[array(['Female', 'Male'], dtype=object),
 array(['1-2 Year', '< 1 Year', '> 2 Years'], dtype=object),
 array(['No', 'Yes'], dtype=object)]

- Gender
    - Male = 1
    - Female = 0
- Vehicle_Age
    - 1-2 Year = 0
    - < 1 Year = 1
    - \> 2 Years = 2
- Vehicle_Damage
    - No = 0
    - Yes = 1

# Treinamento

In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [40]:
X_train = df.drop(labels=['id', 'Response'], axis=1)
y_train = df['Response']

In [38]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier()

Analisando os dados de treino

In [41]:
result_train = rfc.predict(X_train)

In [43]:
print(classification_report(y_train, result_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    334399
           1       1.00      1.00      1.00     46710

    accuracy                           1.00    381109
   macro avg       1.00      1.00      1.00    381109
weighted avg       1.00      1.00      1.00    381109



Analisando os dados de teste

In [44]:
df_test = pd.read_csv(os.path.join(DATA, 'test.csv'))

In [47]:
df_test[['Gender', 'Vehicle_Age', 'Vehicle_Damage']] = encoder.transform(df_test.select_dtypes(include=['object']))

In [50]:
X_test = df_test.drop(labels=['id'], axis=1)

In [52]:
result_test = rfc.predict(X_test)

In [62]:
df_target = pd.read_csv(os.path.join(DATA, 'sample_submission.csv'))

In [66]:
result_test.shape

(127037,)

In [67]:
df_target['Response'].value_counts()

0    127037
Name: Response, dtype: int64

In [69]:
df_test

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,381110,1.0,25,1,11.0,1,1.0,0.0,35786.0,152.0,53
1,381111,1.0,40,1,28.0,0,0.0,1.0,33762.0,7.0,111
2,381112,1.0,47,1,28.0,0,0.0,1.0,40050.0,124.0,199
3,381113,1.0,24,1,27.0,1,1.0,1.0,37356.0,152.0,187
4,381114,1.0,27,1,28.0,1,1.0,0.0,59097.0,152.0,297
...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,0.0,26,1,37.0,1,1.0,0.0,30867.0,152.0,56
127033,508143,0.0,38,1,28.0,0,0.0,1.0,28700.0,122.0,165
127034,508144,1.0,21,1,46.0,1,1.0,0.0,29802.0,152.0,74
127035,508145,1.0,71,1,28.0,1,0.0,0.0,62875.0,26.0,265


In [81]:
print(classification_report(df_target['Response'], result_test,zero_division=True ))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98    127037
           1       0.00      1.00      0.00         0

    accuracy                           0.96    127037
   macro avg       0.50      0.98      0.49    127037
weighted avg       1.00      0.96      0.98    127037



In [77]:
confusion_matrix(df_target['Response'], result_test)

array([[121625,   5412],
       [     0,      0]])

In [80]:
# np.unique(result_test, return_counts=True)
print (pd.crosstab(df_target['Response'], result_test, rownames=['Real'], colnames=['      Predito'], margins=True))

      Predito       0     1     All
Real                               
0              121625  5412  127037
All            121625  5412  127037
