In [4]:
import pandas as pd 
import numpy as np 
import seaborn as sns 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import confusion_matrix , accuracy_score , precision_score, recall_score, f1_score

In [5]:
df = pd.read_csv('D:/AP/Data/CrossSell/Ins_train.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


## Observations and Actions 
    1.  No missing values 
    2. Check Vehicle_Age , datatype is object 
    3. Policy_sales_channel? Why the data is float
    4. Change object data types to numeric 
    

In [7]:
df.Vehicle_Age.value_counts()

1-2 Year     200316
< 1 Year     164786
> 2 Years     16007
Name: Vehicle_Age, dtype: int64

In [8]:
df.Vehicle_Damage.value_counts()

Yes    192413
No     188696
Name: Vehicle_Damage, dtype: int64

In [9]:
df.Gender.value_counts()

Male      206089
Female    175020
Name: Gender, dtype: int64

In [10]:
# Create a dictionary of variables and levels with replacement values
replaceStruct = {
                "Vehicle_Age":     {"< 1 Year": 1, "1-2 Year": 2 ,"> 2 Years": 3},
                "Gender":     {"Female": 0, "Male": 1 },
                "Vehicle_Damage":     {"No": 0, "Yes": 1 }
                
                    }

In [11]:
df = df.replace(replaceStruct)

In [12]:
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,1,44,1,28.0,0,3,1,40454.0,26.0,217,1
1,2,1,76,1,3.0,0,2,0,33536.0,26.0,183,0
2,3,1,47,1,28.0,0,3,1,38294.0,26.0,27,1
3,4,1,21,1,11.0,1,1,0,28619.0,152.0,203,0
4,5,0,29,1,41.0,1,1,0,27496.0,152.0,39,0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  int64  
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  int64  
 7   Vehicle_Damage        381109 non-null  int64  
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(9)
memory usage: 34.9 MB


In [14]:
df.Response.value_counts()

0    334399
1     46710
Name: Response, dtype: int64

In [15]:
46710/(46710+334399)

0.12256336113815208

In [16]:
df.columns

Index(['id', 'Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [17]:
X = df[['Gender', 'Age', 'Driving_License', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage']]
y = df['Response']

In [18]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 123)

In [19]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(304887, 10)
(76222, 10)
(304887,)
(76222,)


In [20]:
rf = RandomForestClassifier(n_estimators = 200, max_features = 5, max_depth = 12, min_samples_split=50,
                            class_weight = "balanced",n_jobs = -1, verbose=1)
rf.fit(x_train,y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   25.0s finished


RandomForestClassifier(class_weight='balanced', max_depth=12, max_features=5,
                       min_samples_split=50, n_estimators=200, n_jobs=-1,
                       verbose=1)

In [21]:
pred = rf.predict(x_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.3s finished


In [22]:
accuracy_score(y_test, pred)

0.7042848521424261

In [23]:
confusion_matrix(y_test, pred)

array([[45092, 21805],
       [  735,  8590]], dtype=int64)

In [24]:
print("accuracy score:", accuracy_score(y_test, pred))
print("Precision score:", precision_score(y_test, pred))
print("Recall score:", recall_score(y_test, pred))
print("F-1 score:", f1_score(y_test, pred))

accuracy score: 0.7042848521424261
Precision score: 0.2826122717552229
Recall score: 0.9211796246648793
F-1 score: 0.43252769385699896


In [25]:
## lets add a crossvaalidation 

from sklearn.model_selection import KFold, cross_val_score

folds = KFold(n_splits=5, shuffle=True, random_state=123 )

In [26]:
rf_cv_score = cross_val_score(rf, x_train, y_train,cv = folds, n_jobs = -1, verbose=1 )

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.5min remaining:  2.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished


In [27]:
rf_cv_score

array([0.71245367, 0.70750107, 0.70390475, 0.70715188, 0.70688948])

In [31]:
rf = RandomForestClassifier(class_weight = "balanced",n_jobs = -1, verbose=1)

In [41]:
from sklearn.model_selection import GridSearchCV

param_dist = {"max_depth": [11,13,15],
              "min_samples_split": [50,75,100],
              "max_features": [4,5,6],
              "n_estimators": [200]
             }
grid_search = GridSearchCV(rf, n_jobs=-1, param_grid=param_dist, cv = 5, scoring="roc_auc", verbose=2)
grid_search.fit(x_train,y_train)
grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed: 437.7min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   21.2s finished


RandomForestClassifier(class_weight='balanced', max_depth=13, max_features=4,
                       min_samples_split=100, n_estimators=200, n_jobs=-1,
                       verbose=1)

In [38]:
grid_search.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=11,
                       min_samples_split=50, n_estimators=200, n_jobs=-1,
                       verbose=1)

In [42]:
pred = grid_search.predict(x_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    0.3s finished


In [43]:
print("accuracy score:", accuracy_score(y_test, pred))
print("Precision score:", precision_score(y_test, pred))
print("Recall score:", recall_score(y_test, pred))
print("F-1 score:", f1_score(y_test, pred))

accuracy score: 0.7053212983128231
Precision score: 0.2829190904283448
Recall score: 0.9179624664879357
F-1 score: 0.43253075970794064
