In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# Load the training CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
data = pd.read_csv(filepath_or_buffer="D:/AP/Data/CrossSell/Ins_train.csv")

In [3]:
# Per-column lookup tables turning string categories into integer codes.
# Outer key = column name, inner dict = {original value: replacement}.
replaceStruct = {
    "Vehicle_Age": {
        "< 1 Year": 1,
        "1-2 Year": 2,
        "> 2 Years": 3,
    },
    "Gender": {
        "Female": 0,
        "Male": 1,
    },
    "Vehicle_Damage": {
        "No": 0,
        "Yes": 1,
    },
}

In [4]:
# Swap the string categories for their integer codes, column by column,
# using the nested mapping in replaceStruct.
data = data.replace(to_replace=replaceStruct)

In [5]:
# Feature matrix: the ten predictor columns used by every model below.
X = data[[
    "Gender",
    "Age",
    "Driving_License",
    "Region_Code",
    "Previously_Insured",
    "Vehicle_Age",
    "Vehicle_Damage",
    "Annual_Premium",
    "Policy_Sales_Channel",
    "Vintage",
]]
# Target vector.
y = data["Response"]

In [6]:
# Hold out 20% of the rows for final evaluation; the seed fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    test_size=0.2,
)

In [7]:
# Five shuffled folds with a fixed seed, reused for cross-validation.
folds = KFold(
    n_splits=5,
    random_state=123,
    shuffle=True,
)

In [9]:
# Gradient-boosted trees: 500 trees of depth 9 with a small learning rate.
# subsample=0.6 fits each tree on a random 60% of the training rows, which
# makes training stochastic -- random_state pins that randomness (same seed as
# the train/test split and the KFold splitter) so repeated runs and every
# cross-validation clone give repeatable results.
gb = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=500,
    max_depth=9,
    subsample=0.6,
    random_state=123,
)

In [10]:
# Cross-validate the gradient-boosting model on the training split using the
# shared fold splitter, with 4 parallel workers. Scoring is left at its
# default; the following cell reports the values as accuracy.
gb_cv_score = cross_val_score(
    estimator=gb,
    X=x_train,
    y=y_train,
    cv=folds,
    n_jobs=4,
    verbose=1,
)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed: 14.5min finished


In [15]:
# Per-fold scores, then their mean and spread across the folds.
print(gb_cv_score)
print("Mean accuracy:", gb_cv_score.mean())
print("Standard deviation", gb_cv_score.std())

[0.8750205  0.87821837 0.8773964  0.87857717 0.87808518]
Mean accuracy: 0.8774595232993653
Standard deviation 0.001278221764358152


In [16]:
# Hyper-parameter search space for the gradient-boosting model:
# 3 * 3 * 2 * 2 * 3 = 108 combinations.
grid = {
    "max_depth": [9, 10, 11],
    "min_samples_split": [30, 40, 50],
    "n_estimators": [500, 1000],
    "learning_rate": [0.01, 0.001],
    "subsample": [0.5, 0.6, 0.7],
}

In [17]:
# Exhaustive search over `grid` with 5-fold cross-validation and 6 workers.
grid_cv = GridSearchCV(
    estimator=gb,
    param_grid=grid,
    n_jobs=6,
    cv=5,
    verbose=2,
)

In [None]:
# Run the grid search on the training split, then show the winning model.
# NOTE(review): very expensive -- the recorded log reports 540 fits, with only
# the first 29 tasks done after 78.6 minutes, and the cell has no execution
# count, so the search was never completed in the saved notebook.
grid_cv.fit(X=x_train, y=y_train)
grid_cv.best_estimator_

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed: 78.6min


In [18]:
# AdaBoost baseline: 100 boosting rounds with a learning rate of 0.1,
# trained on the training split. The last line displays the fitted estimator.
ab = AdaBoostClassifier(
    learning_rate=0.1,
    n_estimators=100,
)
ab.fit(X=x_train, y=y_train)

AdaBoostClassifier(learning_rate=0.1, n_estimators=100)

In [19]:
# Class predictions of the AdaBoost model on the held-out test split.
pred = ab.predict(X=x_test)

In [20]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred)

array([[66897,     0],
       [ 9325,     0]], dtype=int64)

In [14]:
# Headline metrics for `pred` on the held-out test split.
# zero_division=0: precision is undefined when the model predicts no positives
# at all (the recorded confusion matrix has an all-zero predicted-positive
# column); report 0 explicitly instead of relying on a warning.
# NOTE(review): the recorded output of this cell carries execution count 14,
# i.e. it predates the AdaBoost cells (18-20), and disagrees with the
# confusion matrix -- re-run the notebook top to bottom to refresh it.
print("Accuracy:", accuracy_score(y_test, pred))
print("Precision:", precision_score(y_test, pred, zero_division=0))
print("Recall:", recall_score(y_test, pred))
print('F1-Score:', f1_score(y_test, pred, zero_division=0))

Accuracy: 0.8776731127495998
Precision: 1.0
Recall: 0.00010723860589812332
F1-Score: 0.00021445421402530555
