### CatBoost is good at handling categoriacal variables 
    Pass a list of categorical variables names while fitting the model. 
    For more information on CatBoost : https://catboost.ai/docs/concepts/python-reference_parameters-list.html#python-reference_parameters-list

In [13]:
import pandas as pd
import numpy as np 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, recall_score, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier

In [3]:
data = pd.read_csv("D:/AP/Data/CrossSell/Ins_train.csv")

### Regular Preprocessing 

In [4]:
replaceStruct = {
                "Vehicle_Age":     {"< 1 Year": 1, "1-2 Year": 2 ,"> 2 Years": 3},
                "Gender":     {"Female": 0, "Male": 1 },
                "Vehicle_Damage":     {"No": 0, "Yes": 1 } 
                    }

In [5]:
data = data.replace(replaceStruct)

In [6]:
data["Annual_Premium"] = np.where(data["Annual_Premium"] >100000, 100000.0,data['Annual_Premium'])

In [7]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
num_feat = ['Age','Annual_Premium','Vintage' ]

for column in num_feat:
    data[[column]] = sc.fit_transform(data[[column]])

In [8]:
from sklearn.preprocessing import LabelEncoder

cat_feat = ['Region_Code', 'Policy_Sales_Channel' ]
lb = LabelEncoder()
for column in cat_feat:
    data[column] = data[column].astype('str')
    data[column] = lb.fit_transform(data[column])

In [9]:
y = data['Response']
X = data[['Gender', 'Age', 'Region_Code',
       'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage']]

In [56]:
## train and test split
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 123)

In [14]:
## Model fit with default parameters
cb = CatBoostClassifier()
cb.fit(x_train, y_train)

Learning rate set to 0.118484
0:	learn: 0.5050478	total: 214ms	remaining: 3m 33s
1:	learn: 0.4055817	total: 255ms	remaining: 2m 7s
2:	learn: 0.3538478	total: 297ms	remaining: 1m 38s
3:	learn: 0.3251186	total: 339ms	remaining: 1m 24s
4:	learn: 0.3078987	total: 383ms	remaining: 1m 16s
5:	learn: 0.2968845	total: 429ms	remaining: 1m 11s
6:	learn: 0.2895086	total: 478ms	remaining: 1m 7s
7:	learn: 0.2845957	total: 521ms	remaining: 1m 4s
8:	learn: 0.2809799	total: 563ms	remaining: 1m 1s
9:	learn: 0.2783981	total: 605ms	remaining: 59.9s
10:	learn: 0.2763815	total: 644ms	remaining: 57.9s
11:	learn: 0.2747059	total: 694ms	remaining: 57.1s
12:	learn: 0.2735640	total: 748ms	remaining: 56.8s
13:	learn: 0.2725474	total: 803ms	remaining: 56.6s
14:	learn: 0.2717824	total: 851ms	remaining: 55.9s
15:	learn: 0.2712148	total: 895ms	remaining: 55s
16:	learn: 0.2705807	total: 940ms	remaining: 54.3s
17:	learn: 0.2702272	total: 984ms	remaining: 53.7s
18:	learn: 0.2699273	total: 1.03s	remaining: 53.4s
19:	lear

<catboost.core.CatBoostClassifier at 0x225c8839f88>

In [16]:
pred = cb.predict(x_test)

In [17]:
confusion_matrix(y_test, pred)

array([[66543,   354],
       [ 9065,   260]], dtype=int64)

## Model performance is poor, lets try to optimize the parameters. 
    1. Class weight is required to balance the classes - Cat boost takes a list of weights for positive and Negative classes eg: [0.5, 1.2] -- Low weights for positive classes and higher weights for negative
    2. Evaluation set is test set only 
    3. Cat_features = List of names of categorical features 

In [29]:
## Create an evaluationset using training and testing 
ev_set = [[x_train, y_train],[x_test, y_test]]

In [26]:
x_train.columns

Index(['Gender', 'Age', 'Region_Code', 'Previously_Insured', 'Vehicle_Age',
       'Vehicle_Damage', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage'],
      dtype='object')

In [27]:
Cat_feat = ['Gender', 'Region_Code', 'Previously_Insured', 'Vehicle_Damage','Policy_Sales_Channel'  ]

In [52]:
cb = CatBoostClassifier(iterations = 200, learning_rate=0.1,depth = 11, class_weights=[0.35,1.2] )
cb.fit(x_train, y_train, eval_set = (x_test, y_test),early_stopping_rounds = 10, cat_features = Cat_feat  )

0:	learn: 0.5904923	test: 0.5906113	best: 0.5906113 (0)	total: 169ms	remaining: 33.6s
1:	learn: 0.5258093	test: 0.5260717	best: 0.5260717 (1)	total: 503ms	remaining: 49.8s
2:	learn: 0.4863262	test: 0.4866398	best: 0.4866398 (2)	total: 771ms	remaining: 50.6s
3:	learn: 0.4617489	test: 0.4621244	best: 0.4621244 (3)	total: 1.08s	remaining: 53.1s
4:	learn: 0.4457429	test: 0.4462335	best: 0.4462335 (4)	total: 1.34s	remaining: 52.4s
5:	learn: 0.4353830	test: 0.4360438	best: 0.4360438 (5)	total: 1.6s	remaining: 51.8s
6:	learn: 0.4283717	test: 0.4290428	best: 0.4290428 (6)	total: 1.83s	remaining: 50.5s
7:	learn: 0.4229407	test: 0.4238056	best: 0.4238056 (7)	total: 2.1s	remaining: 50.5s
8:	learn: 0.4188844	test: 0.4199036	best: 0.4199036 (8)	total: 2.34s	remaining: 49.7s
9:	learn: 0.4158299	test: 0.4169657	best: 0.4169657 (9)	total: 2.61s	remaining: 49.5s
10:	learn: 0.4133314	test: 0.4145741	best: 0.4145741 (10)	total: 2.83s	remaining: 48.7s
11:	learn: 0.4113992	test: 0.4128620	best: 0.4128620 (

<catboost.core.CatBoostClassifier at 0x225c8155988>

In [53]:
pred = cb.predict(x_test,ntree_end = 56
                 )

In [54]:
confusion_matrix(y_test, pred)

array([[52007, 14890],
       [ 2116,  7209]], dtype=int64)

In [55]:
from sklearn.metrics import f1_score
print("Accuracy:",accuracy_score(y_test, pred))
print("Precision:",precision_score(y_test, pred))
print("Recall:",recall_score(y_test, pred))
print('F1-Score:',f1_score(y_test, pred))

Accuracy: 0.7768885623573246
Precision: 0.32621385583058055
Recall: 0.7730831099195711
F1-Score: 0.458821283095723


### Model Performance is on par With Xgboost and trains faster than Xgboost 
 Next steps : 
 1. Perform cross validation 
 2. Use GridSearchCV to find the optimal parameters 
 3. Feature engineering on Features to improve the results 
             

In [58]:
folds = KFold(n_splits = 5, shuffle = True, random_state = 123)

In [59]:

cb_cv_score = cross_val_score(cb, x_train, y_train, cv = folds, verbose = 1, n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.3min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.4min finished


In [60]:
cb_cv_score

array([0.78920266, 0.78685756, 0.7807206 , 0.78457451, 0.78483691])