In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

In [2]:
data_train = pd.read_csv('aug_train.csv')
data_test = pd.read_csv('aug_test.csv')

In [3]:
data_train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,167647,Male,22,1,7.0,1,< 1 Year,No,2630.0,152.0,16,0
1,17163,Male,42,1,28.0,0,1-2 Year,Yes,43327.0,26.0,135,0
2,32023,Female,66,1,33.0,0,1-2 Year,Yes,35841.0,124.0,253,0
3,87447,Female,22,1,33.0,0,< 1 Year,No,27645.0,152.0,69,0
4,501933,Male,28,1,46.0,1,< 1 Year,No,29023.0,152.0,211,0


In [4]:
data_test.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,57782,Female,34,1,39.0,1,1-2 Year,No,38244.0,124.0,146
1,286811,Female,55,1,28.0,0,> 2 Years,Yes,37577.0,122.0,109
2,117823,Male,39,1,28.0,1,1-2 Year,No,24578.0,26.0,63
3,213992,Male,28,1,50.0,1,1-2 Year,No,40507.0,8.0,129
4,324756,Female,24,1,10.0,0,< 1 Year,Yes,36783.0,152.0,201


In [5]:
print("Train shape is {}, test shape is {}.".format(data_train.shape, data_test.shape))

Train shape is (382154, 12), test shape is (78273, 11).


In [6]:
data_train[['Age', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,382154.0,38.545691,15.226897,20.0,25.0,36.0,49.0,85.0
Annual_Premium,382154.0,30711.271362,17061.595532,2630.0,24546.0,31692.0,39447.75,540165.0
Policy_Sales_Channel,382154.0,111.939812,54.286511,1.0,26.0,145.0,152.0,163.0
Vintage,382154.0,154.189429,83.735107,10.0,81.0,154.0,227.0,299.0


In [7]:
data_train.groupby('Response')['id'].count()

Response
0    319553
1     62601
Name: id, dtype: int64

In [8]:
# target event rate
print("Target event rate is: {}".format(data_train['Response'].sum()/data_train['id'].count()))

Target event rate is: 0.16381092439173736


In [9]:
data_train['Region_Code'].unique()

array([ 7., 28., 33., 46., 25.,  8., 41., 39., 13., 14., 36.,  3., 43.,
       45., 48., 11., 30., 15.,  6., 50., 38., 19., 34., 22., 29., 35.,
       12., 37.,  4., 10., 40., 24., 31., 21.,  2., 52.,  9., 49.,  5.,
       47.,  1., 20., 42., 27., 26., 32., 18., 16., 17., 51.,  0., 23.,
       44.])

In [10]:
data_train['Vehicle_Age'].unique()

array(['< 1 Year', '1-2 Year', '> 2 Years'], dtype=object)

In [11]:
data_train['Vintage'].nunique()

290

**PREPROCESSING DATA**

In [12]:
data_train['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [13]:
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder(sparse=False)

In [14]:
gender = onehot_encoder.fit_transform(data_train[['Gender']])
gender = pd.DataFrame(gender, columns=['Female', 'Male'])

In [15]:
vehicle_damage = onehot_encoder.fit_transform(data_train[['Vehicle_Damage']])
vehicle_damage = pd.DataFrame(vehicle_damage, columns=['Vehicle_Damage_No', 'Vehicle_Damage_Yes'])

In [16]:
vehicle_age = onehot_encoder.fit_transform(data_train[['Vehicle_Age']])
vehicle_age = pd.DataFrame(vehicle_age, columns=[ 'Vehicle_Age_one_two_Year', 'Vehicle_Age_less_one_Year', 'Vehicle_Age_grea_two_Years'])

In [17]:
data_train = pd.concat([data_train, gender, vehicle_damage, vehicle_age], axis=1)

In [18]:
data_train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Female,Male,Vehicle_Damage_No,Vehicle_Damage_Yes,Vehicle_Age_one_two_Year,Vehicle_Age_less_one_Year,Vehicle_Age_grea_two_Years
0,167647,Male,22,1,7.0,1,< 1 Year,No,2630.0,152.0,16,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,17163,Male,42,1,28.0,0,1-2 Year,Yes,43327.0,26.0,135,0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,32023,Female,66,1,33.0,0,1-2 Year,Yes,35841.0,124.0,253,0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,87447,Female,22,1,33.0,0,< 1 Year,No,27645.0,152.0,69,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,501933,Male,28,1,46.0,1,< 1 Year,No,29023.0,152.0,211,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [19]:
data_train = data_train.drop(columns=['Gender', 'Region_Code', 'Vehicle_Damage', 'Vehicle_Age'])
data_index = data_train[['id']]

In [20]:
data_train = data_train.drop(columns=['id'])

In [21]:
data_train.head()

Unnamed: 0,Age,Driving_License,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Female,Male,Vehicle_Damage_No,Vehicle_Damage_Yes,Vehicle_Age_one_two_Year,Vehicle_Age_less_one_Year,Vehicle_Age_grea_two_Years
0,22,1,1,2630.0,152.0,16,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1,42,1,0,43327.0,26.0,135,0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,66,1,0,35841.0,124.0,253,0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,22,1,0,27645.0,152.0,69,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,28,1,1,29023.0,152.0,211,0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [22]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

In [23]:
Y = data_train['Response']
X = data_train.drop(columns = ['Response'])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=1000)

print("Shape of X_train is: {}. Shape of X_test is: {}".format(X_train.shape, X_test.shape))

Shape of X_train is: (286615, 13). Shape of X_test is: (95539, 13)


In [24]:
# grid search
model = xgb.XGBClassifier(random_state=9)

n_estimators = range(50, 800, 100)
max_depth = range(3,7, 1)
min_child_weight = [1, 3, 5 ,7, 9]
gamma = [0]
learning_rate = [0.01]
tree_method = ['gpu_hist']

param_grid = dict(learning_rate=learning_rate, 
                  n_estimators=n_estimators, 
                  max_depth=max_depth, 
                  min_child_weight=min_child_weight, 
                  gamma=gamma,
                  tree_method=tree_method)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, cv=kfold, verbose=4)
result = grid_search.fit(X_train, y_train, random_state=8)

TypeError: __init__() got an unexpected keyword argument 'random_state'

In [None]:
pd.DataFrame(result.cv_results_).to_excel('resultCV.xlsx')

In [None]:
pd.DataFrame(result.cv_results_).columns

In [None]:
cv_result = pd.DataFrame(result.cv_results_)[['param_gamma', 'param_learning_rate', 'param_max_depth',
       'param_min_child_weight', 'param_n_estimators', 'param_tree_method', 'mean_test_score','rank_test_score']]

In [None]:
cv_result.sort_values('rank_test_score').head()

In [None]:
# 9 je max za min_child_weight
# 750 je max za n_estimators
# 6 je max za max_depth

#probacemo nove parametre

# grid search 2
model = xgb.XGBClassifier(random_state=9)

n_estimators = range(750, 1200, 100)
max_depth = range(6,9, 1)
min_child_weight = [9, 10, 11]
gamma = [0]
learning_rate = [0.01]
tree_method = ['gpu_hist']

param_grid = dict(learning_rate=learning_rate, 
                  n_estimators=n_estimators, 
                  max_depth=max_depth, 
                  min_child_weight=min_child_weight, 
                  gamma=gamma,
                  tree_method=tree_method)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, cv=kfold, verbose=5)
result = grid_search.fit(X_train, y_train)

In [None]:
cv_result_1 = pd.DataFrame(result.cv_results_)[['param_gamma', 'param_learning_rate', 'param_max_depth',
       'param_min_child_weight', 'param_n_estimators', 'param_tree_method', 'mean_test_score','rank_test_score']]

In [None]:
cv_result_1.sort_values('rank_test_score').head()

In [None]:
# 1150 je max za n_estimators
# 11 je max za min_child_weight

# grid search 3
model = xgb.XGBClassifier(random_state=9)

n_estimators = range(1150, 1500, 100)
max_depth = [6]
min_child_weight = [11, 12, 13]
gamma = [0]
learning_rate = [0.01]
tree_method = ['gpu_hist']

param_grid = dict(learning_rate=learning_rate, 
                  n_estimators=n_estimators, 
                  max_depth=max_depth, 
                  min_child_weight=min_child_weight, 
                  gamma=gamma,
                  tree_method=tree_method)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, cv=kfold, verbose=5)
result = grid_search.fit(X_train, y_train)

In [None]:
cv_result_2 = pd.DataFrame(result.cv_results_)[['param_gamma', 'param_learning_rate', 'param_max_depth',
       'param_min_child_weight', 'param_n_estimators', 'param_tree_method', 'mean_test_score','rank_test_score']]

cv_result_2.sort_values('rank_test_score').head()

In [None]:
# gamma

# grid search 4
model = xgb.XGBClassifier(random_state=9)

n_estimators = [1250]
max_depth = [6]
min_child_weight = [12]
gamma = [0, 0.2, 0.4, 0.6]
learning_rate = [0.01]
tree_method = ['gpu_hist']

param_grid = dict(learning_rate=learning_rate, 
                  n_estimators=n_estimators, 
                  max_depth=max_depth, 
                  min_child_weight=min_child_weight, 
                  gamma=gamma,
                  tree_method=tree_method)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, cv=kfold, verbose=5)
result = grid_search.fit(X_train, y_train)

In [None]:
cv_result_3 = pd.DataFrame(result.cv_results_)[['param_gamma', 'param_learning_rate', 'param_max_depth',
       'param_min_child_weight', 'param_n_estimators', 'param_tree_method', 'mean_test_score','rank_test_score']]

cv_result_3.sort_values('rank_test_score').head()

In [None]:
# scale position weight
# max je za gamma=0 

# grid search 5
model = xgb.XGBClassifier(random_state=9)

n_estimators = [1250]
max_depth = [6]
min_child_weight = [12]
gamma = [0]
scale_pos_weight = [1, 2, 3, 4]
learning_rate = [0.01]
tree_method = ['gpu_hist']

param_grid = dict(learning_rate=learning_rate, 
                  n_estimators=n_estimators, 
                  max_depth=max_depth, 
                  min_child_weight=min_child_weight, 
                  gamma=gamma,
                  scale_pos_weight=scale_pos_weight,
                  tree_method=tree_method)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, cv=kfold, verbose=5)
result = grid_search.fit(X_train, y_train)

In [None]:
cv_result_4 = pd.DataFrame(result.cv_results_)[['param_scale_pos_weight', 'param_gamma', 'param_learning_rate', 'param_max_depth',
       'param_min_child_weight', 'param_n_estimators', 'param_tree_method', 'mean_test_score','rank_test_score']]

cv_result_4.sort_values('rank_test_score').head()

In [None]:
# grid search 6
model = xgb.XGBClassifier(random_state=9)

n_estimators = [1250]
max_depth = [6]
min_child_weight = [12]
gamma = [0]
scale_pos_weight = [1]
learning_rate = [0.01, 0.03, 0.05, 0.1]
reg_lambda = [0, 0.01, 0.1, 1, 3]
tree_method = ['gpu_hist']

param_grid = dict(learning_rate=learning_rate, 
                  n_estimators=n_estimators, 
                  max_depth=max_depth, 
                  min_child_weight=min_child_weight, 
                  gamma=gamma,
                  scale_pos_weight=scale_pos_weight,
                  reg_lambda=reg_lambda,
                  tree_method=tree_method)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="roc_auc", n_jobs=-1, cv=kfold, verbose=5)
result = grid_search.fit(X_train, y_train)

In [None]:
cv_result_5 = pd.DataFrame(result.cv_results_)[['param_reg_lambda', 'param_scale_pos_weight', 'param_gamma', 'param_learning_rate', 'param_max_depth',
       'param_min_child_weight', 'param_n_estimators', 'param_tree_method', 'mean_test_score','rank_test_score']]

cv_result_5.sort_values('rank_test_score').head()

In [27]:
from sklearn.metrics import roc_auc_score, accuracy_score

In [25]:
def modelfit(alg, X_train, y_train, X_test, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values, feature_names=X_train.columns)
        xgtest = xgb.DMatrix(X_test.values, label=y_test.values, feature_names=X_train.columns)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
             stratified=True, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    print("Number of tree: {}".format(cvresult.shape[0]))
    
    #Fit the algorithm on the data
    alg.fit(X_train, y_train, eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(X_train)
    dtrain_predprob = alg.predict_proba(X_train)[:,1]
        
    #Print model report:
    print("\nModel Report train:")
    print("Accuracy train : %.4g" % accuracy_score(y_train, dtrain_predictions))
    print("AUC Score (Train): %f" % roc_auc_score(y_train, dtrain_predprob))
    
    #predict on testing data:
    dtest_predictions = alg.predict(X_test)
    dtest_predprob = alg.predict_proba(X_test)[:,1]
    
    print("\nModel Report test:")
    print("Accuracy test : %.4g" % accuracy_score(y_test, dtest_predictions))
    print('AUC Score (Test): %f' % roc_auc_score(y_test, dtest_predprob))
                

In [None]:
import numpy as np
from sklearn.feature_selection import SelectFromModel

model = xgb.XGBClassifier( learning_rate =0.01,
                         n_estimators=1250,
                         max_depth=6,
                         min_child_weight=12,
                         gamma=0,
                         scale_pos_weight=1,
                         objective= 'binary:logistic',
                         tree_method='gpu_hist',
                         nthread=-1,
                         n_jobs=-1,
                         seed=27)
model.fit(X_train, y_train)

# make predictions for test data and evaluate
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Fit model using each importance as a threshold

thresholds = np.sort(model.feature_importances_)
for thresh in thresholds:
    
    # select features using threshold
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    
    # train model
    selection_model = xgb.XGBClassifier( learning_rate =0.01,
                         n_estimators=1250,
                         max_depth=6,
                         min_child_weight=12,
                         gamma=0,
                         scale_pos_weight=1,
                         objective= 'binary:logistic',
                         tree_method='gpu_hist',
                         nthread=-1,
                         n_jobs=-1,
                         seed=27)
    selection_model.fit(select_X_train, y_train)
    
    # eval model
    select_X_test = selection.transform(X_test)
    predictions = selection_model.predict(select_X_test)
    
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))


Najbolje rezultate imamo ukoliko koristimo sve varijable.

In [28]:
from xgboost import plot_tree

model = xgb.XGBClassifier( learning_rate =0.01,
                         n_estimators=1250,
                         max_depth=6,
                         min_child_weight=12,
                         gamma=0,
                         scale_pos_weight=1,
                         objective= 'binary:logistic',
                         tree_method='gpu_hist',
                         nthread=-1,
                         n_jobs=-1,
                         seed=27)
model.fit(X_train, y_train)

# feature importance
feat_imp = pd.DataFrame({'feature_names': X_train.columns, 'feature_importance':model.feature_importances_})

# make predictions for test data and evaluate
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 84.63%


In [29]:
feat_imp.sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_names,feature_importance
8,Vehicle_Damage_No,0.520855
9,Vehicle_Damage_Yes,0.316751
2,Previously_Insured,0.118354
0,Age,0.016548
10,Vehicle_Age_one_two_Year,0.007259
12,Vehicle_Age_grea_two_Years,0.006409
4,Policy_Sales_Channel,0.005569
1,Driving_License,0.002373
11,Vehicle_Age_less_one_Year,0.002201
6,Female,0.001063
