In [2]:
#import all the libraries for data manipulation
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

In [None]:
#read all the given files (policies, veihcles, drivers)
#note we divided data into 70:30 ratio for training and testing
data = pd.read_csv("C:/Users/admin/Desktop/Husky/Travellers/policies_train.csv")

In [None]:
vehicles = pd.read_csv("C:/Users/admin/Desktop/Husky/Travellers/vehicles.csv")

In [None]:
drivers = pd.read_csv("C:/Users/admin/Desktop/Husky/Travellers/drivers.csv")

In [None]:
test = pd.read_csv("C:/Users/admin/Desktop/Husky/Travellers/policies_test.csv")

In [None]:
test1 = pd.read_csv("C:/Users/admin/Desktop/Husky/Travellers/policies_test.csv")

In [None]:
# Show every column and up to 150 rows when a frame is displayed.
# The fully-qualified option name is used on purpose: the bare 'max_columns'
# pattern can match more than one pandas option, which raises OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)

In [None]:
# Quick look at the dimensions of each dataset
data.shape

In [None]:
vehicles.shape

In [None]:
drivers.shape

In [None]:
test.shape

In [None]:
data.head()

In [None]:
vehicles.head()

In [None]:
vehicles.info()

In [None]:
drivers.head()

In [None]:
drivers.info()

In [None]:
# Work on a copy: the cells below add and drop columns in place, and a plain
# alias would apply those edits to the raw `vehicles` frame as well.
vdata = vehicles.copy()

-----------------
## Merging vehicles with policy
-----------------

In [None]:
vdata.head()

In [None]:
# Keep only the text before the first ':' of make_model as the vehicle make.
# `n` is left at its default (split on every ':'), which is what the value 0 meant;
# passing it positionally is rejected by pandas 2.x, where it is keyword-only.
vdata['make'] = vdata['make_model'].str.split(':').str[0]
vdata.head()

In [None]:
vdata.drop(['make_model'], axis = 1, inplace = True)
vdata.head()

### First the categorical variables are transformed

In [None]:
vdata = pd.get_dummies(vdata, columns = ['ownership_type', 'color', 'make' ])
vdata.head()

### The dataset is then grouped by the policy no
#### - For attribute age -> mean, min and max are tabulated in separate columns
#### - For attribute car_no, only the max value from the column is taken for each policy, as that gives total cars in that policy
#### - For all binary columns that we just transformed, the sum is taken to aggregate data for each policy

In [None]:
# Collect the columns of vdata whose non-null values are all 0 or 1.
bool_cols = []
for col in vdata:
    observed = vdata[col].dropna().unique()
    if np.isin(observed, [0, 1]).all():
        bool_cols.append(col)
bool_cols

In [None]:
# Duplicate `age` so that one agg() call can report its mean, min and max per policy.
vdata['age_min'] = vdata['age']
vdata['age_max'] = vdata['age']

d1, d2, d3, d4 = {'car_no': 'max'}, {'age': 'mean'}, {'age_min': 'min'}, {'age_max': 'max'}
d5 = {name: 'sum' for name in bool_cols}  # 0/1 indicator columns are summed per policy

# Combine the partial specs; on a duplicate key the later spec wins.
d = dict(d1)
for part in (d2, d3, d4, d5):
    d.update(part)

vvdata = vdata.groupby('policy_id', as_index=False).agg(d)
vvdata.head()

In [None]:
data_veh = pd.merge(left = data, right = vvdata, how = 'inner', on = 'policy_id')
data_veh.head()

## Merging drivers to the previously merged dataset

In [None]:
# Work on a copy so that later column assignments never touch the raw `drivers` frame.
ddata = drivers.copy()

In [None]:
ddata.head()

In [None]:
ddata.describe()

In [None]:
ddata[ddata['age']>100].describe()

We can see that the driver age in a number of policies is greater than 100. These values are most likely inaccurate, and since only a small number of data points are affected, they can be removed.

In [None]:
ddata = ddata.drop(ddata[ddata['age'] > 100].index)

In [None]:
ddata['age'].describe()

We can group the age into bins to make it a categorical variable

In [None]:
bins = [0,25,35,45,55,65,101]
labels = ['15-25','26-35','36-45','46-55','56-65','65+']
ddata['Age Group'] = pd.cut(ddata['age'], bins = bins, labels = labels)
ddata.head()

In [None]:
ddata = pd.get_dummies(ddata, columns = ['gender', 'living_status', 'Age Group' ,'high_education_ind'])
ddata.head()

In [None]:
ddata.head()

In [None]:
# Collect the columns of ddata whose non-null values are all 0 or 1.
bool_cols1 = []
for col in ddata:
    observed = ddata[col].dropna().unique()
    if np.isin(observed, [0, 1]).all():
        bool_cols1.append(col)
bool_cols1

In [None]:
# Duplicate the rating so that one agg() call can report its mean, min and max per policy.
ddata['safety_min'] = ddata['safty_rating']
ddata['safety_max'] = ddata['safty_rating']

d1, d2, d3 = {'safty_rating': 'mean'}, {'safety_min': 'min'}, {'safety_max': 'max'}
d4 = {name: 'sum' for name in bool_cols1}  # 0/1 indicator columns are summed per policy

# Combine the partial specs; on a duplicate key the later spec wins.
d = dict(d1)
for part in (d2, d3, d4):
    d.update(part)

dddata = ddata.groupby('policy_id', as_index=False).agg(d)
dddata.head()

In [None]:
df_policy = pd.merge(left = data_veh, right = dddata, how = 'inner', on = 'policy_id')
df_policy.head()

In [None]:
df_policy.to_csv('policy_merged.csv')

# Data cleaning on merged dataset

In [None]:
# Keep 'New York' and 'Kings' as they are; every other county is collapsed to 0.
keep_county = df_policy['county_name'].isin(['New York', 'Kings'])
df_policy['county_name'] = np.where(keep_county, df_policy['county_name'], 0)

In [None]:
df_policy['county_name'].value_counts()

In [None]:
# Keep carriers 3, 7 and 8 as they are; every other prior carrier is collapsed to 0.
keep_carrier = df_policy['Prior_carrier_grp'].isin(['Carrier_3', 'Carrier_7', 'Carrier_8'])
df_policy['Prior_carrier_grp'] = np.where(keep_carrier, df_policy['Prior_carrier_grp'], 0)

In [None]:
df_policy['Prior_carrier_grp'].value_counts()

In [None]:
df_policy.columns

In [None]:
df_policy.drop(['make_ACURA ', 'make_BUICK ', 'make_CADILLAC ', 'make_CHRYSLER ', 'make_DODGE ','make_FORD ', 'make_GMC ','make_MAZDA ',
                'make_NISSAN ','make_RAM ','make_SATURN ','make_SMART ','make_SUBARU ','make_TOYOTA '], axis = 1, inplace = True)

In [None]:
df_policy.columns

In [None]:
df_policy.describe()

The safety rating has negative values, which will most probably have to be cleaned.

In [None]:
df_policy.isna().sum()

In [None]:
df_policy['state_id'].unique()

In [None]:
df_policy['zip'].max()

In [None]:
df_policy['zip'].min()

In [None]:
len(df_policy['Agent_cd'].unique())

In [None]:
df_policy['convert_ind'].value_counts(normalize = True)

In [None]:
df_policy['county_name'].value_counts()

In [None]:
df_policy['quoted_amt'].head()

In [None]:
# Strip the thousands separators and the dollar sign, then convert to a number.
# regex=False forces literal replacement: left to the version-dependent default,
# '$' can be read as the end-of-string regex anchor, which removes nothing and
# makes to_numeric coerce every quote to NaN.
df_policy['quote'] = df_policy['quoted_amt'].str.replace(',', '', regex=False)
df_policy['quote'] = df_policy['quote'].str.replace('$', '', regex=False)
df_policy['quote'] = pd.to_numeric(df_policy['quote'], errors='coerce')  # unparseable -> NaN
df_policy['quote'].dtype


In [None]:
df_policy['quote'].describe()

In [None]:
plt.hist(df_policy['quote'], bins = 20)

In [None]:
# Deciles 0.1 ... 0.9 of the quote (the end point 1.0 is excluded).
df_policy['quote'].quantile(np.linspace(.1, 1, num=9, endpoint=False))

In [None]:
bins = [0,2500,5000,7500,10000,200000]
labels = ['Very Low','Low','Medium','High','Very High']
df_policy['quote_range'] = pd.cut(df_policy['quote'], bins = bins, labels = labels)
df_policy.head(10)

In [None]:
df_policy['quote_range'].value_counts()

In [None]:
pd.crosstab(df_policy['quote_range'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
plt.hist(np.log(df_policy['quote']), bins = 20)

In [None]:
df_policy['log_quote'] = np.log(df_policy['quote'])

In [None]:
df_policy['Prior_carrier_grp'].value_counts()

In [None]:
df_policy['Cov_package_type'].value_counts()

In [None]:
df_policy['CAT_zone'].value_counts()

In [None]:
df_policy['total_drivers'] = df_policy['gender_F']+df_policy['gender_M']

In [None]:
len(df_policy[df_policy['number_drivers'] == df_policy['total_drivers']])

In [None]:
df_policy.drop(df_policy[df_policy['number_drivers'] != df_policy['total_drivers']].index, axis = 0, inplace = True)

In [None]:
len(df_policy)

In [None]:
len(df_policy[df_policy['number_drivers'] == df_policy['total_drivers']])

In [None]:
df_policy['primary_parking'].value_counts()

In [None]:
df_policy['gender_F'].sum()

In [None]:
df_policy['gender_M'].sum()

In [None]:
df_policy['credit_score'].max()

In [None]:
plt.hist(df_policy['safty_rating'],bins=10)

In [None]:
# Quantiles 0.2, 0.4, 0.6 and 0.8 of the mean safety rating (the end point 1.0 is excluded).
df_policy['safty_rating'].quantile(np.linspace(.2, 1, num=4, endpoint=False))

The safety rating attribute is divided into 5 categories, with cut points approximated from the distribution of safety ratings.

In [None]:
# Bucket the per-policy mean safety rating into five ordered categories.
# pd.cut uses right-closed intervals here: (0, 59], (59, 69], (69, 79], (79, 89], (89, 100].
# Ratings of exactly 0, below 0, or above 100 fall outside every bin and become NaN.
# The new column is 'safety_rating'; the numeric source column keeps its spelling 'safty_rating'.
bins = [0,59,69,79,89,100]
labels = ['Very Low','Low','Medium','High','Very High']
df_policy['safety_rating'] = pd.cut(df_policy['safty_rating'], bins = bins, labels = labels)
df_policy.head(10)

In [None]:
df_policy['safety_rating'].value_counts()

In [None]:
fig, ax = plt.subplots(figsize = (15,6))
sns.countplot(x='safety_rating', hue='convert_ind', data=df_policy, palette = ['Red','limegreen'])
ax.bar_label(container=ax.containers[0])
ax.bar_label(container=ax.containers[1])

In [None]:
pd.crosstab(df_policy['safety_rating'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize = (15,6))
sns.barplot(x=df_policy[df_policy['convert_ind']==1]['safety_rating'].value_counts().index, y = df_policy[df_policy['convert_ind']==1]['safety_rating'].value_counts(normalize = True).values, ax = ax1, palette = 'Set1')
sns.barplot(x=df_policy[df_policy['convert_ind']==0]['safety_rating'].value_counts().index, y = df_policy[df_policy['convert_ind']==0]['safety_rating'].value_counts(normalize = True).values, ax = ax2, palette = 'Set1')

ax1.bar_label(container=ax1.containers[0])
ax2.bar_label(container=ax2.containers[0])

In [None]:
plt.hist(df_policy['credit_score'],bins=10)

In [None]:
# Quantiles 0.33 and 0.665 of the credit score (two evenly spaced points from 0.33, end point excluded).
df_policy['credit_score'].quantile(np.linspace(.33, 1, num=2, endpoint=False))

In [None]:
bins = [0,500,600,700,800,850]
labels = ['Very Low','Low','Medium','High','Very High']
df_policy['credit_score_range'] = pd.cut(df_policy['credit_score'], bins = bins, labels = labels)

In [None]:
df_policy['credit_score_range'].value_counts()

In [None]:
pd.crosstab(df_policy['credit_score_range'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
sns.countplot(x='credit_score_range', hue='convert_ind', data=df_policy, palette = ['Red','limegreen'])

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize = (15,6))
sns.barplot(x=df_policy[df_policy['convert_ind']==1]['credit_score_range'].value_counts().index, y = df_policy[df_policy['convert_ind']==1]['credit_score_range'].value_counts(normalize = True).values, ax = ax1, palette = 'Set1')
sns.barplot(x=df_policy[df_policy['convert_ind']==0]['credit_score_range'].value_counts().index, y = df_policy[df_policy['convert_ind']==0]['credit_score_range'].value_counts(normalize = True).values, ax = ax2, palette = 'Set1')

ax1.bar_label(container=ax1.containers[0])
ax2.bar_label(container=ax2.containers[0])

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize = (15,6))

sns.boxplot(x='convert_ind', y='credit_score', data=df_policy, palette = ['Red','limegreen'], ax = ax1)
sns.boxplot(x='convert_ind', y='age', data=df_policy, palette = ['Red','limegreen'], ax = ax2)
ax2.set_ylabel('age_of_cars')

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize = (15,6))

sns.boxplot(x='convert_ind', y='age_min', data=df_policy, palette = ['Red','limegreen'], ax = ax1)
sns.boxplot(x='convert_ind', y='age_max', data=df_policy, palette = ['Red','limegreen'], ax = ax2)
ax1.set_ylabel('min_age_of_cars')
ax2.set_ylabel('max_age_of_cars')

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize = (15,6))
sns.countplot(x='state_id', hue='convert_ind', data=df_policy, ax = ax1, palette = ['Red','limegreen'])
sns.countplot(x='Prior_carrier_grp', hue='convert_ind', data=df_policy, ax = ax2, palette = ['Red','limegreen'])
fig.tight_layout()

In [None]:
pd.crosstab(df_policy['state_id'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
pd.crosstab(df_policy['Prior_carrier_grp'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize = (15,6))
sns.countplot(x='Cov_package_type', hue='convert_ind', data=df_policy, ax = ax1, palette = ['Red','limegreen'])
sns.countplot(x='CAT_zone', hue='convert_ind', data=df_policy, ax = ax2, palette = ['Red','limegreen'])
fig.tight_layout()

In [None]:
pd.crosstab(df_policy['Cov_package_type'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
pd.crosstab(df_policy['CAT_zone'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize = (15,6))
sns.countplot(x='number_drivers', hue='convert_ind', data=df_policy, ax = ax1, palette = ['Red','limegreen'])
sns.countplot(x='total_number_veh', hue='convert_ind', data=df_policy, ax = ax2, palette = ['Red','limegreen'])
fig.tight_layout()

In [None]:
pd.crosstab(df_policy['number_drivers'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
pd.crosstab(df_policy['total_number_veh'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize = (15,6))
sns.countplot(x='primary_parking', hue='convert_ind', data=df_policy, ax = ax1, palette = ['Red','limegreen'])
sns.countplot(x='car_no', hue='convert_ind', data=df_policy, ax = ax2, palette = ['Red','limegreen'])
fig.tight_layout()

In [None]:
pd.crosstab(df_policy['primary_parking'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
pd.crosstab(df_policy['car_no'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
df_policy.dtypes

In [None]:
# Keep only the integer and float columns for the correlation analysis.
numeric_dtypes = ['int64', 'float64', 'Int64']
numerical = df_policy.select_dtypes(include=numeric_dtypes)
numerical

In [None]:
numerical = numerical.drop(["zip","Agent_cd"], axis = 1)
numerical.dtypes

In [None]:
correlation = numerical.dropna().corr()
correlation

In [None]:
# Plot a correlation heatmap for all numerical variables.
# Only the Pearson method is listed, so just the first of the three subplot slots is filled.
plt.figure(figsize=(36,6), dpi=140)
for j,i in enumerate(['pearson']):
  plt.subplot(1,3,j+1)
  # rows containing any NaN are dropped before the correlation is computed
  correlation = numerical.dropna().corr(method=i)
  sns.heatmap(correlation, linewidth = 2)
  plt.title(i, fontsize=18)

In [None]:
c = correlation.abs()
s = c.unstack()
so = s.sort_values(ascending = False)
so[so < 1][0:20]

In [None]:
df_policy.info()

In [None]:
df_policy.drop(['quoted_amt', 'total_drivers'], axis = 1, inplace = True)

In [None]:
df_policy['safety_range'] = df_policy['safety_max'] - df_policy['safety_min']
df_policy['car_age_range'] = df_policy['age_max'] - df_policy['age_min']

In [None]:
df_policy['Quote_dt'] = pd.to_datetime(df_policy['Quote_dt'])

In [None]:
df_policy['Quote_month'] = df_policy['Quote_dt'].dt.month
df_policy['Quote_month'].value_counts()

In [None]:
pd.crosstab(df_policy['Quote_month'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
df_policy['Quote_quarter'] = df_policy['Quote_dt'].dt.quarter
df_policy['Quote_quarter'].value_counts()

In [None]:
pd.crosstab(df_policy['Quote_quarter'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
fig, (ax1,ax2) = plt.subplots(1,2, figsize = (15,6))
sns.countplot(x='Quote_month', hue='convert_ind', data=df_policy, ax = ax1, palette = ['Red','limegreen'])
sns.countplot(x='Quote_quarter', hue='convert_ind', data=df_policy, ax = ax2, palette = ['Red','limegreen'])
fig.tight_layout()

In [None]:
df_policy['Quote_year'] = df_policy['Quote_dt'].dt.year
df_policy['Quote_year'].value_counts()

In [None]:
pd.crosstab(df_policy['Quote_year'], df_policy['convert_ind'], normalize='index') * 100

In [None]:
fig, ax = plt.subplots(figsize = (15,6))
sns.countplot(x='Quote_year', hue='convert_ind', data=df_policy, palette = ['Red','limegreen'])
ax.bar_label(container=ax.containers[0])
ax.bar_label(container=ax.containers[1])

# Modelling

In [None]:
# Modelling frame. A copy is taken so that the in-place column drops applied to `x`
# do not also strip those columns from `df_policy`.
x = df_policy.copy()

In [None]:
x.drop(['split', 'Unnamed: 0', 'Quote_dt', 'zip', 'Agent_cd', 'policy_id'], axis = 1, inplace=True)

In [None]:
x.head()

In [None]:
x.info()

In [None]:
x1 = pd.get_dummies(x, columns = ['state_id','Prior_carrier_grp', 'primary_parking', 'county_name' ])

In [None]:
x1.head()

In [None]:
x1['discount'] = x1['discount'].astype('category')
x1['Home_policy_ind'] = x1['Home_policy_ind'].astype('category')
x1['Cov_package_type'] = x1['Cov_package_type'].astype('category')

In [None]:
x1['credit_score_range'] = x1['credit_score_range'].cat.codes
x1['Home_policy_ind'] = x1['Home_policy_ind'].cat.codes
x1['Cov_package_type'] = x1['Cov_package_type'].cat.codes
x1['safety_rating'] = x1['safety_rating'].cat.codes
x1['discount'] = x1['discount'].cat.codes
x1['quote_range'] = x1['quote_range'].cat.codes

In [None]:
x1.head()

In [None]:
x1.columns

In [None]:
x1.drop(['log_quote', 'Prior_carrier_grp_0', 'county_name_0'], axis = 1, inplace=True)

In [None]:
x1.drop(['primary_parking_home/driveway', 'primary_parking_parking garage',
       'primary_parking_street', 'primary_parking_unknown', 'state_id_AL', 'state_id_CT', 'state_id_FL',
       'state_id_GA', 'state_id_MN', 'state_id_NJ',
       'state_id_WI'], axis = 1, inplace = True)

In [None]:
y = x1['convert_ind']
x1 = x1.drop(['convert_ind'], axis = 1)

In [None]:
# Rescale every feature column with a min-max scaler and keep the column names.
# NOTE(review): the scaler is fitted on all rows of x1, i.e. before the
# train_test_split further down, so the held-out rows influence the scaling — confirm this is intended.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_x_scaled = scaler.fit_transform(x1)
# fit_transform returns a plain array; wrap it back into a frame with the feature names
train_x_scaled = pd.DataFrame(train_x_scaled, columns=x1.columns)


______________________________

# XGBoost

## Run this
________________________________

In [None]:
pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report as rep

________________________________

## Grid Search Optimization and Model Fitting

In [None]:
estimator = xgb.XGBClassifier(objective ='binary:logistic', tree_method = 'gpu_hist',sampling_method='gradient_based',nthread = 4, seed = 20)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(train_x_scaled, y, test_size=0.2, stratify=y, random_state=20)

In [None]:
# Search space for the randomized hyper-parameter search (the search cells below are disabled).
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

param_test = {
    # uniform on [loc, loc + scale] = [0.4, 1.0]
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    # candidate depths: 6 and 7
    'max_depth': range (6, 8, 1),
    # integers from 100 to 799 (upper bound exclusive)
    'n_estimators': sp_randint(100, 800),
    'learning_rate': [0.025, 0.1, 0.01, 0.05],
    # NOTE(review): presumably not a parameter XGBClassifier recognises (the number of
    # boosting rounds is set through n_estimators) — confirm whether it has any effect
    'num_rounds': [5,10,15],
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    # integers from 4 to 14 (upper bound exclusive)
        'scale_pos_weight': sp_randint(4, 15)
}

In [None]:
'''n_HP_points_to_test = 500
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

gs = RandomizedSearchCV(
    estimator=estimator, param_distributions=param_test,
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=1,
    verbose=True)'''

In [None]:
# Search disabled; uncomment together with the RandomizedSearchCV cell to re-run it.
# gs.fit(X_train, y_train)

In [None]:
#print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

#### Best score reached: 0.6846207236377041 with params: {'colsample_bytree': 0.5019644285303547, 'learning_rate': 0.025, 'max_depth': 6, 'n_estimators': 555, 'num_rounds': 5, 'reg_alpha': 100, 'reg_lambda': 50, 'scale_pos_weight': 13}

## Run the code below to fit the model and then predict on the test

In [None]:
# Final classifier, configured with the best hyper-parameters reported by the randomized
# search (colsample_bytree, learning_rate, max_depth, n_estimators, num_rounds, reg_alpha,
# reg_lambda, scale_pos_weight); the remaining arguments are spelled out explicitly.
# NOTE(review): `num_rounds` is presumably not a recognised XGBClassifier parameter — confirm.
# NOTE(review): `n_jobs` and `nthread` are both set to 4; they look like aliases — confirm.
# NOTE(review): `sampling_method='gradient_based'` and `gpu_id=0` look GPU-specific, but no
# GPU tree_method is set here — confirm this runs on the installed xgboost version.
xgb_final = xgb.XGBClassifier(objective ='binary:logistic',sampling_method='gradient_based',base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.5019644285303547, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0, gpu_id=0, grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.025, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=6, max_leaves=0, min_child_weight=1, n_estimators=555, n_jobs=4, nthread=4,
              num_parallel_tree=1, num_rounds=5, reg_alpha = 100, reg_lambda = 50, scale_pos_weight = 13, seed = 20)

In [None]:
xgb_final.fit(X_train,y_train)

In [None]:
preds_train = xgb_final.predict(X_train)
preds_test = xgb_final.predict(X_test)

In [None]:
test_prob = xgb_final.predict_proba(X_test)
test_prob

In [None]:
# ROC AUC on the training split.
# The score is computed from the predicted probability of class 1, not from hard 0/1
# labels: with labels the ROC curve has a single operating point and the value no longer
# measures ranking quality (nor is it comparable with the 'roc_auc' search score).
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_train, xgb_final.predict_proba(X_train)[:, 1])
print('AUC: %.3f' % auc)

In [None]:
# ROC AUC on the held-out split.
# The score is computed from the predicted probability of class 1, not from hard 0/1
# labels: with labels the ROC curve has a single operating point and the value no longer
# measures ranking quality (nor is it comparable with the 'roc_auc' search score).
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, xgb_final.predict_proba(X_test)[:, 1])
print('AUC: %.3f' % auc)

In [None]:
# Feature-importance chart, also written to sample.pdf.
# The figure size is set before the figure is created so that it applies to this plot,
# and the file is saved before plt.show(): once shown, the current figure is released
# and a later savefig would write an empty page.
plt.rcParams['figure.figsize'] = [15, 15]
fig, ax = plt.subplots()
xgb.plot_importance(xgb_final, ax=ax)
fig.savefig('sample.pdf')
plt.show()

In [None]:
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz7.0.1/bin/'

In [None]:
xgb.plot_tree(xgb_final, rankdir='LR')
fig = plt.gcf()
fig.set_size_inches(50, 85)

In [None]:
pip install shap

In [None]:
import shap

In [None]:
#X_train, y_train = shap.datasets.boston()

In [None]:
explainer = shap.Explainer(xgb_final)
shap_values = explainer(X_train)

In [None]:
# visualize the first prediction's explanation
shap.plots.waterfall(shap_values[0], max_display = 16)

In [None]:
# summarize the effects of all the features
shap.summary_plot(shap_values, X_train)

In [None]:
shap.plots.force(shap_values[0])

In [None]:
shap.plots.bar(shap_values, max_display = 15)

In [None]:
clustering = shap.utils.hclust(X_train, y_train) # by default this trains (X.shape[1] choose 2) 2-feature XGBoost models
shap.plots.bar(shap_values, clustering=clustering, max_display = 15)

# Predicting on Test

In [None]:
test.shape

In [None]:
# Attach the per-policy vehicle aggregates to the test policies.
# A left join keeps every test policy; an inner join would silently drop policies that
# have no vehicle rows, so they would receive no prediction. This also matches the
# left join used for the driver aggregates.
test = pd.merge(left = test, right = vvdata, how = 'left', on = 'policy_id')
test.head()


In [None]:
test.shape

In [None]:
test = pd.merge(left = test, right = dddata, how = 'left', on = 'policy_id')
test.head()

In [None]:
test.shape

In [None]:
submission = pd.DataFrame(columns = ['policy_id', 'TARGET'])
submission['policy_id'] = test['policy_id']

In [None]:
# Strip the thousands separators and the dollar sign, then convert to a number.
# regex=False forces literal replacement: left to the version-dependent default,
# '$' can be read as the end-of-string regex anchor, which removes nothing and
# makes to_numeric coerce every quote to NaN.
test['quote'] = test['quoted_amt'].str.replace(',', '', regex=False)
test['quote'] = test['quote'].str.replace('$', '', regex=False)
test['quote'] = pd.to_numeric(test['quote'], errors='coerce')  # unparseable -> NaN
test['quote'].dtype

In [None]:
bins = [0,2500,5000,7500,10000,200000]
labels = ['Very Low','Low','Medium','High','Very High']
test['quote_range'] = pd.cut(test['quote'], bins = bins, labels = labels)
test.head()

In [None]:
test['log_quote'] = np.log(test['quote'])

In [None]:
test.shape

In [None]:
# Driver count per test policy, taken from the test frame's own gender counts
# (the training frame `df_policy` has different rows and must not be used here).
test['total_drivers'] = test['gender_F'] + test['gender_M']

In [None]:
#test.drop(test[test['number_drivers'] != test['total_drivers']].index, axis = 0, inplace = True)

In [None]:
test.shape

In [None]:
bins = [0,59,69,79,89,100]
labels = ['Very Low','Low','Medium','High','Very High']
test['safety_rating'] = pd.cut(test['safty_rating'], bins = bins, labels = labels)
test.head(5)

In [None]:
test['county_name'] = np.where((test['county_name'] == 'New York')|(test['county_name'] =='Kings'), test['county_name'], 0)

In [None]:
test['Prior_carrier_grp'] = np.where((test['Prior_carrier_grp'] == 'Carrier_3')|(test['Prior_carrier_grp'] =='Carrier_7')|(test['Prior_carrier_grp'] =='Carrier_8'), test['Prior_carrier_grp'], 0)

In [None]:
test.drop(['make_ACURA ', 'make_BUICK ', 'make_CADILLAC ', 'make_CHRYSLER ', 'make_DODGE ','make_FORD ', 'make_GMC ','make_MAZDA ',
                'make_NISSAN ','make_RAM ','make_SATURN ','make_SMART ','make_SUBARU ','make_TOYOTA '], axis = 1, inplace = True)

In [None]:
test.columns

In [None]:
bins = [0,500,600,700,800,850]
labels = ['Very Low','Low','Medium','High','Very High']
test['credit_score_range'] = pd.cut(test['credit_score'], bins = bins, labels = labels)

In [None]:
test.drop(['quoted_amt', 'total_drivers'], axis = 1, inplace = True)

In [None]:
test['safety_range'] = test['safety_max'] - test['safety_min']
test['car_age_range'] = test['age_max'] - test['age_min']

In [None]:
test['Quote_dt'] = pd.to_datetime(test['Quote_dt'])

In [None]:
test['Quote_month'] = test['Quote_dt'].dt.month
test['Quote_quarter'] = test['Quote_dt'].dt.quarter
test['Quote_year'] = test['Quote_dt'].dt.year

In [None]:
test.drop(['split', 'Unnamed: 0', 'Quote_dt', 'zip', 'Agent_cd', 'policy_id'], axis = 1, inplace=True)

In [None]:
test = pd.get_dummies(test, columns = ['state_id','Prior_carrier_grp', 'primary_parking', 'county_name' ])

In [None]:
# Convert to categoricals using the levels observed in the training frame, so that the
# .cat.codes taken in the next cell map each level to the same integer as in training.
# Deriving the levels from the test data alone would shift the codes whenever a level is
# missing from (or only present in) the test set. Levels unseen in training become NaN.
for col in ['discount', 'Home_policy_ind', 'Cov_package_type']:
    train_levels = df_policy[col].astype('category').cat.categories
    test[col] = pd.Categorical(test[col], categories=train_levels)

In [None]:
test['credit_score_range'] = test['credit_score_range'].cat.codes
test['Home_policy_ind'] = test['Home_policy_ind'].cat.codes
test['Cov_package_type'] = test['Cov_package_type'].cat.codes
test['safety_rating'] = test['safety_rating'].cat.codes
test['discount'] = test['discount'].cat.codes
test['quote_range'] = test['quote_range'].cat.codes

In [None]:
test.drop(['primary_parking_home/driveway', 'primary_parking_parking garage',
       'primary_parking_street', 'primary_parking_unknown', 'state_id_AL', 'state_id_CT', 'state_id_FL',
       'state_id_GA', 'state_id_MN', 'state_id_NJ',
       'state_id_WI'], axis = 1, inplace = True)

In [None]:
test.columns

In [None]:
test.drop(['log_quote', 'Prior_carrier_grp_0', 'county_name_0'], axis = 1, inplace=True)

In [None]:
test.info()

In [None]:
test_x = test.drop(['convert_ind'], axis = 1)

In [None]:
# Scale the test features with the scaler that was fitted on the training features.
# Fitting a fresh scaler on the test set would map the same raw value to a different
# scaled value than the model saw during training.
# Columns are first aligned to the training layout (same names, same order); a column
# absent from the test frame is filled with 0 and columns unknown to the model are dropped.
test_x = test_x.reindex(columns=train_x_scaled.columns, fill_value=0)
test_x_scaled = scaler.transform(test_x)
test_x_scaled = pd.DataFrame(test_x_scaled, columns=test_x.columns)

In [None]:
#xgb_final.fit(train_x_scaled,y)

In [None]:
test_y = xgb_final.predict(test_x_scaled)

In [None]:
#print("Accuracy of Model::",accuracy_score(test_y,test_preds))

In [None]:
test_prob = xgb_final.predict_proba(test_x_scaled)
test_prob

In [None]:
submission.shape

In [None]:
c = pd.DataFrame(test_prob[:,1].tolist(), columns = ['predictions'])
c.head()

In [None]:
# Assign by position: `c` carries a fresh 0..n-1 index, so a label-aligned assignment
# would leave NaN wherever submission's index (inherited from `test`) differs from it.
submission['TARGET'] = c['predictions'].to_numpy()

In [None]:
submission.head()

In [None]:
#submission.to_csv('test_predictions18.csv')

In [None]:
# Feature-importance plot of the fitted model, using the `xgb` and `plt` aliases
# imported earlier in the notebook and the fitted `xgb_final` classifier.
xgb.plot_importance(xgb_final)
plt.title("xgboost.plot_importance(model)")
plt.show()