In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from utils import *
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, QuantileTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, LassoCV, LassoLarsCV
from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.svm import SVR
import scipy.stats as stats

In [2]:
# Display settings: never truncate the column list, print floats with two digits of precision.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)

# Read the training table (first CSV column becomes the index) and pass it through
# `preprocess_data` (presumably provided by the star import from `utils` - confirm).
df = pd.read_csv('dataset/train.csv', index_col=0).pipe(preprocess_data)

In [3]:
# Pairwise Spearman rank-correlation matrix over every float-typed column of `df`.
spearman = (
    df.select_dtypes(include='float')
    .corr(method='spearman')
)

In [4]:
# Absolute Spearman correlation of each float column with the target, weakest first.
spearman['Premium Amount'].abs().sort_values()

Insurance Duration      7.12e-05
Vehicle Age             8.70e-04
Number of Dependents    1.57e-03
Age                     2.31e-03
Policy Start Date       1.02e-02
Health Score            1.54e-02
Credit Score            4.13e-02
Annual Income           5.98e-02
Premium Amount          1.00e+00
Name: Premium Amount, dtype: float64

In [5]:
# p-value of the association between each candidate feature and the target:
#   * boolean columns - Mann-Whitney U test of the target in the True rows vs. the False rows
#   * float columns   - significance of the Spearman rank correlation with the target
# Results are collected in a dict and converted to a Series once, with an explicit float
# dtype, instead of growing a dtype-less `pd.Series()` one label at a time.
premium = df['Premium Amount']
pvalues = {}
for c in df.select_dtypes('bool').columns:
    # A single boolean mask yields both groups; no groupby is rebuilt per column.
    mask = df[c].to_numpy()
    pvalues[c] = stats.mannwhitneyu(premium[mask], premium[~mask]).pvalue
for c in df.select_dtypes('float').columns:
    # The target itself is not a feature, so it is skipped.
    if c != 'Premium Amount':
        pvalues[c] = stats.spearmanr(df[c], premium).pvalue
res = pd.Series(pvalues, dtype=float)
res.sort_values()

Annual Income                   0.00e+00
Previous Claims_two_or_more     0.00e+00
Credit Score                    0.00e+00
Previous Claims_unknown        7.16e-101
Health Score                    1.62e-63
Policy Start Date               6.88e-29
Previous Claims_one             1.03e-14
Customer Feedback_Good          2.22e-09
Customer Feedback_Poor          1.65e-04
Age                             1.13e-02
Marital Status_Married          4.48e-02
Location_Urban                  7.62e-02
Education Level_High School     8.47e-02
Number of Dependents            8.62e-02
Occupation_Self-Employed        1.21e-01
Property Type_Condo             1.91e-01
Occupation_Unemployed           2.03e-01
Policy Type_Premium             2.04e-01
Education Level_PhD             2.99e-01
Exercise Frequency_Weekly       3.36e-01
Vehicle Age                     3.41e-01
Education Level_Master's        3.41e-01
Marital Status_Single           5.25e-01
Exercise Frequency_Monthly      5.30e-01
Property Type_Ho

In [6]:
# Bonferroni-style filter: scale every p-value by the number of tests performed and
# keep the features whose scaled p-value stays below 0.05.
significant_features = res[res.mul(len(res)).lt(0.05)].index

In [7]:
target = df['Premium Amount']
# The competition metric is root mean squared log error, so the target is log1p-transformed
# once up front: ordinary squared-error losses on the transformed target then optimise the
# competition metric. Predictions have to be mapped back with expm1 afterwards.
transformed_target = np.log1p(target)
X = df[significant_features]
Y = transformed_target
# Keep half of the rows (train_size=0.5) for fitting so the grid searches run faster; the
# other half is the hold-out set. The split is stratified on 100 quantile bins of the
# transformed target so both halves share a similar target distribution.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y,
    train_size=0.5,
    stratify=pd.qcut(Y, 100).cat.codes,
    random_state=42,
)

In [8]:
# Baseline: standardised features fed into a Lasso, alpha chosen by 3-fold grid search.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', Lasso()),
])
# Three candidate penalties, log-spaced between 1e-4 and 1e4.
param_grid = {'model__alpha': np.logspace(-4, 4, num=3)}
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
gs.fit(xtrain, ytrain)
print('best parameters', gs.best_params_)
# The scorer is the negated RMSE, hence the negative value.
print('grid search CV score:', gs.best_score_)
best = gs.best_estimator_
# RMSE of the refit best pipeline on the fitting half and on the hold-out half.
print('train score:', root_mean_squared_error(y_true=ytrain, y_pred=best.predict(xtrain)))
print('test score:', root_mean_squared_error(y_true=ytest, y_pred=best.predict(xtest)))

best parameters {'model__alpha': 0.0001}
grid search CV score: -1.0887680923050187
train score: 1.0887424672833808
test score: 1.0888103778510976


In [9]:
# Same linear baseline, but alpha is picked by LassoCV's built-in cross-validated
# regularisation path (500 alphas, eps=1e-5) instead of an external grid search.
lasso_cv = LassoCV(eps=1e-5, n_alphas=500, max_iter=1000, n_jobs=-1)
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', lasso_cv),
])
pipe.fit(xtrain, ytrain)
print('train score:', root_mean_squared_error(y_true=ytrain, y_pred=pipe.predict(xtrain)))
print('test score:', root_mean_squared_error(y_true=ytest, y_pred=pipe.predict(xtest)))

train score: 1.0887424153841454
test score: 1.0888103953732085


In [16]:
# Random Forest
# max_samples=0.1 with bootstrap=True: every tree is fit on a bootstrap sample of 10% of
# the training rows, which keeps the search fast.
# random_state is fixed (same seed as the train/test split) so that the grid search and
# the reported scores can be reproduced on a re-run.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestRegressor(bootstrap=True, max_samples=0.1, random_state=42)),
])
# Both leaf-size controls are searched as fractions of the training set (1e-5 .. 1e-3).
param_grid = {'model__min_weight_fraction_leaf':np.logspace(-5,-3,3),
              'model__min_samples_leaf':np.logspace(-5,-3,3),
              'model__n_estimators':[100]}
gs = GridSearchCV(pipe, scoring = 'neg_root_mean_squared_error',param_grid=param_grid, cv=3)
gs.fit(xtrain,ytrain)
print(gs.best_params_)
# The scorer is the negated RMSE, hence the negative value.
print(gs.best_score_)

{'model__min_samples_leaf': 1e-05, 'model__min_weight_fraction_leaf': 0.001, 'model__n_estimators': 100}
-1.053225116196143


In [17]:
# Best pipeline found by the preceding grid search (refit by GridSearchCV on all of xtrain).
best = gs.best_estimator_
# RMSE on the fitting half and on the hold-out half.
print('train score:', root_mean_squared_error(y_true=ytrain, y_pred=best.predict(xtrain)))
print('test score:', root_mean_squared_error(y_true=ytest, y_pred=best.predict(xtest)))

train score: 1.046752488345783
test score: 1.0524693239288856


In [22]:
# AdaBoost based on Decision Tree
# Wrapped in a BaggingRegressor: each of the 100 bagged AdaBoost models (50 boosting rounds
# each) is fit on a bootstrapped sample of 1% (max_samples=0.01) of the train set, which is
# itself a subsample of the full dataset.
# random_state on the bagging wrapper seeds the resampling and the seeds handed to the
# nested estimators, so the search can be reproduced on a re-run.

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', BaggingRegressor(
        n_estimators=100,
        estimator=AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=50),
        max_samples=0.01,
        n_jobs=-1,
        random_state=42,
    )),
])
# Parameter paths: model (bagging) -> estimator (AdaBoost) -> estimator (decision tree).
param_grid = {'model__estimator__estimator__min_weight_fraction_leaf':np.logspace(-5,-3,3),
              'model__estimator__estimator__min_samples_leaf':[0.1,0.01,0.001],
              'model__estimator__learning_rate':[0.01,0.1,1],
              'model__estimator__estimator__max_depth':[None]}
gs = GridSearchCV(pipe, scoring = 'neg_root_mean_squared_error',param_grid=param_grid, cv=3)
gs.fit(xtrain,ytrain)
print(gs.best_params_)
# The scorer is the negated RMSE, hence the negative value.
print(gs.best_score_)

KeyboardInterrupt: 

In [15]:
# NOTE(review): this cell reads whatever `gs` currently holds; it only reflects the grid
# search above if that search ran to completion - confirm before trusting the numbers.
best = gs.best_estimator_
print(
    'train score:',
    root_mean_squared_error(y_true=ytrain, y_pred=best.predict(xtrain)),
)
print(
    'test score:',
    root_mean_squared_error(y_true=ytest, y_pred=best.predict(xtest)),
)

train score: 1.0632095248828417
test score: 1.0634364212994891


In [23]:
# Hand-picked configuration of the bagged AdaBoost model (no grid search):
# 100 bagged AdaBoost regressors, each with 50 boosting rounds at learning_rate=0.1, each
# fit on a 1% sample (max_samples=0.01) of the rows passed to fit.
# random_state on the bagging wrapper seeds the resampling and the seeds handed to the
# nested estimators, so the fit can be reproduced on a re-run.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', BaggingRegressor(
        n_estimators=100,
        estimator=AdaBoostRegressor(
            DecisionTreeRegressor(min_weight_fraction_leaf=0.001, min_samples_leaf=0.01),
            learning_rate=0.1,
            n_estimators=50,
        ),
        max_samples=0.01,
        n_jobs=-1,
        random_state=42,
    )),
])

In [None]:
# Pipeline.fit returns the pipeline itself, so `best` and `pipe` are the same fitted object.
best = pipe.fit(xtrain, ytrain)
# RMSE on the fitting half and on the hold-out half.
print('train score:', root_mean_squared_error(y_true=ytrain, y_pred=best.predict(xtrain)))
print('test score:', root_mean_squared_error(y_true=ytest, y_pred=best.predict(xtest)))

In [18]:
# Test-set table; the first CSV column is used as the index.
sample_submission = pd.read_csv(
    'dataset/test.csv',
    index_col=0,
)

In [19]:
# Run the test table through `preprocess_data`.
# NOTE(review): the cell overwrites its own input, so re-running it preprocesses twice.
sample_submission = sample_submission.pipe(preprocess_data)

In [20]:
# Predict with the current `best` model on the selected feature columns, then apply
# expm1 (the inverse of log1p) to the predictions.
pred = np.expm1(
    best.predict(sample_submission.loc[:, significant_features])
)

In [21]:
# Two-column submission file: the test index as `id` plus the predictions in `pred`.
# Built with the DataFrame constructor directly rather than calling the `from_dict`
# classmethod on a throwaway empty instance.
pd.DataFrame(
    {'id': sample_submission.index, 'Premium Amount': pred}
).to_csv('my_sample_submission.csv', index=False)