# Connecting to GDrive

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries

In [None]:
!pip install vecstack
!pip install feature_engine

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
import feature_engine as fe
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import time
import seaborn as sn
import matplotlib.pyplot as plt
from matplotlib import pyplot
from vecstack import stacking
from collections import Counter
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import StackingClassifier
from collections import defaultdict
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from collections import Counter

# Reading required data files

In [None]:
# Locations of the two input CSVs on the mounted Drive
trainfile = r'/content/drive/MyDrive/CIS508/Individual Assignment 3/train.csv'
testfile = r'/content/drive/MyDrive/CIS508/Individual Assignment 3/test.csv'

# Load both files, training data first; test_data is the frame that is later used as X_test
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (trainfile, testfile))

# Getting sense of the data

In [None]:
# Report (rows, columns) for each frame: train first, then test
for frame in (train_data, test_data):
    print(frame.shape)

(260753, 299)
(173836, 298)


In [None]:
# Peek at five randomly drawn training rows (no seed is given, so the rows differ between runs)
train_data.sample(5)

Unnamed: 0,QuoteNumber,Original_Quote_Date,QuoteConversion_Flag,Field6,Field7,Field8,Field9,Field10,Field11,Field12,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
226343,377353,2013-07-05,0,B,23,0.9403,0.0006,965,1.02,N,...,12,14,-1,24,-1,8,-1,8,N,CA
172785,287930,2015-03-11,0,B,25,0.9153,0.0007,935,1.02,N,...,13,16,-1,20,-1,8,-1,8,N,CA
130303,216894,2013-09-30,1,B,24,0.9403,0.0006,965,1.02,N,...,4,2,-1,2,-1,23,25,25,N,CA
224701,374563,2014-02-01,0,E,14,0.9472,0.0006,1487,1.3045,N,...,22,24,-1,13,-1,22,-1,21,N,IL
171700,286131,2014-05-03,1,F,7,1.0006,0.004,548,1.2433,N,...,10,11,-1,23,-1,10,-1,16,N,NJ


In [None]:
# Peek at five randomly drawn test rows (no seed is given, so the rows differ between runs)
test_data.sample(5)

Unnamed: 0,QuoteNumber,Original_Quote_Date,Field6,Field7,Field8,Field9,Field10,Field11,Field12,CoverageField1A,...,GeographicField59A,GeographicField59B,GeographicField60A,GeographicField60B,GeographicField61A,GeographicField61B,GeographicField62A,GeographicField62B,GeographicField63,GeographicField64
24150,60621,2013-10-04,J,20,0.9497,0.0004,1165,1.2665,N,6,...,14,17,-1,15,-1,4,-1,21,N,TX
168398,421309,2014-10-21,F,23,1.0006,0.004,548,1.2433,N,5,...,11,13,-1,23,-1,18,-1,8,N,NJ
49436,123767,2014-10-15,J,26,0.8793,0.0004,1113,1.2665,N,6,...,3,2,-1,1,-1,15,-1,23,N,TX
137040,342660,2014-06-19,F,7,1.0006,0.004,548,1.2433,N,15,...,15,19,-1,21,-1,4,-1,18,N,NJ
142948,357158,2015-01-29,F,7,1.0005,0.004,548,1.2433,N,15,...,18,22,-1,22,-1,15,-1,24,N,NJ


In [None]:
# Checking the balance of the dependent variable in the data received
train_data['QuoteConversion_Flag'].value_counts()

# This is an imbalanced class problem, evident from the difference in class counts below

0    211859
1     48894
Name: QuoteConversion_Flag, dtype: int64

In [None]:
# Percentage of null values in each training column, sorted from most to least missing
(train_data.isnull().sum()*100/len(train_data)).sort_values(ascending = False)
# The dataset does contain nulls: the output below shows PropertyField29 (~77%) and PersonalField84 (~48%)
# as heavily missing, plus a few columns with well under 1% nulls

PropertyField29      76.963640
PersonalField84      47.634351
PropertyField38       0.467876
PersonalField7        0.043336
PropertyField36       0.043336
                       ...    
PersonalField58       0.000000
PersonalField57       0.000000
PersonalField56       0.000000
PersonalField55       0.000000
GeographicField64     0.000000
Length: 299, dtype: float64

In [None]:
# Percentage of null values in each test column, sorted from most to least missing
(test_data.isnull().sum()*100/len(test_data)).sort_values(ascending = False)

PropertyField29      77.052509
PersonalField84      47.638004
PropertyField38       0.486666
PropertyField3        0.039693
PersonalField7        0.039693
                       ...    
PersonalField58       0.000000
PersonalField57       0.000000
PersonalField56       0.000000
PersonalField55       0.000000
GeographicField64     0.000000
Length: 298, dtype: float64

# EDA and Data Transformations

In [None]:
# Derive year and month columns from the quote date, then remove the raw date
# and the row identifier from the training frame
train_quote_dates = pd.DatetimeIndex(train_data['Original_Quote_Date'])
train_data['year'] = train_quote_dates.year
train_data['month'] = train_quote_dates.month
train_data = train_data.drop(columns=['Original_Quote_Date', 'QuoteNumber'])

In [None]:
# Same date handling for the test frame; QuoteNumber is saved first because the
# submission files built later need it
test_quote_dates = pd.DatetimeIndex(test_data['Original_Quote_Date'])
test_data['year'] = test_quote_dates.year
test_data['month'] = test_quote_dates.month
quote_num = test_data['QuoteNumber']
test_data = test_data.drop(columns=['Original_Quote_Date', 'QuoteNumber'])

In [None]:
# Remove the two columns with a very high share of nulls from both frames
high_null_cols = ["PropertyField29", "PersonalField84"]
train_data = train_data.drop(columns=high_null_cols)
test_data = test_data.drop(columns=high_null_cols)

In [None]:
# Separate the target from the predictors for training; the test frame has no
# target column to split off and is used as-is (X_test is the same object as test_data)
target_col = "QuoteConversion_Flag"
y_train = train_data[target_col]
X_train = train_data.drop(columns=target_col)
X_test = test_data

## Data Transformation and OneHotEncoding categorical variables




In [None]:
# Feature pruning on the training predictors:
#   1) remove duplicated features, 2) remove features whose Pearson correlation exceeds 0.8
from feature_engine.selection import DropDuplicateFeatures
from feature_engine.selection import DropCorrelatedFeatures

transformer = DropDuplicateFeatures()
X_train = transformer.fit_transform(X_train)

tr = DropCorrelatedFeatures(
    variables=None,
    method='pearson',
    threshold=0.8,
)
tr.fit(X_train)
X_train = tr.transform(X_train)

# Display the shape after pruning
X_train.shape

(260753, 158)

In [None]:
# Restrict the test frame to exactly the columns that survived pruning on train
train_cols = X_train.columns.tolist()
X_test = X_test.loc[:, train_cols]

In [None]:
# Names of the object-dtype (categorical) columns in the training predictors
object_frame = X_train.select_dtypes(include=['object'])
cat_features = list(object_frame.columns)
len(cat_features)

27

In [None]:
# One-hot encode every categorical column (first level dropped, no NaN indicator).
# Train and test are encoded independently; the non-categorical columns stay in
# front and the dummy columns follow in cat_features order.
dummy_options = dict(prefix_sep='_', drop_first=True, dummy_na=False)
X_train = pd.get_dummies(X_train, columns=cat_features, **dummy_options)
X_test = pd.get_dummies(X_test, columns=cat_features, **dummy_options)

In [None]:
# Keep only the encoded columns that exist in both frames, in training-column order
_common = [col_name for col_name in X_train.columns if col_name in X_test.columns]
len(_common)

421

In [None]:
# Give train and test the identical, identically ordered set of columns
X_train = X_train.loc[:, _common]
X_test = X_test.loc[:, _common]

In [None]:
# Show both shapes: a notebook cell only displays its last expression, so listing
# them on separate lines silently hid the training shape.
X_train.shape, X_test.shape

(173836, 421)

## Data Sampling with SMOTE

In [None]:
# Oversample the minority class with SMOTE until minority/majority = 0.40
# (finalised on 0.4 after trying out multiple percentages).
# random_state is fixed so the synthetic rows, and every model trained on them,
# are the same on each run.
print('Original dataset shape %s' % Counter(y_train))
sm = SMOTE(sampling_strategy=0.40, random_state=0)
X_train, y_train = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_train))

Original dataset shape Counter({0: 211859, 1: 48894})
Resampled dataset shape Counter({0: 211859, 1: 84743})


# Building individual models


### Hyperparameter tuning for RandomForest (randomized search)

In [None]:
# Randomized search over RandomForest hyperparameters; the winning combination is
# stored in best_params_dict for the RandomForest cells further down.
start_time = time.time()

# Search space: leaf size 10..90 (step 10), depth 5..29 (step 2), both split criteria
grid = {
    'min_samples_leaf': range(10, 100, 10),
    'max_depth': range(5, 30, 2),
    'criterion': ['gini', 'entropy'],
}

# Seed both the forest and the sampler so the search is repeatable
rf_class = RandomForestClassifier(random_state=0)

hp_tuning = RandomizedSearchCV(estimator=rf_class, param_distributions=grid,
                               n_iter=100, random_state=0)

# Fit on X_train. The previous S_train is only created by the stacking cell much
# later in the notebook, so on a fresh top-to-bottom run this line raised NameError.
# NOTE(review): 100 candidates on the full resampled data is expensive - consider a smaller n_iter.
hp_tuning.fit(X_train, y_train)

best_params_dict = hp_tuning.best_params_

print(best_params_dict)
print(time.time() - start_time)  # elapsed wall-clock seconds

In [None]:
# Baseline 1: logistic regression with default settings on the resampled training data
lr = LogisticRegression().fit(X_train, y_train)

# predict_proba returns one column per class; column index 1 is submitted as the conversion score
y_test = lr.predict_proba(X_test)
kaggle_sub = pd.DataFrame({
    'QuoteNumber': quote_num,
    'QuoteConversion_Flag': y_test[:, 1],
})

In [None]:
# Save the submission and download it from Colab.
# index=False keeps the file to the two submission columns; without it to_csv
# also writes the DataFrame index as an extra unnamed first column.
from google.colab import files
kaggle_sub.to_csv('HomeSiteLR.csv', index=False)
files.download('HomeSiteLR.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Baseline 2: decision tree with default settings on the resampled training data
dt = DecisionTreeClassifier().fit(X_train, y_train)

# predict_proba returns one column per class; column index 1 is submitted as the conversion score
y_test = dt.predict_proba(X_test)
kaggle_sub = pd.DataFrame({
    'QuoteNumber': quote_num,
    'QuoteConversion_Flag': y_test[:, 1],
})

In [None]:
# Save the submission and download it from Colab.
# index=False keeps the file to the two submission columns; without it to_csv
# also writes the DataFrame index as an extra unnamed first column.
from google.colab import files
kaggle_sub.to_csv('HomeSiteDT.csv', index=False)
files.download('HomeSiteDT.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Baseline 3: 50-tree random forest using the tuned leaf size, depth and split criterion
tuned_rf_params = {
    param_name: best_params_dict[param_name]
    for param_name in ('min_samples_leaf', 'max_depth', 'criterion')
}
rf = RandomForestClassifier(n_estimators=50, **tuned_rf_params)
rf.fit(X_train, y_train)

# predict_proba returns one column per class; column index 1 is submitted as the conversion score
y_test = rf.predict_proba(X_test)
kaggle_sub = pd.DataFrame({
    'QuoteNumber': quote_num,
    'QuoteConversion_Flag': y_test[:, 1],
})

In [None]:
# Save the submission and download it from Colab.
# index=False keeps the file to the two submission columns; without it to_csv
# also writes the DataFrame index as an extra unnamed first column.
from google.colab import files
kaggle_sub.to_csv('HomeSiteRF.csv', index=False)
files.download('HomeSiteRF.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Building stacked models

In [None]:
# Building the level 0 (base) models in stacking.
# The estimators that accept random_state are seeded: stacking's own random_state
# only controls the fold split, so unseeded models still produced different
# out-of-fold predictions on every run.
# NOTE(review): needs_proba=False means the metric and the level 1 models receive
# predicted class labels, not probabilities - confirm this is intended with roc_auc_score.
models = [
    KNeighborsClassifier(),
    LinearSVC(random_state=0),
    RandomForestClassifier(random_state=0),
    DecisionTreeClassifier(random_state=0),
    MLPClassifier(random_state=0),
]

S_train, S_test = stacking(models,                     # list of models
                           X_train, y_train ,X_test,   # data
                           regression=False,           # classification task (if you need 
                                                       #     regression - set to True)
                           mode='oof_pred_bag',        # mode: oof for train set, predict test 
                                                       #     set in each fold and vote
                           needs_proba=False,          # predict class labels (if you need 
                                                       #     probabilities - set to True) 
                           save_dir=None,              # do not save result and log (to save 
                                                       #     in current dir - set to '.')
                           metric=roc_auc_score,      # metric: callable
                           n_folds=2,                  # number of folds
                           stratified=True,            # stratified split for folds
                           shuffle=True,               # shuffle the data
                           random_state=0,             # seed for the fold split
                           verbose=2)                  # print all info

task:         [classification]
n_classes:    [2]
metric:       [roc_auc_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [KNeighborsClassifier]
    fold  0:  [0.66127189]
    fold  1:  [0.66109083]
    ----
    MEAN:     [0.66118136] + [0.00009053]
    FULL:     [0.66118133]

model  1:     [LinearSVC]




    fold  0:  [0.61967761]




    fold  1:  [0.59173600]
    ----
    MEAN:     [0.60570681] + [0.01397080]
    FULL:     [0.60570949]

model  2:     [RandomForestClassifier]
    fold  0:  [0.87567254]
    fold  1:  [0.88190295]
    ----
    MEAN:     [0.87878775] + [0.00311521]
    FULL:     [0.87878777]

model  3:     [DecisionTreeClassifier]
    fold  0:  [0.85257000]
    fold  1:  [0.85205282]
    ----
    MEAN:     [0.85231141] + [0.00025859]
    FULL:     [0.85231142]

model  4:     [MLPClassifier]
    fold  0:  [0.66234978]
    fold  1:  [0.87616521]
    ----
    MEAN:     [0.76925750] + [0.10690772]
    FULL:     [0.76925666]



### Building Level 1 models on Level 0 models

In [None]:
# Level 1 model: logistic regression trained on the stacked level 0 predictions
lr = LogisticRegression().fit(S_train, y_train)

# predict_proba returns one column per class; column index 1 is submitted as the conversion score
y_test = lr.predict_proba(S_test)
kaggle_sub = pd.DataFrame({
    'QuoteNumber': quote_num,
    'QuoteConversion_Flag': y_test[:, 1],
})

In [None]:
# Save the submission and download it from Colab.
# index=False keeps the file to the two submission columns; without it to_csv
# also writes the DataFrame index as an extra unnamed first column.
from google.colab import files
kaggle_sub.to_csv('HomeSiteLR_stacked.csv', index=False)
files.download('HomeSiteLR_stacked.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Level 1 model: decision tree trained on the stacked level 0 predictions
dt = DecisionTreeClassifier().fit(S_train, y_train)

# predict_proba returns one column per class; column index 1 is submitted as the conversion score
y_test = dt.predict_proba(S_test)
kaggle_sub = pd.DataFrame({
    'QuoteNumber': quote_num,
    'QuoteConversion_Flag': y_test[:, 1],
})

In [None]:
# Save the submission and download it from Colab.
# index=False keeps the file to the two submission columns; without it to_csv
# also writes the DataFrame index as an extra unnamed first column.
from google.colab import files
kaggle_sub.to_csv('HomeSiteDT_stacked.csv', index=False)
files.download('HomeSiteDT_stacked.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Level 1 model: 50-tree random forest on the stacked level 0 predictions,
# reusing the leaf size, depth and split criterion found by the RF hyperparameter search
tuned_rf_params = {
    param_name: best_params_dict[param_name]
    for param_name in ('min_samples_leaf', 'max_depth', 'criterion')
}
rf = RandomForestClassifier(n_estimators=50, **tuned_rf_params)
rf.fit(S_train, y_train)

# predict_proba returns one column per class; column index 1 is submitted as the conversion score
y_test = rf.predict_proba(S_test)
kaggle_sub = pd.DataFrame({
    'QuoteNumber': quote_num,
    'QuoteConversion_Flag': y_test[:, 1],
})

In [None]:
# Save the submission and download it from Colab.
# index=False keeps the file to the two submission columns; without it to_csv
# also writes the DataFrame index as an extra unnamed first column.
from google.colab import files
kaggle_sub.to_csv('HomeSiteRF_stacked.csv', index=False)
files.download('HomeSiteRF_stacked.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>