In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline 

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split,cross_val_predict,StratifiedKFold
from sklearn.metrics import classification_report,roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score,RepeatedStratifiedKFold
from sklearn.ensemble import VotingClassifier,RandomForestClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [2]:


train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

In [3]:
train_df.drop(['Holding_Policy_Duration','Holding_Policy_Type'],axis=1,inplace=True)
test_df.drop(['Holding_Policy_Duration','Holding_Policy_Type'],axis=1,inplace=True)

In [4]:
train_df['Accomodation_Type'] = train_df['Accomodation_Type'].map({'Rented' :0, 'Owned':1})
dummy = pd.get_dummies(train_df['Accomodation_Type'], drop_first=True,prefix='Accomodation')
train_df = pd.concat([train_df, dummy] , axis=1)
train_df.drop('Accomodation_Type',axis=1,inplace=True)


train_df['Is_Spouse'] = train_df['Is_Spouse'].map({'No' :0, 'Yes':1})
dummy = pd.get_dummies(train_df['Is_Spouse'], drop_first=True,prefix='Is_Spouse')
train_df = pd.concat([train_df, dummy] , axis=1)
train_df.drop('Is_Spouse',axis=1,inplace=True)

train_df['Reco_Insurance_Type'] = train_df['Reco_Insurance_Type'].map({'No' :0, 'Yes':1})
dummy = pd.get_dummies(train_df['Reco_Insurance_Type'], drop_first=True,prefix='Reco_Insurance_Type')
train_df = pd.concat([train_df, dummy] , axis=1)
train_df.drop('Reco_Insurance_Type',axis=1,inplace=True)

In [5]:
test_df['Accomodation_Type'] = test_df['Accomodation_Type'].map({'Rented' :0, 'Owned':1})
dummy = pd.get_dummies(test_df['Accomodation_Type'], drop_first=True,prefix='Accomodation')
test_df = pd.concat([test_df, dummy] , axis=1)
test_df.drop('Accomodation_Type',axis=1,inplace=True)


test_df['Is_Spouse'] = test_df['Is_Spouse'].map({'No' :0, 'Yes':1})
dummy = pd.get_dummies(test_df['Is_Spouse'], drop_first=True,prefix='Is_Spouse')
test_df = pd.concat([test_df, dummy] , axis=1)
test_df.drop('Is_Spouse',axis=1,inplace=True)

test_df['Reco_Insurance_Type'] = test_df['Reco_Insurance_Type'].map({'No' :0, 'Yes':1})
dummy = pd.get_dummies(test_df['Reco_Insurance_Type'], drop_first=True,prefix='Reco_Insurance_Type')
test_df = pd.concat([test_df, dummy] , axis=1)
test_df.drop('Reco_Insurance_Type',axis=1,inplace=True)

In [6]:
train_df.head(5)

Unnamed: 0,ID,City_Code,Region_Code,Upper_Age,Lower_Age,Health Indicator,Reco_Policy_Cat,Reco_Policy_Premium,Response,Accomodation_1,Is_Spouse_1
0,1,C3,3213,36,36,X1,22,11628.0,0,0,0
1,2,C5,1117,75,22,X2,22,30510.0,0,1,0
2,3,C5,3732,32,32,,19,7450.0,1,1,0
3,4,C24,4378,52,48,X1,19,17780.0,0,1,0
4,5,C8,2190,44,44,X2,16,10404.0,0,0,0


In [7]:
dummy_data_train = train_df.copy()
dummy_data_test = test_df.copy()

In [8]:
col = ['Upper_Age','Lower_Age','Accomodation_1','Reco_Policy_Cat','Reco_Policy_Premium','Is_Spouse_1','Health Indicator']
df = dummy_data_train[col]
test_df = df[df["Health Indicator"].isnull()]
df = df.dropna()

y_train = df["Health Indicator"]
X_train = df.drop("Health Indicator", axis=1)
X_test = test_df.drop("Health Indicator", axis=1)

#train model to fit dataset and predict missing values from column 
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

#replace the missing values with predicted values
dummy_data_train.loc[dummy_data_train['Health Indicator'].isnull(), 'Health Indicator'] = y_pred

In [9]:
col = ['Upper_Age','Lower_Age','Accomodation_1','Reco_Policy_Cat','Reco_Policy_Premium','Is_Spouse_1','Health Indicator']
df = dummy_data_test[col]
test_df = df[df["Health Indicator"].isnull()]
df = df.dropna()

y_train = df["Health Indicator"]
X_train = df.drop("Health Indicator", axis=1)
X_test = test_df.drop("Health Indicator", axis=1)

#train model to fit dataset and predict missing values from column 
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

#replace the missing values with predicted values
dummy_data_test.loc[dummy_data_test['Health Indicator'].isnull(), 'Health Indicator'] = y_pred

In [17]:
dummy_data_test['Health Indicator'].unique()

array(['X1', 'X3', 'X2', 'X5', 'X4', 'X7', 'X6', 'X9', 'X8'], dtype=object)

In [11]:
dummy_data_train.head(5)

Unnamed: 0,ID,City_Code,Region_Code,Upper_Age,Lower_Age,Health Indicator,Reco_Policy_Cat,Reco_Policy_Premium,Response,Accomodation_1,Is_Spouse_1
0,1,C3,3213,36,36,X1,22,11628.0,0,0,0
1,2,C5,1117,75,22,X2,22,30510.0,0,1,0
2,3,C5,3732,32,32,X1,19,7450.0,1,1,0
3,4,C24,4378,52,48,X1,19,17780.0,0,1,0
4,5,C8,2190,44,44,X2,16,10404.0,0,0,0


In [12]:
le = LabelEncoder()
le.fit(dummy_data_train['City_Code'])
dummy_data_train['City_Code'] = le.transform(dummy_data_train['City_Code'])
dummy_data_test['City_Code'] = le.transform(dummy_data_test['City_Code'])

In [13]:
le.fit(dummy_data_train['Health Indicator'])
dummy_data_train['Health Indicator'] = le.transform(dummy_data_train['Health Indicator'])
dummy_data_test['Health Indicator'] = le.transform(dummy_data_test['Health Indicator'])

In [15]:
dummy_data_train.drop('ID',axis=1,inplace=True)

In [18]:
dummy_data_train.to_csv('new_approach/final_data_train.csv')
dummy_data_test.to_csv('new_approach/final_data_test.csv')

In [34]:


X = dummy_data_train.drop(['Response'],axis=1)
y = dummy_data_train.Response


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [50]:
rf = RandomForestClassifier(n_estimators=500, max_depth=10)
rf.fit(X_train, y_train)
y_pred = rf.predict_proba(X_test)

In [51]:
y_pred = y_pred[:, 1]
roc_auc_score(y_test,y_pred)

0.6334028195894441

In [3]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

In [4]:
corr_features = correlation(dummy_data_train, 0.7)
corr_features

{'Lower_Age', 'Reco_Policy_Premium'}

In [11]:
X = dummy_data_train.drop(['Response'],axis=1)
y = dummy_data_train.Response

In [12]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
rf = GradientBoostingClassifier()
rf.fit(X_train, y_train),
y_pred = rf.predict(X_test)
prob = rf.predict_proba(X_test)
prob = prob[:, 1]

roc_auc_score(y_test,prob)

0.6284575788578237

In [13]:
accuracy_score(y_test,y_pred)

0.7546576527002594

In [14]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      1.00      0.86      9605
           1       0.00      0.00      0.00      3116

    accuracy                           0.75     12721
   macro avg       0.38      0.50      0.43     12721
weighted avg       0.57      0.75      0.65     12721



In [15]:
confusion_matrix(y_test, y_pred)

array([[9600,    5],
       [3116,    0]], dtype=int64)

In [54]:
def make_csv(name , ID_col, prob):
    prob = prob[:, 1]
    final_csv = pd.DataFrame(list(zip(ID_col,prob)) ,columns =['ID', 'Response'])
    final_csv.to_csv(name,index=False)

In [56]:
ID_col = dummy_data_test.ID
test_me = dummy_data_test.drop(['ID','Lower_Age'],axis=1)

p = rf.predict_proba(test_me)
make_csv('gboosting.csv',ID_col, p)

## with undersampling using SMOTE

In [13]:
dummy_data_train= pd.read_csv('new_approach/final_data_train.csv')
dummy_data_test = pd.read_csv('new_approach/final_data_test.csv')

In [14]:
X = dummy_data_train.drop(['Response','Lower_Age'],axis=1)
y = dummy_data_train.Response

In [9]:
# steps = [('over', SMOTE()),('under',RandomUnderSampler()) ('model',GradientBoostingClassifier())]
# pipeline = Pipeline(steps=steps)
# # evaluate pipeline
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
# print('Mean ROC AUC: %.3f' % np.mean(scores))

model = RandomForestClassifier()
over = SMOTE()

steps = [('over', over), ('model', model)]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % np.mean(scores))

Mean ROC AUC: 0.584


In [8]:
np.mean(scores)

nan

In [36]:
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
    # define pipeline
    model = RandomForestClassifier()
    over = SMOTE( k_neighbors=k)
    steps = [('over', over), ('model', model)]
    pipeline = Pipeline(steps=steps)
    # evaluate pipeline
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
    score = np.mean(scores)
    print('> k=%d, Mean ROC AUC: %.3f' % (k, score))

> k=1, Mean ROC AUC: 0.590
> k=2, Mean ROC AUC: 0.590
> k=3, Mean ROC AUC: 0.589
> k=4, Mean ROC AUC: 0.590
> k=5, Mean ROC AUC: 0.589
> k=6, Mean ROC AUC: 0.590
> k=7, Mean ROC AUC: 0.590


In [62]:
gb = GradientBoostingClassifier()
params = {
    'learning_rate':[0.1,0.12,0.15,0.2,0.25],
    'n_estimators':[100,200,300,400,500,1000],
    'max_depth' : [1,3,5,10]
    
    
}
grid_gb = GridSearchCV(estimator = gb,
                        param_grid = params,
                        scoring = 'roc_auc', 
                        cv = 5, 
                        verbose = 3,
                        n_jobs = -1)
grid_gb.fit(X, y)
print(grid_gb.best_estimator_), 
print(grid_gb.best_score_)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
GradientBoostingClassifier(learning_rate=0.2, max_depth=1, n_estimators=400)
0.6154613733086665


In [63]:


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
rf = GradientBoostingClassifier(learning_rate=0.2, max_depth=1, n_estimators=400)
rf.fit(X_train, y_train),
y_pred = rf.predict(X_test)
prob = rf.predict_proba(X_test)
prob = prob[:, 1]

roc_auc_score(y_test,prob)

0.6201944891239921

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
models = [('random_forest',RandomForestClassifier()), ('gboost',GradientBoostingClassifier(n_estimators=1000, learning_rate=0.15,max_depth=10))]
vc = VotingClassifier(models , voting='hard')
vc.fit(X_train,y_train)

VotingClassifier(estimators=[('random_forest', RandomForestClassifier()),
                             ('gboost',
                              GradientBoostingClassifier(learning_rate=0.15,
                                                         max_depth=10,
                                                         n_estimators=1000))])

In [16]:
y_pred = vc.predict(X_test)
# prob = vc.predict(X_test)
# prob = prob[:, 1]

roc_auc_score(y_test,y_pred)

AttributeError: predict_proba is not available when voting='hard'

In [17]:
roc_auc_score(y_test,y_pred)

0.5080177438874035