# 1. Import Libraries

In [1]:
import pandas as pd
import pandas.api.types as ptypes
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
import math, os, shutil, datetime
warnings.filterwarnings('ignore')
mpl.rcParams['figure.dpi'] = 500
pd.set_option('display.max_columns', 1000)
%matplotlib inline

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [4]:
from sklearn.metrics import f1_score, auc, plot_confusion_matrix, plot_roc_curve, roc_auc_score

In [5]:
from sklearn.model_selection import cross_val_score

# Load Data

In [6]:
# Tables read from ../Data/Train. In the cleaning section they are joined onto
# train_data on Patient_ID / Health_Camp_ID (attendance tables), on
# Health_Camp_ID (camp details) and on Patient_ID (patient profiles).
train_data = pd.read_csv('../Data/Train/Train.csv')
firstcamp_data = pd.read_csv('../Data/Train/First_Health_Camp_Attended.csv')
secondcamp_data = pd.read_csv('../Data/Train/Second_Health_Camp_Attended.csv')
thirdcamp_data = pd.read_csv('../Data/Train/Third_Health_Camp_Attended.csv')
healthcamp_data = pd.read_csv('../Data/Train/Health_Camp_Detail.csv')
patient_data = pd.read_csv('../Data/Train/Patient_Profile.csv')

# test_data goes through the same joins as train_data; submission_data is only
# loaded here and is not read by any other visible cell.
test_data = pd.read_csv('../Data/test_l0Auv8Q.csv')
submission_data = pd.read_csv('../Data/sample_submmission.csv')

# Data Cleaning

In [7]:
# Left-join every lookup table onto the training registrations. Each entry is
# (right-hand table, join key(s), name of the merge-indicator column), applied
# in order; the indicator records whether the right-hand table had a match.
train_joins = [
    (firstcamp_data.drop('Unnamed: 4', axis=1), ['Patient_ID', 'Health_Camp_ID'], 'camp1_merge_ind'),
    (secondcamp_data, ['Patient_ID', 'Health_Camp_ID'], 'camp2_merge_ind'),
    (thirdcamp_data, ['Patient_ID', 'Health_Camp_ID'], 'camp3_merge_ind'),
    (healthcamp_data, 'Health_Camp_ID', 'healthcamp_merge_ind'),
    (patient_data, 'Patient_ID', 'patient_merge_ind'),
]
train_final_data = train_data
for right_table, join_keys, indicator_name in train_joins:
    train_final_data = pd.merge(train_final_data, right_table, how='left', on=join_keys, indicator=indicator_name)

In [8]:
# Outcome is 1 when the registration matched a row in the first or second
# attendance table, or matched the third attendance table with at least one
# stall visited; otherwise it is 0.
in_camp1 = train_final_data['camp1_merge_ind'] == 'both'
in_camp2 = train_final_data['camp2_merge_ind'] == 'both'
in_camp3 = train_final_data['camp3_merge_ind'] == 'both'
visited_a_stall = train_final_data['Number_of_stall_visited'] > 0
train_final_data['Outcome'] = (in_camp1 | in_camp2 | (in_camp3 & visited_a_stall)).astype('int64')

In [9]:
# Apply the same sequence of left joins to the test registrations so that the
# test frame ends up with the same columns as the training frame.
camp_key = ['Patient_ID', 'Health_Camp_ID']
test_final_data = (
    test_data
    .merge(firstcamp_data.drop('Unnamed: 4', axis=1), how='left', on=camp_key, indicator='camp1_merge_ind')
    .merge(secondcamp_data, how='left', on=camp_key, indicator='camp2_merge_ind')
    .merge(thirdcamp_data, how='left', on=camp_key, indicator='camp3_merge_ind')
    .merge(healthcamp_data, how='left', on='Health_Camp_ID', indicator='healthcamp_merge_ind')
    .merge(patient_data, how='left', on='Patient_ID', indicator='patient_merge_ind')
)

In [10]:
# Columns that to_date() parses from strings into datetimes.
date_cols = [
    'Registration_Date',
    'Camp_Start_Date',
    'Camp_End_Date',
    'First_Interaction',
]

In [11]:
def to_date(df, columns=None):
    """Parse day-month-year string columns of ``df`` into datetimes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.
    columns : iterable of str, optional
        Names of the columns to parse. When omitted, the notebook-level
        ``date_cols`` list is used. Names that are not columns of ``df``
        are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each selected column parsed using the
        ``'%d-%b-%y'`` format (for example ``'22-Aug-05'``).
    """
    if columns is None:
        # Backward-compatible default: fall back to the notebook-level list.
        columns = date_cols
    for col in columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], format='%d-%b-%y')
    return df

In [12]:
# Parse the date columns of both frames (train first, then test).
train_final_data, test_final_data = map(to_date, (train_final_data, test_final_data))

In [13]:
# Columns that must end up numeric before imputation and scaling.
num_cols = [
    'Income',
    'Education_Score',
    'Age',
]

In [14]:
def to_numeric(df, columns):
    """Coerce the listed columns of ``df`` to a numeric dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.
    columns : iterable of str
        Names of the columns to convert. Names that are not columns of
        ``df`` and columns that are already numeric are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. In every converted column, entries that
        cannot be parsed as a number (such as the literal string ``'None'``)
        become NaN.
    """
    # Iterate over the caller-supplied list rather than a notebook global so
    # the function honours its own `columns` argument.
    for col in columns:
        if col in df.columns and not ptypes.is_numeric_dtype(df[col]):
            # errors='coerce' already maps 'None' (and any other non-numeric
            # text) to NaN, so no separate replace step is needed.
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

In [15]:
# Convert the num_cols columns of both frames (train first, then test).
train_final_data, test_final_data = [
    to_numeric(frame, num_cols) for frame in (train_final_data, test_final_data)
]

## Drop unnecessary columns

In [16]:
# Merge-indicator columns created by the joins; dropped in the next cell.
unnec_cols = [
    'camp1_merge_ind',
    'camp2_merge_ind',
    'camp3_merge_ind',
    'healthcamp_merge_ind',
    'patient_merge_ind',
]

In [17]:
# Remove the merge-indicator columns from both frames.
train_final_data = train_final_data.drop(columns=unnec_cols)
test_final_data = test_final_data.drop(columns=unnec_cols)

## Missing Value Imputation

In [18]:
from sklearn.impute import SimpleImputer

### Mean Imputation

In [19]:
# Fill missing Age values with the mean learned from the training frame only;
# the same fitted imputer is then applied to the test frame.
mean_impute_cols = ['Age']
imp_mean = SimpleImputer(strategy='mean')
train_final_data[mean_impute_cols] = imp_mean.fit_transform(train_final_data[mean_impute_cols])
test_final_data[mean_impute_cols] = imp_mean.transform(test_final_data[mean_impute_cols])

### Most-Frequent Imputation

In [20]:
# Fill missing values with the most frequent training value of each column,
# then apply the fitted imputer to train and test in turn.
freq_impute_cols = ['Income', 'Education_Score', 'City_Type', 'Employer_Category']
imp_freq = SimpleImputer(strategy='most_frequent')
imp_freq.fit(train_final_data[freq_impute_cols])
for frame in (train_final_data, test_final_data):
    frame[freq_impute_cols] = imp_freq.transform(frame[freq_impute_cols])

### Zero Imputation

In [21]:
# Replace missing values in these columns with 0 in both frames.
zero_impute_cols = [
    'Donation',
    'Health_Score',
    'Health Score',
    'Number_of_stall_visited',
    'Last_Stall_Visited_Number',
]
for frame in (train_final_data, test_final_data):
    frame[zero_impute_cols] = frame[zero_impute_cols].fillna(0)

### Missing Date Imputation

In [22]:
def date_impute(df):
    """Fill missing registration dates with the midpoint of the camp window.

    The midpoint is ``Camp_Start_Date`` plus half of the span between
    ``Camp_Start_Date`` and ``Camp_End_Date``. ``df`` is modified in place
    and returned.
    """
    half_span = (df['Camp_End_Date'] - df['Camp_Start_Date']) / 2
    filled = df['Registration_Date'].fillna(df['Camp_Start_Date'] + half_span)
    df['Registration_Date'] = pd.to_datetime(filled, format='%Y-%m-%d')
    return df

In [23]:
# Impute missing registration dates in both frames (train first, then test).
train_final_data, test_final_data = (
    date_impute(train_final_data),
    date_impute(test_final_data),
)

# Feature Engineering

## 1. Duration of camp

In [25]:
# Camp length in whole days (end date minus start date), for both frames.
for frame in (train_final_data, test_final_data):
    frame['Camp Duration'] = (frame['Camp_End_Date'] - frame['Camp_Start_Date']).dt.days

## 2. Registered before/after start of camp

In [26]:
# Days from registration to camp start: positive when the patient registered
# before the camp started, negative when they registered after it started.
train_final_data['reg_start_diff'] = train_final_data['Camp_Start_Date'].sub(train_final_data['Registration_Date']).dt.days
test_final_data['reg_start_diff'] = test_final_data['Camp_Start_Date'].sub(test_final_data['Registration_Date']).dt.days

## 3. Is this the first camp?

In [45]:
# Exploratory copy of the training frame ordered by patient, then by
# registration date. NOTE(review): `temp` is only displayed in the cells
# below; no feature is derived from it in the visible notebook.
temp = train_final_data.sort_values(['Patient_ID','Registration_Date'])

In [46]:
# Display the sorted frame shifted down one row, so each row shows the values
# of the row that precedes it in the sort order (first row becomes all-missing).
temp.shift(1)

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Donation,Health_Score,Health Score,Number_of_stall_visited,Last_Stall_Visited_Number,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category,Outcome,Camp Duration,reg_start_diff
69348,,,NaT,,,,,,,,,,,NaT,NaT,,,,,,,,,,,NaT,,,,,
64479,485679.0,6578.0,2005-08-22,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,4.0,4.0,2005-08-16,2005-10-14,Third,G,2.0,0.0,0.0,0.0,0.0,0.0,82.0,47.906068,2005-08-12,I,Technology,1.0,59.0,-6.0
6484,485679.0,6555.0,2005-08-31,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,2005-09-15,2005-09-19,Second,A,2.0,0.0,0.0,0.0,0.0,0.0,82.0,47.906068,2005-08-12,I,Technology,0.0,4.0,15.0
18999,485680.0,6543.0,2006-07-10,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,2005-09-27,2007-11-07,First,F,2.0,0.0,0.0,0.0,0.0,0.0,82.0,47.906068,2006-07-10,A,Technology,0.0,771.0,-286.0
2604,485681.0,6580.0,2004-12-20,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,2004-12-22,2005-01-06,First,E,2.0,0.0,0.0,0.0,1.0,0.0,82.0,46.000000,2004-12-19,G,Technology,0.0,15.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18124,528656.0,6543.0,2006-04-18,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,2005-09-27,2007-11-07,First,F,2.0,0.0,0.0,0.0,0.0,0.0,82.0,47.906068,2005-10-12,D,Technology,0.0,771.0,-203.0
32744,528657.0,6531.0,2004-12-11,0.0,0.0,0.0,0.0,0.0,20.0,0.670886,0.000000,0.0,0.0,2004-12-09,2004-12-14,First,C,2.0,0.0,0.0,0.0,0.0,0.0,82.0,47.906068,2004-10-25,D,Technology,1.0,5.0,-2.0
7632,528657.0,6580.0,2004-12-18,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,2004-12-22,2005-01-06,First,E,2.0,0.0,0.0,0.0,0.0,0.0,82.0,47.906068,2004-10-25,D,Technology,0.0,15.0,4.0
24471,528657.0,6526.0,2004-12-30,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,2005-01-03,2005-02-20,First,E,2.0,0.0,0.0,0.0,0.0,0.0,82.0,47.906068,2004-10-25,D,Technology,0.0,48.0,4.0


In [47]:
# Display the sorted frame for comparison with the shifted view.
temp

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Donation,Health_Score,Health Score,Number_of_stall_visited,Last_Stall_Visited_Number,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category,Outcome,Camp Duration,reg_start_diff
69348,485679,6578,2005-08-22,0,0,0,0,0,0.0,0.000000,0.000000,4.0,4.0,2005-08-16,2005-10-14,Third,G,2,0,0,0,0,0.0,82.0,47.906068,2005-08-12,I,Technology,1,59,-6
64479,485679,6555,2005-08-31,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2005-09-15,2005-09-19,Second,A,2,0,0,0,0,0.0,82.0,47.906068,2005-08-12,I,Technology,0,4,15
6484,485680,6543,2006-07-10,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2005-09-27,2007-11-07,First,F,2,0,0,0,0,0.0,82.0,47.906068,2006-07-10,A,Technology,0,771,-286
18999,485681,6580,2004-12-20,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2004-12-22,2005-01-06,First,E,2,0,0,0,1,0.0,82.0,46.000000,2004-12-19,G,Technology,0,15,2
2604,485681,6526,2005-01-01,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2005-01-03,2005-02-20,First,E,2,0,0,0,1,0.0,82.0,46.000000,2004-12-19,G,Technology,0,48,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18124,528657,6531,2004-12-11,0,0,0,0,0,20.0,0.670886,0.000000,0.0,0.0,2004-12-09,2004-12-14,First,C,2,0,0,0,0,0.0,82.0,47.906068,2004-10-25,D,Technology,1,5,-2
32744,528657,6580,2004-12-18,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2004-12-22,2005-01-06,First,E,2,0,0,0,0,0.0,82.0,47.906068,2004-10-25,D,Technology,0,15,4
7632,528657,6526,2004-12-30,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2005-01-03,2005-02-20,First,E,2,0,0,0,0,0.0,82.0,47.906068,2004-10-25,D,Technology,0,48,4
24471,528657,6536,2005-02-13,0,0,0,0,0,0.0,0.000000,0.102063,0.0,0.0,2005-02-15,2005-02-18,Second,D,2,0,0,0,0,0.0,82.0,47.906068,2004-10-25,D,Technology,1,3,2


In [41]:
# One row per patient: after sorting by patient and registration date,
# drop_duplicates keeps the first (earliest-dated) row for each Patient_ID.
# NOTE(review): this frame is not used by any later visible cell.
first_registration = train_final_data[['Patient_ID','Registration_Date']].sort_values(['Patient_ID','Registration_Date']).drop_duplicates('Patient_ID')

In [42]:
# Display the earliest registration date per patient.
first_registration

Unnamed: 0,Patient_ID,Registration_Date
69348,485679,2005-08-22
6484,485680,2006-07-10
18999,485681,2004-12-20
60515,485682,2006-04-01
10868,485684,2005-05-23
...,...,...
24954,528651,2006-04-23
34692,528653,2005-02-14
49193,528655,2004-12-25
4130,528656,2005-10-12


In [38]:
# Display every (patient, registration date) pair in sorted order.
train_final_data[['Patient_ID','Registration_Date']].sort_values(['Patient_ID','Registration_Date'])

Unnamed: 0,Patient_ID,Registration_Date
69348,485679,2005-08-22
64479,485679,2005-08-31
6484,485680,2006-07-10
18999,485681,2004-12-20
2604,485681,2005-01-01
...,...,...
18124,528657,2004-12-11
32744,528657,2004-12-18
7632,528657,2004-12-30
24471,528657,2005-02-13


In [33]:
# Display the full training frame ordered by patient and registration date.
train_final_data.sort_values(['Patient_ID','Registration_Date'])

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5,Donation,Health_Score,Health Score,Number_of_stall_visited,Last_Stall_Visited_Number,Camp_Start_Date,Camp_End_Date,Category1,Category2,Category3,Online_Follower,LinkedIn_Shared,Twitter_Shared,Facebook_Shared,Income,Education_Score,Age,First_Interaction,City_Type,Employer_Category,Outcome,Camp Duration,reg_start_diff
69348,485679,6578,2005-08-22,0,0,0,0,0,0.0,0.000000,0.000000,4.0,4.0,2005-08-16,2005-10-14,Third,G,2,0,0,0,0,0.0,82.0,47.906068,2005-08-12,I,Technology,1,59,-6
64479,485679,6555,2005-08-31,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2005-09-15,2005-09-19,Second,A,2,0,0,0,0,0.0,82.0,47.906068,2005-08-12,I,Technology,0,4,15
6484,485680,6543,2006-07-10,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2005-09-27,2007-11-07,First,F,2,0,0,0,0,0.0,82.0,47.906068,2006-07-10,A,Technology,0,771,-286
18999,485681,6580,2004-12-20,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2004-12-22,2005-01-06,First,E,2,0,0,0,1,0.0,82.0,46.000000,2004-12-19,G,Technology,0,15,2
2604,485681,6526,2005-01-01,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2005-01-03,2005-02-20,First,E,2,0,0,0,1,0.0,82.0,46.000000,2004-12-19,G,Technology,0,48,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18124,528657,6531,2004-12-11,0,0,0,0,0,20.0,0.670886,0.000000,0.0,0.0,2004-12-09,2004-12-14,First,C,2,0,0,0,0,0.0,82.0,47.906068,2004-10-25,D,Technology,1,5,-2
32744,528657,6580,2004-12-18,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2004-12-22,2005-01-06,First,E,2,0,0,0,0,0.0,82.0,47.906068,2004-10-25,D,Technology,0,15,4
7632,528657,6526,2004-12-30,0,0,0,0,0,0.0,0.000000,0.000000,0.0,0.0,2005-01-03,2005-02-20,First,E,2,0,0,0,0,0.0,82.0,47.906068,2004-10-25,D,Technology,0,48,4
24471,528657,6536,2005-02-13,0,0,0,0,0,0.0,0.000000,0.102063,0.0,0.0,2005-02-15,2005-02-18,Second,D,2,0,0,0,0,0.0,82.0,47.906068,2004-10-25,D,Technology,1,3,2


## Constants

In [None]:
# Identifier and target column names.
ID1, ID2 = 'Patient_ID', 'Health_Camp_ID'
target = 'Outcome'

# Datetime columns (excluded from scaling).
date_columns = [
    'Registration_Date',
    'Camp_Start_Date',
    'Camp_End_Date',
    'First_Interaction',
]

# Columns treated as discrete (excluded from scaling).
discrete_columns = [
    'Var1', 'Var2', 'Var3', 'Var4', 'Var5',
    'Category1', 'Category2', 'Category3',
    'Online_Follower', 'LinkedIn_Shared', 'Twitter_Shared', 'Facebook_Shared',
    'City_Type', 'Employer_Category',
]

In [None]:
# Display the column names of the training frame at this point.
train_final_data.columns

In [None]:
# Seed passed to train_test_split and to classifiers in the Base Models section.
random_state = 1234

In [None]:
# Pipeline switches. should_scale gates the scaling cell below.
# NOTE(review): should_ohe is not read by any visible cell — the one-hot
# encoding cells run unconditionally.
should_ohe = True
should_scale = True

## Scaling

In [None]:
if should_scale:
    # Column-name suffix -> scaler class. A fresh scaler is created for every
    # column, fitted on the training frame and then applied to the test frame.
    scaler_by_suffix = {
        "MMS": MinMaxScaler,
        "SS": StandardScaler,
        "RS": RobustScaler,
        "PT": PowerTransformer,
    }
    not_scaled = {target, ID1, ID2, *date_columns, *discrete_columns}
    # Snapshot the column list up front: scaled copies are appended to the
    # frames inside the loop and must not be scaled again.
    cols_to_scale = [name for name in train_final_data.columns if name not in not_scaled]

    for col in cols_to_scale:
        for suffix, scaler_cls in scaler_by_suffix.items():
            scaler = scaler_cls()
            train_final_data[f"{col}_{suffix}"] = scaler.fit_transform(train_final_data[[col]])
            test_final_data[f"{col}_{suffix}"] = scaler.transform(test_final_data[[col]])
        # A log transform (FunctionTransformer(np.log), suffix "_FT_log") is
        # intentionally left disabled.

## One hot encoding

In [None]:
# Tag each frame with its origin. NOTE(review): no visible cell reads this
# flag, and it is not in the ignore lists used to build X / X_test, so it
# appears to stay in the feature matrix as a constant column — confirm this
# is intended.
train_final_data['is_train'] = True
test_final_data['is_train'] = False

In [None]:
# Categorical columns replaced by one-hot indicator columns.
cols_for_ohe = [
    'Category1',
    'Category2',
    'City_Type',
    'Employer_Category',
]

In [None]:
# One-hot encode the categorical columns of the training frame.
train_dummies = pd.get_dummies(train_final_data[cols_for_ohe])
# Encode the test frame separately, then align it to the training dummy
# columns: a category seen only in test is dropped and a category missing from
# test becomes an all-zero column. Without this, the two frames can end up
# with different feature columns and a model fitted on train cannot score test.
test_dummies = pd.get_dummies(test_final_data[cols_for_ohe]).reindex(columns=train_dummies.columns, fill_value=0)

train_final_data = pd.concat([train_final_data.drop(cols_for_ohe, axis=1), train_dummies], axis=1)
test_final_data = pd.concat([test_final_data.drop(cols_for_ohe, axis=1), test_dummies], axis=1)

# Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Columns contributed by the three attendance-table joins. `Outcome` is
# derived from whether those joins matched (and from Number_of_stall_visited
# being positive), so these columns — and the scaled copies made from them —
# reveal the target and must not be used as features.
leak_base_cols = ['Donation', 'Health_Score', 'Health Score', 'Number_of_stall_visited', 'Last_Stall_Visited_Number']
scaled_suffixes = ['_MMS', '_SS', '_RS', '_PT']
leak_names = set(leak_base_cols) | {f"{base}{suffix}" for base in leak_base_cols for suffix in scaled_suffixes}

# Identifiers, the target and raw datetime columns are never features.
ignore_cols_train = [ID1, ID2, target, 'Registration_Date', 'Camp_Start_Date', 'Camp_End_Date', 'First_Interaction']
ignore_cols_test = [ID1, ID2, 'Registration_Date', 'Camp_Start_Date', 'Camp_End_Date', 'First_Interaction']
# Only list leak columns that actually exist in each frame, so the drop below
# works whether or not the scaling cell was run.
ignore_cols_train += [name for name in train_final_data.columns if name in leak_names]
ignore_cols_test += [name for name in test_final_data.columns if name in leak_names]

X, y = train_final_data.drop(ignore_cols_train, axis=1), train_final_data[target]
X_test = test_final_data.drop(ignore_cols_test, axis=1)

In [None]:
# Total number of missing values across all columns of the test features.
sum(X_test.isna().sum())

In [None]:
# Hold out 30% of the training rows as a validation set; the split is seeded
# with random_state.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=random_state)

# Base Models

In [None]:
# Number of trees / boosting stages shared by the ensemble models.
# (The variable name keeps its original spelling so other cells that read it
# continue to work.)
n_esitmators = 1000

# Candidate models keyed by the short name used in logs and plot file names.
# Every model receives the same seed so repeated runs are comparable.
classifiers = {
    "DT": DecisionTreeClassifier(random_state=random_state),
    "RF": RandomForestClassifier(n_estimators=n_esitmators, random_state=random_state),
    "GBM": GradientBoostingClassifier(n_estimators=n_esitmators, random_state=random_state),
    # Same as GBM but with early stopping: 20% of the fit data is held out and
    # boosting stops after 5 iterations without an improvement of at least tol.
    "GBM_ES": GradientBoostingClassifier(n_estimators=n_esitmators, validation_fraction=0.2,
                                         n_iter_no_change=5, tol=0.01, random_state=random_state)
}

In [None]:
# Display the (name, estimator) pairs that the training loop iterates over.
classifiers.items()

In [None]:
# The output directory is the same for every model, so create it once.
os.makedirs("../plots/confusion_matrix_scale_ohe", exist_ok=True)

for clf_ct, (model_name, clf) in enumerate(classifiers.items(), start=1):
    print(f"{clf_ct} Building {model_name} starts..")
    start_ts = datetime.datetime.now()

    # 5-fold cross-validated ROC AUC on the training split.
    cv_scores = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5)
    print(f"\t Mean ROC AUC: {np.mean(cv_scores)} +/- {np.std(cv_scores)}")

    # Refit on the whole training split and score the held-out validation set.
    clf.fit(X_train, y_train)
    # ROC AUC measures ranking quality, so it must be computed from the
    # positive-class probabilities; hard 0/1 labels collapse the curve to a
    # single operating point and make this number incomparable with the
    # cross-validated 'roc_auc' above.
    predictions = clf.predict_proba(X_val)[:, 1]
    auc_score = roc_auc_score(y_val, predictions)
    print(f"\t ROC AUC: {auc_score}")

    # Row-normalised confusion matrix on the validation set, saved per model.
    fig = plt.figure(figsize=(12,7))
    ax = fig.add_subplot(111)
    disp = plot_confusion_matrix(clf, X_val, y_val, normalize='true', ax=ax, cmap=plt.cm.Blues)
    disp.ax_.set_title(f"Confustion Matrix for model: {model_name}")
    plt.savefig(f"../plots/confusion_matrix_scale_ohe/{model_name}.png", dpi=300)
    end_ts = datetime.datetime.now()

    print(f"It took {end_ts - start_ts} time to finish the modelling")
    print(f"{clf_ct} Building {model_name} ends..")

    

In [None]:
# Total number of missing values across all columns of the training frame.
sum(train_final_data.isna().sum())

In [None]:
# Pair every feature name in X with the importance reported by the GBM model.
# Requires the model loop above to have run so that classifiers['GBM'] is fitted.
var_imp = pd.DataFrame({'var':X.columns, 'var_imp':classifiers['GBM'].feature_importances_})

In [None]:
# Display the 30 features with the highest GBM importance.
var_imp.sort_values('var_imp', ascending=False).head(30)

In [None]:
# Display the names of the feature columns used for modelling.
X.columns