### IMPORTING DEPENDENCIES

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

### DATA LOADING AND SPLITTING

In [2]:
train=pd.read_csv('../data/train.csv')
test=pd.read_csv('../data/test.csv')

##### In the cell below the provided train data is being split into train and val data

In [3]:
y = train['Survived']
X = train.drop(columns=['PassengerId', 'Survived', 'Ticket', 'Cabin'])
X_test = test[['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=23)

### FEATURE ENGINEERING

In [4]:
#survived:      Survival            0 = No, 1 = Yes
#PassengerId:   Unique Id
#pclass:        Ticket class        1 = 1st, 2 = 2nd, 3 = 3rd
#sex:           Sex
#Age:           Age in years 
#SibSp:         # of siblings / spouses aboard the Titanic
#parch:         # of parents / children aboard the Titanic
#ticket:        Ticket number 
#fare:          Passenger fare 
#cabin:         Cabin number 
#embarked:      Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton

##### Below two new features ('FamilySize' and 'IsAlone') are made based on two existing features ('SibSp' and 'Parch')

In [5]:
### DETERMINE FAMILIY SIZE

data = [X_train, X_val, X_test]
for dataset in data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch']
    dataset.drop(['SibSp', 'Parch'], axis=1, inplace=True)
    dataset['IsAlone'] = 1
    dataset['IsAlone'].loc[dataset['FamilySize'] > 1] = 0

##### Below a new feature ('Title') is created based on the feature 'Name'

In [6]:
### EXTRACT TITLES FROM Name COLUMN

data = [X_train, X_val, X_test]
for dataset in data:
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    dataset.drop(['Name'], axis=1, inplace=True)
    dataset['Title'].loc[dataset['Title'] == 'Miss'] = 'Mrs'
    rare_titles = dataset.Title.value_counts() < 10
    dataset['Title'] = dataset.Title.apply(lambda x: 'rare' if rare_titles[x] else x)

### PIPELINE TO PERFOM IMPUTATION, ONE-HOT ENCODING AND PRINCIPAL COMPONENT ANALYSIS

##### Below two pipelines are created with different combination of transformers

In [7]:
### PIPELINE FOR CATEGORICAL DATA

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ('pca', PCA(n_components=10))
])

### PIPELINE FOR NUMERICAL DATA

numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler())
])

##### Below two different groups of columns are created

In [8]:
categorical_columns = ['Pclass', 'Sex', 'Embarked', 'IsAlone', 'Title']

numerical_columns = ['Age', 'Fare', 'FamilySize']

### PROCESSING DATA BY PACKING TWO PIPELINES VIA COLUMNTRANSFORMER 

##### Below a specific ColumnTransformer is created, which is combining two different transformer pipelines for two different column groups, respectively.

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_columns),
        ('categorical', categorical_transformer, categorical_columns)
    ])

### FINAL PIPELINE TO INCLUDE THE CLASSIFIER MODEL

##### Below seven different pipelines are created to perform data preprocessing and subsequently, training a specific estimator. A list is created at the end.

In [10]:
### Random Forest:

clf_rfc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100))
])

### Logistic Regression:

clf_lor = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

### Stochastic Gradient Descent (SGD):

clf_sgdc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(max_iter=5, tol=None))
])

### K Nearest Neighbor:

clf_knc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=3))
])

### Gaussian Naive Bayes:

clf_gnb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])

### Perceptron:

clf_per = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', Perceptron(max_iter=5))
])

### Linear Support Vector Machine:

clf_lsvc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC())
])

### Decision Tree:

clf_dtc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier())
])


model_pipelines = [clf_rfc, clf_lor, clf_sgdc, clf_knc, clf_gnb, clf_per, clf_lsvc, clf_dtc]

##### A loop is created to perform the following using each pipline from the model_pipeleines list.
- It trains an estimator
- It calculates accuracies for train and val data
- It calculates cross_val_score using train data 

In [11]:
scores = []
for pipeline in model_pipelines:
    pipeline.fit(X_train, y_train)
    accuracy_train = round(pipeline.score(X_train, y_train), 3)
    accuracy_val = round(pipeline.score(X_val, y_val), 3)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    cv_mean_accuracy_train = round(cv_scores.mean(),3)
    cv_std_accuracy_train = round(cv_scores.std(), 3)
    scores.append((str(pipeline[1]), accuracy_train, accuracy_val, cv_mean_accuracy_train, cv_std_accuracy_train))


model_specific_scores = pd.DataFrame(scores, columns=['Model', 'Accu(train)', 'Accu(val)', 'cv_mean_Accu(train)',
                                                      'cv_score_std'])
model_specific_scores

Unnamed: 0,Model,Accu(train),Accu(val),cv_mean_Accu(train),cv_score_std
0,RandomForestClassifier(),0.986,0.81,0.79,0.035
1,LogisticRegression(),0.839,0.81,0.828,0.022
2,"SGDClassifier(max_iter=5, tol=None)",0.615,0.593,0.74,0.066
3,KNeighborsClassifier(n_neighbors=3),0.883,0.81,0.799,0.023
4,GaussianNB(),0.795,0.806,0.783,0.022
5,Perceptron(max_iter=5),0.73,0.728,0.713,0.09
6,LinearSVC(),0.836,0.81,0.836,0.023
7,DecisionTreeClassifier(),0.986,0.772,0.767,0.021


##### Different dictionaries are created for each pipeline created above with a set of parameters as keys and a list of parameter settings as values. A list of dictionaries is created at the end.

In [12]:
numerical_transformer_dist = {'preprocessor__numerical__imputer__n_neighbors': list(range(2, 15)),
                              'preprocessor__numerical__imputer__add_indicator': [True, False]}
categorical_transformer_dist = {'preprocessor__categorical__imputer__strategy': ['most_frequent', 'constant'],
                                'preprocessor__categorical__imputer__add_indicator': [True, False],
                                'preprocessor__categorical__pca__n_components': list(range(2, 15))}
rfc_dist = {'classifier__bootstrap': [True, False],
            'classifier__max_depth': list(range(2, 20)),
            'classifier__n_estimators': list(range(50, 500))}

lor_dist = {'classifier__C': [100, 10, 1.0, 0.1, 0.01],
            'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'classifier__penalty': ['none', 'l1', 'l2', 'elasticnet']}

sgdc_dist = {"classifier__n_iter_no_change": [1, 5, 10],
             "classifier__alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
             "classifier__penalty": ['l1', 'l2', 'elasticnet']}

knc_dist = {"classifier__leaf_size": list(range(1,50)),
            "classifier__n_neighbors": list(range(1,30)),
            "classifier__p": [1,2]}

gnb_dist = {'classifier__var_smoothing': np.logspace(0,-9, num=100)}

per_dist = {"classifier__penalty": ['l1', 'l2', 'elasticnet'],
            "classifier__alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}

lsvc_dist = {'classifier__C': [0.1,1, 10, 100],
             "classifier__penalty": ['l1', 'l2'],
             'classifier__loss': ['hinge', 'squared_hinge']}

dtc_dist = {'classifier__criterion': ['gini', 'entropy'],
            'classifier__max_depth': [2,4,6,8,10,12]}

model_params = [rfc_dist, lor_dist, sgdc_dist, knc_dist, gnb_dist, per_dist, lsvc_dist, dtc_dist]

##### A loop is created to perform hyper-parameter optimization using RandomizedSearchCV for each model specific pipeline and corresponding parameter dictionary.

- It implements RandomizedSearchCV
- It calculates mean cross-validation score
- It extracts the best parameter settings only based on what were given as input
- obtained best_estimator settings are used to train an estimator again
- Accuracies are calculated while using train and val data
- It calculates cross_val_score using train data (This step is not necessary as the best_score_ already provides mean cross-validated score of the best_estimator)

In [13]:
best_values = []
for i in range(len(model_params)):
    #print(model_params[i])
    '''
    performing randomized search on hyper parameters and noting best score, which is a mean
    cross-validated score of the best_estimator, and best parameters
    '''
    param_dist = {**numerical_transformer_dist, **categorical_transformer_dist, **model_params[i]}
    rscv = RandomizedSearchCV(model_pipelines[i], param_distributions=param_dist, n_iter=100)
    rscv.fit(X_train, y_train)
    #rscv_best_score = round(rscv.best_score_, 3) *** 
    best_parameters = rscv.best_params_
    '''
    training the estimator again with best best parameter setting to get scores for comparing
    '''
    model_best = rscv.best_estimator_
    model_best.fit(X_train, y_train)
    model_new_train_score = round(model_best.score(X_train, y_train), 3)
    model_new_val_score = round(model_best.score(X_val, y_val), 3)
    
    '''
    performing cross-validation again with best estimator setting to get scores for comparing
    '''
    cv_scores_with_best_params = cross_val_score(model_best, X_train, y_train, cv=5, scoring='accuracy')
    cv_best_param_mean_accuracy_train = round(cv_scores_with_best_params.mean(),3)
    cv_best_param_std_accuracy_train = round(cv_scores_with_best_params.std(), 3)
    '''
    making predictions with best_estimator and writing out a .csv file uploading it to kaggle
    '''
    ypred = model_best.predict(X_test)
    output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': ypred})
    output.to_csv("pred_{}.csv".format(str(model_pipelines[i][1]).split('()')[0]), index=False)
    '''
    updating the created empty dictionary 'best_params' with different scores and best parameters for each estimator
    '''
    best_values.append((str(model_pipelines[i][1]), best_parameters, model_new_train_score, 
                       model_new_val_score, cv_best_param_mean_accuracy_train,
                       cv_best_param_std_accuracy_train))

best_values_df = pd.DataFrame(best_values, columns=['Model', 'Best_parameters', 'Accu_new(train)', 'Accu_new(val)',
                                                    'cv_mean_Accu_new(train)', 'cv_score_std_new(val)'])
best_values_df

Unnamed: 0,Model,Best_parameters,Accu_new(train),Accu_new(val),cv_mean_Accu_new(train),cv_score_std_new(val)
0,RandomForestClassifier(),{'preprocessor__numerical__imputer__n_neighbor...,0.888,0.802,0.825,0.026
1,LogisticRegression(),{'preprocessor__numerical__imputer__n_neighbor...,0.849,0.799,0.838,0.022
2,"SGDClassifier(max_iter=5, tol=None)",{'preprocessor__numerical__imputer__n_neighbor...,0.825,0.802,0.823,0.032
3,KNeighborsClassifier(n_neighbors=3),{'preprocessor__numerical__imputer__n_neighbor...,0.823,0.806,0.828,0.011
4,GaussianNB(),{'preprocessor__numerical__imputer__n_neighbor...,0.807,0.799,0.807,0.015
5,Perceptron(max_iter=5),{'preprocessor__numerical__imputer__n_neighbor...,0.669,0.701,0.779,0.047
6,LinearSVC(),{'preprocessor__numerical__imputer__n_neighbor...,0.838,0.81,0.84,0.029
7,DecisionTreeClassifier(),{'preprocessor__numerical__imputer__n_neighbor...,0.865,0.81,0.832,0.03


### COMPARING

In [14]:
Accu_before = model_specific_scores[['Model', 'Accu(train)', 'cv_mean_Accu(train)']]
Accu_after = best_values_df[['Model', 'Accu_new(train)', 'cv_mean_Accu_new(train)']]
compare_accu_df = Accu_before.merge(Accu_after)
compare_accu_df.sort_values(by=['cv_mean_Accu_new(train)'], ascending=False)
#Accu_before
#Accu_after

Unnamed: 0,Model,Accu(train),cv_mean_Accu(train),Accu_new(train),cv_mean_Accu_new(train)
6,LinearSVC(),0.836,0.836,0.838,0.84
1,LogisticRegression(),0.839,0.828,0.849,0.838
7,DecisionTreeClassifier(),0.986,0.767,0.865,0.832
3,KNeighborsClassifier(n_neighbors=3),0.883,0.799,0.823,0.828
0,RandomForestClassifier(),0.986,0.79,0.888,0.825
2,"SGDClassifier(max_iter=5, tol=None)",0.615,0.74,0.825,0.823
4,GaussianNB(),0.795,0.783,0.807,0.807
5,Perceptron(max_iter=5),0.73,0.713,0.669,0.779


### CONCLUSION
- Above table shows that the difference between accuracy and the mean accuracy after cross-validation is larger in many cases campared to that after optimization, which highlights the usefulness of hyper-parameter optimization.