In [1]:
import pandas as pd
import os
import urllib.request
import numpy as np

In [2]:
TITANIC_PATH = os.path.join("datasets", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename in ("train.csv", "test.csv"):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            print("Downloading", filename)
            urllib.request.urlretrieve(url + filename, filepath)

fetch_titanic_data()   

In [3]:
#load the csv file that is extracted by the help of above function
def load_data(data_path, data_name):
    csv_path = os.path.join(data_path, data_name)
    return pd.read_csv(csv_path)

#get path of this notebook and join /Downloads to it
path = os.getcwd()

#get y_train from X_train
X_train = load_data(path, 'train.csv')
y_train = X_train['Survived']
#X_train = X_train.drop('Survived', axis=1) do it later

#We don't have given y_test, instead will submit our predictions on Kaggle 
X_test = load_data(path, 'test.csv')

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import SVC

class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator = SVC()):
        """
        A Custom BaseEstimator that can switch between classifiers.
        :param estimator: sklearn object - The classifier
        """ 
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

In [5]:
#Data Imputation(1) -Embarked, -Age, -Cabin, -Fare 
from copy import copy

class Data_Imputer(BaseEstimator, TransformerMixin):
    
    def __init__(self, embarked = True, age = True, fare = True, mean = True): # no *args or **kargs
        self.embarked = embarked
        self.age = age
        self.fare = fare
        self.mean = mean
        
    def fit(self, X_df, y=None):
        return self  # nothing else to do
    
    def transform(self, X): #takes and outputs dataframe
        X_df = copy(X)
        X_df = X_df.reset_index(drop = True) #crucial since CV takes random data chunks

        #get col indexes of the columns below
        cols = [ 'Age', 'Sex', 'Pclass', 'Fare'] 
        age_ix, sex_ix, pclass_ix, fare_ix = [X_df.columns.get_loc(c) for c in cols] 
        
        #Impute Embarked
        X_df['Embarked'] = X_df['Embarked'].mask(X_df['Embarked'].isnull(), 'S')

        
        #Impute Age
        if self.age:
            if self.mean:
                group_metric = X_df.groupby(['Pclass', 'Sex']).mean()['Age'].round() #calculate group means
            else:
                group_metric = X_df.groupby(['Pclass', 'Sex']).median()['Age'].round() # median
            
            impute_index = X_df[X_df['Age'].isnull()].index #list of indexes
            
            if len(impute_index) != 0:
                for row_ix in impute_index:
                
                    if (X_df.iloc[row_ix, pclass_ix] == 1) & (X_df.iloc[row_ix, sex_ix] == 'female'):
                        X_df.iloc[row_ix, age_ix] = group_metric[1][0]
                    
                    if (X_df.iloc[row_ix, pclass_ix] == 1) & (X_df.iloc[row_ix, sex_ix] == 'male'):
                        X_df.iloc[row_ix, age_ix] = group_metric[1][1]
                    
                    if (X_df.iloc[row_ix, pclass_ix] == 2) & (X_df.iloc[row_ix, sex_ix] == 'female'):
                        X_df.iloc[row_ix, age_ix] = group_metric[2][0]
                    
                    if (X_df.iloc[row_ix, pclass_ix] == 2) & (X_df.iloc[row_ix, sex_ix] == 'male'):
                        X_df.iloc[row_ix, age_ix] = group_metric[2][1]
                    
                    if (X_df.iloc[row_ix, pclass_ix] == 3) & (X_df.iloc[row_ix, sex_ix] == 'female'):
                        X_df.iloc[row_ix, age_ix] = group_metric[3][0]
                    
                    if (X_df.iloc[row_ix, pclass_ix] == 3) & (X_df.iloc[row_ix, sex_ix] == 'male'):
                        X_df.iloc[row_ix, age_ix] = group_metric[3][1]
            
            else:
                pass

        
        #Impute Fare
        if self.fare:
            if self.mean:
                group_metric = X_df.groupby(['Pclass']).mean()['Fare'].round() #calculate group means
                
            else:
                group_metric = X_df.groupby(['Pclass']).median()['Fare'].round() # median
            
            impute_index = X_df[X_df['Fare'].isnull()].index #list of indexes
        
        
            for row_ix in impute_index:
                
                if (X_df.iloc[row_ix, pclass_ix] == 1):
                    X_df.iloc[row_ix, fare_ix] = group_metric[1]
                    
                if (X_df.iloc[row_ix, pclass_ix] == 2):
                    X_df.iloc[row_ix, fare_ix] = group_metric[2]
                    
                if (X_df.iloc[row_ix, pclass_ix] == 3):
                    X_df.iloc[row_ix, fare_ix] = group_metric[3]

        
        return X_df #DataFrame

In [6]:
#Feature Engineering (2)
class Feature_Engineer(BaseEstimator, TransformerMixin):
    
    def __init__(self, familysize = True, isalone = True, title = True): # no *args or **kargs
        self.familysize = familysize
        self.isalone = isalone
        self.title = title
        
    def fit(self, X, y=None):
        return self  # nothing else to do
    
    def transform(self, X):
        X_df = copy(X)
        
        X_df['Cabin'] = X_df['Cabin'].mask(X_df['Cabin'].isnull(), 0)
        X_df['Cabin'] = X_df['Cabin'].mask(X_df['Cabin']!=0, 1)
        
            
        if self.familysize:
            
            X_df['Famsize'] = X_df['SibSp'] + X_df['Parch'] + 1
            X_df['Famsize'] = X_df['Famsize'].astype(object)
            
        if self.isalone:
            
            X_df['Famsize'] = X_df['SibSp'] + X_df['Parch'] + 1
            X_df['Is_Alone'] = X_df['Famsize'].mask(X_df['Famsize']==1, 1)
            X_df['Is_Alone'] = X_df['Famsize'].mask(X_df['Famsize']!=1, 0)
            X_df['Is_Alone'] = X_df['Is_Alone'].astype(object)
        
        if self.title:
            
            X_df['Title'] = X_df['Name'].str.extract('([A-Za-z]+)\.')
            X_df['Title'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
                         ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'], inplace=True)
            

        #Change data type # crucial for features to have correct dtype for the successing ColumnTransformer
        X_df['Pclass'] = X_df['Pclass'].astype(object)
        X_df['SibSp'] = X_df['SibSp'].astype(object)
        X_df['Parch'] = X_df['Parch'].astype(object)
        
        #delete
        X_df.drop(['PassengerId'], axis=1, inplace=True) #delete passangerId
        X_df.drop(['Ticket'], axis=1, inplace=True) #delete Ticket (be sure that passangerID comes before Ticket (+1))
        X_df.drop(['Name'], axis=1, inplace=True)  #delete Name
        
        try:
            X_df.drop(['Survived'], axis=1, inplace=True)
            
        except:
            pass
        
        return X_df #DataFrame

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


#preprocessing pipeline 
pp_pipeline = Pipeline([
    ('Data_Imputer', Data_Imputer()),
    ('Feature_Engineer', Feature_Engineer())
       ])

#transformer pipeline
tr_pipeline = ColumnTransformer([
        ("Scaler", StandardScaler(),make_column_selector(dtype_include="float")),
        ("Hot_cat", OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include="object")),
        
    ], remainder='drop')


#combine previous two piplines + estimator
full_pipeline = Pipeline([
        ("pp_pipeline", pp_pipeline),
        ("tr_pipeline", tr_pipeline),
        ("clf", ClfSwitcher()),
    ])

#GridSearch CrossValidation:
parameters = [
    {
        'clf__estimator': [SVC()],
        'clf__estimator__gamma': ['scale', 'auto'],
        'clf__estimator__C': [10, 30, 50],
        'pp_pipeline__Feature_Engineer__isalone':[True, False],
        'pp_pipeline__Feature_Engineer__familysize': [True, False],
        'pp_pipeline__Data_Imputer__mean': [True, False],
        
        
    },
    
    {
        'clf__estimator': [RandomForestClassifier()],
        'clf__estimator__n_estimators': [250],
        'clf__estimator__criterion': ['gini', 'entropy'],
        'clf__estimator__max_depth': [3, 4, 5],
        'pp_pipeline__Feature_Engineer__isalone':[True, False],
        'pp_pipeline__Feature_Engineer__familysize': [True, False],
        'pp_pipeline__Data_Imputer__mean': [True, False],
    },
]

gscv = GridSearchCV(full_pipeline, param_grid=parameters, cv=2, verbose=3)
gscv.fit(X_train, y_train)

Fitting 2 folds for each of 96 candidates, totalling 192 fits
[CV 1/2] END clf__estimator=SVC(), clf__estimator__C=10, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=True;, score=0.791 total time=   0.3s
[CV 2/2] END clf__estimator=SVC(), clf__estimator__C=10, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=True;, score=0.793 total time=   0.4s
[CV 1/2] END clf__estimator=SVC(), clf__estimator__C=10, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=False;, score=0.791 total time=   0.3s
[CV 2/2] END clf__estimator=SVC(), clf__estimator__C=10, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Featu

[CV 2/2] END clf__estimator=SVC(), clf__estimator__C=30, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=True;, score=0.789 total time=   0.4s
[CV 1/2] END clf__estimator=SVC(), clf__estimator__C=30, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=False;, score=0.780 total time=   0.3s
[CV 2/2] END clf__estimator=SVC(), clf__estimator__C=30, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=False;, score=0.784 total time=   0.3s
[CV 1/2] END clf__estimator=SVC(), clf__estimator__C=30, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=False, pp_pipeline__Feature_Engineer__isalone=True;, score=0.791 total time=   0.3s
[

[CV 1/2] END clf__estimator=SVC(), clf__estimator__C=50, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=False;, score=0.771 total time=   0.3s
[CV 2/2] END clf__estimator=SVC(), clf__estimator__C=50, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=False;, score=0.787 total time=   0.3s
[CV 1/2] END clf__estimator=SVC(), clf__estimator__C=50, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=False, pp_pipeline__Feature_Engineer__isalone=True;, score=0.771 total time=   0.3s
[CV 2/2] END clf__estimator=SVC(), clf__estimator__C=50, clf__estimator__gamma=scale, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=False, pp_pipeline__Feature_Engineer__isalone=True;, score=0.784 total time=   0.3s


[CV 2/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=gini, clf__estimator__max_depth=3, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=False;, score=0.787 total time=   0.8s
[CV 1/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=gini, clf__estimator__max_depth=3, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=False, pp_pipeline__Feature_Engineer__isalone=True;, score=0.783 total time=   0.7s
[CV 2/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=gini, clf__estimator__max_depth=3, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=False, pp_pipeline__Feature_Engineer__isalone=True;, score=0.778 total time=   0.8s
[CV 1/2] END clf__estimator=RandomForestClassifier(), clf__es

[CV 1/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=gini, clf__estimator__max_depth=4, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=False, pp_pipeline__Feature_Engineer__familysize=False, pp_pipeline__Feature_Engineer__isalone=False;, score=0.794 total time=   0.8s
[CV 2/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=gini, clf__estimator__max_depth=4, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=False, pp_pipeline__Feature_Engineer__familysize=False, pp_pipeline__Feature_Engineer__isalone=False;, score=0.804 total time=   0.8s
[CV 1/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=gini, clf__estimator__max_depth=5, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=True;, score=0.771 total time=   0.8s
[CV 2/2] END clf__estimator=RandomForestClassifier(), clf_

[CV 2/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=entropy, clf__estimator__max_depth=3, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=False, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=True;, score=0.780 total time=   0.9s
[CV 1/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=entropy, clf__estimator__max_depth=3, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=False, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=False;, score=0.816 total time=   0.9s
[CV 2/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=entropy, clf__estimator__max_depth=3, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=False, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=False;, score=0.780 total time=   0.9s
[CV 1/2] END clf__estimator=RandomForestClassifier

[CV 2/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=entropy, clf__estimator__max_depth=5, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=True, pp_pipeline__Feature_Engineer__isalone=False;, score=0.818 total time=   0.8s
[CV 1/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=entropy, clf__estimator__max_depth=5, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=False, pp_pipeline__Feature_Engineer__isalone=True;, score=0.783 total time=   0.8s
[CV 2/2] END clf__estimator=RandomForestClassifier(), clf__estimator__criterion=entropy, clf__estimator__max_depth=5, clf__estimator__n_estimators=250, pp_pipeline__Data_Imputer__mean=True, pp_pipeline__Feature_Engineer__familysize=False, pp_pipeline__Feature_Engineer__isalone=True;, score=0.804 total time=   0.7s
[CV 1/2] END clf__estimator=RandomForestClassifier()

In [15]:
best = gscv.best_estimator_.fit(X_train, y_train)
pred = best.predict(X_test)

d = np.c_[X_test.PassengerId.values,pred]
surv = pd.DataFrame(data = d, columns = ['PassengerId', 'Survived'])
surv.to_csv(path, index =False)

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,