In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from scipy.stats.distributions import randint, uniform
from scipy.stats import mode
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('../train.csv')

In [3]:
test = pd.read_csv('../test.csv')

In [4]:
X_train = df.drop('Survived', axis=1)
y_train = df['Survived'].ravel()

In [5]:
def write_test(df, mod, n=1):
    dft = pd.DataFrame(np.c_[df['PassengerId'],mod.predict(df)], columns=['PassengerId', 'Survived'])
    dft.to_csv('Predictions_Teste{}.csv'.format(n), index=False)

In [6]:
class Impute_Better(BaseEstimator, TransformerMixin):
    """ Impute missing data in a more effective way, groupwise by Sex and Pclass"""
    def __init__(self, col='Age', func=np.mean):
        self.col = col
        self.func = func
    def fit(self, X, y=None):
        self.vals = X.groupby(by=['Sex', 'Pclass'])[self.col].agg(self.func)
        return self
    def transform(self, X, y=None):
        Xt = X.copy()
        for i in range(len(self.vals)):
            Xt.loc[((X['Sex'] == self.vals.index[i][0]) & (X['Pclass'] == self.vals.index[i][1])),self.col] = self.vals[i]
        return Xt

In [7]:
class New_Features(BaseEstimator, TransformerMixin):
    """ Add new features, in this case Parch + SibSp """
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        Xt = X.copy()
        Xt['Nrelatives'] = Xt['Parch'] + Xt['SibSp']
        return Xt

In [8]:
class Transf_Conti(BaseEstimator, TransformerMixin):
    """ Transforms a feature in categorical or startardizes it"""
    def __init__(self, n=4, q=False, conti=False):
        self.n = n
        self.q = q
        self.conti=conti
    def fit(self, X, y=None):
        if not self.conti:
            if not self.q:
                self.binst = pd.cut(X.values[:,0], bins=self.n, retbins=True, duplicates='drop')[1]
            if self.q:
                self.binst = pd.qcut(X.values[:,0], q=self.n, retbins=True, duplicates='drop')[1]
        return self
    def transform(self, X, y=None):
        if not self.conti:
            return pd.cut(X.values[:,0], bins=self.binst, labels=False, include_lowest=True, duplicates='drop').reshape(-1, 1)
        else:
            return StandardScaler().fit_transform(X)

In [9]:
class Select_Features(BaseEstimator, TransformerMixin):
    def __init__(self, select=False):
        self.select=select
    def fit(self, X, y=None):
        return self
    def transform(self, X,y=None):
        if not self.select:
            return X
        return X[:, self.select]

In [10]:
Pipe_Impute = Pipeline([
    #Impute Pipeline, the onde which imputes missing data and creates new features
    ('Age_Impute', Impute_Better(col='Age', func=np.median)),
    ('Fare_Impute', Impute_Better(col='Fare', func=np.median)),
    ('Embarked_Impute', Impute_Better(col='Embarked', func=lambda x: mode(x)[0][0])),
    ('New_Features', New_Features())
])

In [11]:
Pipe_Preprocess = ColumnTransformer([
    ### Preprocessing pipeline. Encodes, standardizes
    ('Ordinal_Encode', OrdinalEncoder(), ['Sex','Pclass']), 
    ('Age_Encode', Transf_Conti(), ['Age']), 
    ('Fare_Encode', Transf_Conti(), ['Fare']), 
    ('Keep', 'passthrough', ['Parch', 'SibSp', 'Nrelatives']), 
    ('Embarked_Encode', OneHotEncoder(), ['Embarked']),
])

# 1 - Logistic Regression

In [67]:
FullPipePredict_LogRegression = Pipeline([
    ('Impute_Pipeline', Pipe_Impute),
    ('Preprocess_Pipeline', Pipe_Preprocess),
    ('Feature_Selection', Select_Features()),
    ('Predict', LogisticRegression(max_iter=1000, n_jobs=-1))
    #('Predict', RandomForestClassifier())
])

In [72]:
param_LogReg = {
    'Preprocess_Pipeline__Age_Encode__n':randint(3,7),
    'Preprocess_Pipeline__Age_Encode__q':randint(0,2),
    'Preprocess_Pipeline__Age_Encode__conti':randint(0,2),
    'Preprocess_Pipeline__Fare_Encode__n':randint(4,7),
    'Preprocess_Pipeline__Fare_Encode__conti':randint(0,2),
    'Preprocess_Pipeline__Fare_Encode__q':randint(0,2),
    'Feature_Selection__select':[[0,1,2,3,6]],
    'Predict__C':uniform(0.1, 2.5)
}

In [73]:
grid_LogReg = RandomizedSearchCV(FullPipePredict_LogRegression, param_distributions=param_LogReg, cv=5, n_iter=300, 
                                scoring='accuracy')

In [74]:
grid_LogReg.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('Impute_Pipeline',
                                              Pipeline(steps=[('Age_Impute',
                                                               Impute_Better(func=<function median at 0x000002047D47D700>)),
                                                              ('Fare_Impute',
                                                               Impute_Better(col='Fare',
                                                                             func=<function median at 0x000002047D47D700>)),
                                                              ('Embarked_Impute',
                                                               Impute_Better(col='Embarked',
                                                                             func=<function <lambda> at 0x0000020408108C10>)),
                                                              ('New_Features'...
                                 

In [78]:
grid_LogReg.best_params_

{'Feature_Selection__select': [0, 1, 2, 3, 6],
 'Predict__C': 1.1579992326110315,
 'Preprocess_Pipeline__Age_Encode__conti': 0,
 'Preprocess_Pipeline__Age_Encode__n': 4,
 'Preprocess_Pipeline__Age_Encode__q': 0,
 'Preprocess_Pipeline__Fare_Encode__conti': 1,
 'Preprocess_Pipeline__Fare_Encode__n': 6,
 'Preprocess_Pipeline__Fare_Encode__q': 0}

In [79]:
best_logreg = grid_LogReg.best_estimator_

In [82]:
write_test(test, best_logreg, 'Best_LogReg') #Produced: 0.77751 accuracy, best

# 2 - SVC

In [13]:
from sklearn.svm import SVC, LinearSVC

In [71]:
FullPipePredict_SVC = Pipeline([
    ('Impute_Pipeline', Pipe_Impute),
    ('Preprocess_Pipeline', Pipe_Preprocess),
    ('Feature_Selection', Select_Features()),
    ('Predict', SVC(max_iter=1000000))
])

In [91]:
param_SVC = [{
    'Preprocess_Pipeline__Age_Encode__n':randint(3,7),
    'Preprocess_Pipeline__Age_Encode__q':randint(0,2),
    'Preprocess_Pipeline__Age_Encode__conti':randint(0,2),
    'Preprocess_Pipeline__Fare_Encode__conti':[1],
    'Feature_Selection__select':[[0,1,2,3,6]],
    'Predict__C':uniform(0.2, 1),
    'Predict__kernel':['poly'],
    'Predict__degree':[1,2,3,4]
}
]

grid_SVC = RandomizedSearchCV(FullPipePredict_SVC, param_distributions=param_SVC, cv=5, n_iter=300, 
                                scoring='f1', random_state=42)

In [92]:
grid_SVC.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('Impute_Pipeline',
                                              Pipeline(steps=[('Age_Impute',
                                                               Impute_Better(func=<function median at 0x0000018A981D0820>)),
                                                              ('Fare_Impute',
                                                               Impute_Better(col='Fare',
                                                                             func=<function median at 0x0000018A981D0820>)),
                                                              ('Embarked_Impute',
                                                               Impute_Better(col='Embarked',
                                                                             func=<function <lambda> at 0x0000018A9D6FD280>)),
                                                              ('New_Features'...
                                 

In [96]:
grid_SVC.best_params_

{'Feature_Selection__select': [0, 1, 2, 3, 6],
 'Predict__C': 0.8375574713552132,
 'Predict__degree': 3,
 'Predict__kernel': 'poly',
 'Preprocess_Pipeline__Age_Encode__conti': 0,
 'Preprocess_Pipeline__Age_Encode__n': 3,
 'Preprocess_Pipeline__Age_Encode__q': 0,
 'Preprocess_Pipeline__Fare_Encode__conti': 1}

In [88]:
SVC_best = grid_SVC.best_estimator_

In [89]:
write_test(test, SVC_best, 'SVC_PolyKernel3')

# 3 - Random Forest

In [84]:
FullPipePredict_RF = Pipeline([
    ('Impute_Pipeline', Pipe_Impute),
    ('Preprocess_Pipeline', Pipe_Preprocess),
    ('Feature_Selection', Select_Features()),
    ('Predict', RandomForestClassifier(n_jobs=-1))
])

In [119]:
param_RF = {
    'Preprocess_Pipeline__Age_Encode__n':randint(4,7),
    'Preprocess_Pipeline__Age_Encode__q':randint(0,2),
    #'Preprocess_Pipeline__Age_Encode__conti':randint(0,1),
    #'Preprocess_Pipeline__Fare_Encode__n':randint(4,7),
    'Preprocess_Pipeline__Fare_Encode__conti':[1],
    #'Preprocess_Pipeline__Fare_Encode__q':randint(0,2),
    'Feature_Selection__select':[[0,1,2,3,4],[0,1,2,3,6]],
    'Predict__n_estimators':randint(100, 190),
    'Predict__min_samples_split':randint(6,9)
}

In [120]:
grid_RF = RandomizedSearchCV(FullPipePredict_RF, param_distributions=param_RF, cv=5, n_iter=60,
                            scoring='accuracy')

In [121]:
grid_RF.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('Impute_Pipeline',
                                              Pipeline(steps=[('Age_Impute',
                                                               Impute_Better(func=<function median at 0x000002047D47D700>)),
                                                              ('Fare_Impute',
                                                               Impute_Better(col='Fare',
                                                                             func=<function median at 0x000002047D47D700>)),
                                                              ('Embarked_Impute',
                                                               Impute_Better(col='Embarked',
                                                                             func=<function <lambda> at 0x0000020408108C10>)),
                                                              ('New_Features'...
                                 