In [None]:
import numpy as np
import pandas as pd

# Load the raw train/test tables, then split the training table into
# a feature frame and a target array.
titanic = pd.read_csv("dataset/train.csv")
X_test = pd.read_csv("dataset/test.csv")

y_train = titanic['Survived'].to_numpy()
X_train = titanic.drop(columns=['Survived'])

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Median age per passenger class on the training table. Kept at module level
# for inspection; AgeImputer no longer reads it and learns its own copy in fit().
group1 = titanic[['Age', 'Pclass']].dropna().groupby(['Pclass']).median()

class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing ``Age`` values with the median age of the row's ``Pclass``.

    The per-class medians are learned from the data passed to ``fit`` and kept
    on the instance, so a fitted (or pickled) imputer carries everything it
    needs and does not depend on notebook-level globals.

    Attributes
    ----------
    age_median_by_pclass_ : pandas.Series
        Median ``Age`` indexed by ``Pclass``, set by ``fit``.
    """
    def fit(self, X, y=None):
        """Learn the median ``Age`` for every ``Pclass`` present in ``X``."""
        known = X[['Age', 'Pclass']].dropna()
        self.age_median_by_pclass_ = known.groupby('Pclass')['Age'].median()
        return self
    def transform(self, X):
        """Return a copy of ``X`` whose missing ``Age`` entries are filled.

        Rows whose ``Pclass`` was not seen during ``fit`` keep a missing
        ``Age``; ``fit`` must have been called first.
        """
        ret = X.copy()
        # Map each row's class to its learned median, then fill only the gaps.
        class_median = ret['Pclass'].map(self.age_median_by_pclass_)
        ret['Age'] = ret['Age'].fillna(class_median)
        return ret

titanic_age_imputer = AgeImputer()
titanic_age_imputed = titanic_age_imputer.fit_transform(titanic)

In [None]:
# Split the age-imputed frame into its numeric and categorical feature subsets.
num = titanic_age_imputed[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
cat = titanic_age_imputed[['Sex', 'Embarked']]

In [None]:
from sklearn.impute import SimpleImputer

# Median-impute whatever gaps remain in the numeric columns.
num_imputer = SimpleImputer(strategy='median')
num_imputer.fit(num)
num_imputed = num_imputer.transform(num)

In [None]:
# Survival rate per 15-year age bucket (lookup table read by AttributeAdder).
# A throwaway imputer is used here so that `num_imputer`, already fitted on the
# numeric feature columns, is not re-fitted on a different set of columns.
titanic_age_bucket = pd.DataFrame(
    SimpleImputer(strategy='median').fit_transform(titanic[['Survived', 'Age']]),
    columns=['Survived', 'Age'])
# Floor every age to the lower edge of its 15-year bucket (0, 15, 30, ...).
titanic_age_bucket['AgeBucket'] = titanic_age_bucket['Age'] // 15 * 15
age_bucket = titanic_age_bucket[
    ['Survived', 'AgeBucket']].groupby(['AgeBucket']).mean()

class AttributeAdder(BaseEstimator, TransformerMixin):
    """Replace Age/SibSp/Parch with two derived numeric features.

    Expects a 2-D array whose columns 1, 2 and 3 hold Age, SibSp and Parch
    (the order used for the numeric attributes in this notebook). The output
    keeps the remaining columns and appends:

    * ``SibSp + Parch``
    * the survival rate of the row's 15-year age bucket, looked up in the
      module-level ``age_bucket`` table.
    """
    def fit(self, X, y=None):
        """Nothing is learned; present for the transformer interface."""
        return self
    def transform(self, X):
        """Return ``X`` without columns 1-3, plus the two derived columns.

        Raises ``KeyError`` if a row falls in an age bucket that is absent
        from ``age_bucket``.
        """
        sibsp_parch = X[:, 2] + X[:, 3]
        # Lower edge of each row's 15-year bucket, then one vectorized lookup
        # instead of a per-row Python loop.
        buckets = X[:, 1] // 15 * 15
        age_surv_rate = age_bucket.loc[buckets, 'Survived'].to_numpy()
        return np.c_[np.delete(X, [1, 2, 3], axis=1), sibsp_parch, age_surv_rate]

# Derive the extra numeric attributes from the imputed array.
num_attribs_adder = AttributeAdder()
num_attribs_added = num_attribs_adder.fit(num_imputed).transform(num_imputed)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the engineered numeric features.
num_scaler = StandardScaler()
num_scaler.fit(num_attribs_added)
num_scaled = num_scaler.transform(num_attribs_added)

In [None]:
from sklearn.pipeline import Pipeline

# Same three numeric steps as the manual cells above, chained in one Pipeline.
num_pipeline = Pipeline(steps=[
    ('num_imputer', SimpleImputer(strategy='median')),
    ('num_attribs_adder', AttributeAdder()),
    ('num_scaler', StandardScaler()),
])

num_preprocessed = num_pipeline.fit_transform(num)

In [None]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most common value.

    ``fit`` stores the per-column value in ``most_frequent_`` (a Series indexed
    by column label); ``transform`` passes it to ``DataFrame.fillna``.
    """
    def fit(self, X, y=None):
        top_values = []
        for column in X.columns:
            # value_counts() is sorted by descending count, so the first
            # index entry is the most frequent value.
            counts = X[column].value_counts()
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self
    def transform(self, X, y=None):
        filled = X.fillna(self.most_frequent_)
        return filled

# Fill gaps in the categorical columns with each column's most frequent value.
cat_imputer = MostFrequentImputer()
cat_imputed = cat_imputer.fit(cat).transform(cat)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns into a dense array.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
cat_encoder = OneHotEncoder(sparse=False)
cat_encoder.fit(cat_imputed)
cat_encoded = cat_encoder.transform(cat_imputed)

In [None]:
# Categorical branch: most-frequent imputation followed by dense one-hot encoding.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm against the installed version before upgrading.
cat_pipeline = Pipeline(steps=[
    ('cat_imputer', MostFrequentImputer()),
    ('cat_encoder', OneHotEncoder(sparse=False)),
])

cat_preprocess = cat_pipeline.fit_transform(cat)

In [None]:
from sklearn.compose import ColumnTransformer

# Column groups routed to the numeric and categorical branches.
num_attribs = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_attribs = ['Sex', 'Embarked']

num_cat_pipeline = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, num_attribs),
    ('cat_pipeline', cat_pipeline, cat_attribs),
])

# Reference result: the manually computed numeric and categorical blocks
# placed side by side, used later to check the pipeline output.
manual_preprocessed = np.hstack([num_scaled, cat_encoded])

In [None]:
import numpy as np

# Full preprocessing: class-based age imputation, then the column-wise branches.
preprocess_pipeline = Pipeline(steps=[
    ('data_age_imputer', AgeImputer()),
    ('num_cat_pipeline', num_cat_pipeline),
])

X_train_preprocessed = preprocess_pipeline.fit_transform(X_train)

# The pipeline must reproduce the step-by-step manual preprocessing.
assert np.allclose(manual_preprocessed, X_train_preprocessed)

In [None]:
from sklearnex import patch_sklearn

# Apply the sklearnex patch here, ahead of the estimator imports in the cells below.
patch_sklearn()

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Baseline: 10-fold cross-validated accuracy of an SVC on the preprocessed data.
svm_clf = SVC(gamma='auto')
svm_scores = cross_val_score(
    svm_clf,
    X_train_preprocessed,
    y_train,
    scoring='accuracy',
    cv=10,
)
print("svm_score: ", svm_scores.mean())

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 10-fold cross-validated accuracy of a random forest on the preprocessed data.
# The seed is fixed so the forest, and therefore the reported score, is
# reproducible across kernel restarts.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train_preprocessed, y_train,
                                cv=10, scoring='accuracy')
print("forest_score: ", forest_scores.mean())

In [None]:
from sklearn.model_selection import GridSearchCV

# Two search spaces for the SVC: a linear kernel over C, and an RBF kernel
# over C x gamma.
param_grid = [
    dict(kernel=['linear'],
         C=[10., 30., 100., 300.]),
    dict(kernel=['rbf'],
         C=[1.0, 3.0, 10., 30., 100., 300., 1000.0],
         gamma=[0.01, 0.03, 0.1, 0.3, 1.0, 3.0]),
]

grid_search = GridSearchCV(
    svm_clf,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
)
grid_search.fit(X_train_preprocessed, y_train)

In [None]:
# End-to-end model: preprocessing followed by the best estimator from the grid search.
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('classifier', grid_search.best_estimator_),
])

In [None]:
import joblib

# Fit preprocessing and classifier together on the raw training frame,
# then persist the fitted pipeline to disk.
full_pipeline.fit(X_train, y_train)
joblib.dump(full_pipeline, "model.pkl")

In [None]:
# Reload the persisted pipeline and predict on the raw test frame.
model = joblib.load("model.pkl")
predictions = model.predict(X_test)

In [None]:
import os

def write_csv(predictions, name='submit.csv', passenger_ids=None):
    """Write predictions to ``data/<name>`` as a two-column CSV.

    Parameters
    ----------
    predictions : array-like
        One predicted ``Survived`` value per passenger.
    name : str
        File name of the CSV inside the ``data`` directory.
    passenger_ids : array-like, optional
        ``PassengerId`` for each prediction. When omitted, ids are numbered
        consecutively starting at 892, one per prediction.

    Raises
    ------
    ValueError
        If ``passenger_ids`` is given and its length differs from
        ``predictions``.
    """
    predictions = np.asarray(predictions).ravel()
    if passenger_ids is None:
        # Size the default id range from the input instead of a fixed count.
        passenger_ids = np.arange(892, 892 + len(predictions))
    else:
        passenger_ids = np.asarray(passenger_ids).ravel()
        if len(passenger_ids) != len(predictions):
            raise ValueError(
                "passenger_ids and predictions must have the same length "
                f"({len(passenger_ids)} != {len(predictions)})")
    dframe = pd.DataFrame({'PassengerId': passenger_ids,
                           'Survived': predictions})
    out_dir = "data"
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)
    csv_path = os.path.join(out_dir, name)
    dframe.to_csv(csv_path, index=False)

# Save the test-set predictions as a CSV file.
write_csv(predictions, name="predictions.csv")