# Imports

In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn import preprocessing
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

import xgboost as xgb

# Getting the data

In [2]:
train = pd.read_csv("train.csv", sep=",")
test = pd.read_csv("test.csv", sep=",")

## Splitting the sets - train & validation

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(train, train["Sex"]):
    train_set = train.loc[train_index]
    test_set = train.loc[test_index]

In [4]:
X_train = train_set.drop("Survived",axis=1)
y_train = train_set["Survived"]
X_validation = test_set.drop("Survived",axis=1)
y_validation = test_set["Survived"]

# Preprocessing

In [5]:
num = ["Age", "SibSp", "Parch", "Fare"]
cat = ["Pclass", "Sex", "Embarked"]

In [7]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [85]:
class Custom_num(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X['IsAlone'] = np.where(X.SibSp + X.Parch ==0, 1, 0).astype(int)
        X_group = X.groupby(["Pclass", "Sex"]).mean()[["Age", "Fare"]].reset_index()
        X = pd.merge(X, X_group, how='left', on=["Pclass", "Sex"])
        X.loc[X.Age_x.isna(), "Age_x"] = X.Age_y
        X.loc[X.Fare_x.isna(), "Fare_x"] = X.Fare_y
        X.drop(["Pclass", "Sex"], axis=1)
        return X

In [101]:
num_pipeline = Pipeline([
        ("custom_num", Custom_num()),
        ('scaler', StandardScaler())
    ])

In [98]:
cat_pipeline = Pipeline([
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])

In [103]:
preprocessor = ColumnTransformer([
        ("num_pipeline", num_pipeline, num),
        ("cat_pipeline", cat_pipeline, cat),
    ])
preprocessor_num = ColumnTransformer([
        ("num_pipeline", num_pipeline, num),
    ])

In [105]:
class Custom(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X["Age"] = X.Age * 2
        return X

In [108]:
preprocessor = ColumnTransformer([
        ("cust1", Custom(), ["Age"]),
        ("cust2", Custom(), ["Age"]),
    ])
preprocessor.fit_transform(X_train)

array([[ 2.,  2.],
       [nan, nan],
       [nan, nan],
       ...,
       [nan, nan],
       [98., 98.],
       [ 6.,  6.]])

In [104]:
preprocessor_num.fit_transform(X_train).shape

(712, 9)

# Model

In [69]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
X_train_prep = preprocessor.fit_transform(X_train)
forest_clf.fit(X_train_prep, y_train)
forest_scores = cross_val_score(forest_clf, X_train_prep, y_train, cv=10, scoring='f1')
forest_scores.mean()

0.7449564750411198

In [70]:
xgboost = xgb.XGBClassifier(n_estimators=100, max_depth=5, random_state=42)
X_train_prep = preprocessor.fit_transform(X_train)
xgboost.fit(X_train_prep, y_train)
scores = cross_val_score(xgboost, X_train_prep, y_train, cv=10, scoring='f1')
scores.mean()

0.7570850360452928

In [71]:
X_validation_prep = preprocessor.fit_transform(X_validation)
y_validation_pred = xgboost.predict(X_validation_prep)
print(confusion_matrix(y_validation, y_validation_pred))
print("Validation f1_score : %f"%f1_score(y_validation, y_validation_pred))

[[97 17]
 [19 46]]
Validation f1_score : 0.718750


In [40]:
X_test_prep = preprocessor.transform(test)

In [41]:
y_test_pred = xgboost.predict(X_test_prep)
submission = pd.DataFrame({
    "PassengerId": test['PassengerId'],
    "Survived": y_test_pred
})
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [42]:
submission.to_csv('submission.csv', index=False)