## Imports

In [11]:
import numpy as np
import pandas as pd

## Warning suppression

In [12]:
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
warnings.simplefilter("ignore")

## Telechargement du dataset

In [13]:
from sklearn.datasets import fetch_openml
data = pd.read_csv("../kaggle/kaggle_titanic/train.csv")

In [14]:
X = data.drop('Survived', axis=1)
X = X.drop('PassengerId', axis=1)
y = data["Survived"]

## Data exploration

In [6]:
X.shape

(891, 10)

In [7]:
y.shape

(891,)

In [8]:
X.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,891.0,714.0,891.0,891.0,891.0
mean,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,20.125,0.0,0.0,7.9104
50%,3.0,28.0,0.0,0.0,14.4542
75%,3.0,38.0,1.0,0.0,31.0
max,3.0,80.0,8.0,6.0,512.3292


## Training and test set creation

In [15]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(X, y):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

In [90]:
X_train = strat_train_set.drop("Survived", axis=1)
y_train = strat_train_set["Survived"]

In [96]:
X_test = strat_test_set.drop("Survived", axis=1)
y_test = strat_test_set["Survived"]

In [17]:
strat_train_set.shape

(712, 12)

In [18]:
strat_test_set.shape

(179, 12)

### Correlation

In [14]:
corr_matrix = strat_train_set.corr()
corr_matrix['Survived'].sort_values(ascending=False)

Survived       1.000000
Fare           0.275499
Parch          0.084178
PassengerId    0.011892
SibSp         -0.026115
Age           -0.084268
Pclass        -0.348007
Name: Survived, dtype: float64

In [21]:
# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(strat_train_set, figsize=(12, 8))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f298bf83ac8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bd02cc0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bcb4080>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bc5c5f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bc83b70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bc34128>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bbdc6a0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f298bc01c50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bc01c88>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bb5a748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bb82cc0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bb33278>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f298bada7f0>,
        <matplotlib.axes._subplots.Ax

## Features selection

In [111]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

## Pipelines creation

In [190]:
from sklearn.base import BaseEstimator, TransformerMixin

# column index
Age_ix, SibSp_ix, parch_ix, fare_ix = 0, 1, 2, 3

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self): # no *args or **kargs
        pass
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        nb_familly = X[:, SibSp_ix] + X[:, parch_ix]
        is_alone = (nb_familly[:] == 1)
#         print(is_alone)

        return np.c_[X, nb_familly, is_alone]

In [191]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [192]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import KBinsDiscretizer

class Discretization(BaseEstimator, TransformerMixin):
    def __init__(self, attributs, bins):
        self.attributs = attributs
        self.bins = bins
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for attr, bins in zip(self.attributs, self.bins):
            X[:, attr] = KBinsDiscretizer(n_bins=bins,
                                          encode='ordinal',
                                          strategy='uniform').fit_transform([X[:, attr]])
        return X

In [193]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ('num_imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
#     ('std_scaler', StandardScaler()),
])
sdsdsd = num_pipeline.fit_transform(X_train)

In [117]:
# Inspired from stackoverflow.com/questions/25239958
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [118]:
name_idx = 3

class Keep_last_name(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
#         X[name_idx] = 
        return X

In [164]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked", "Cabin"])),
        ('str_imputer', MostFrequentImputer()),
#         ('keep_lastname', Keep_last_name()),
        ("cat", OneHotEncoder(sparse=False)),
    ])

In [165]:
from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(transformer_list=[
#         ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [166]:
preprocessed_train = preprocess_pipeline.fit_transform(X_train)
preprocessed_train.shape

(712, 8)

In [167]:
preprocessed_data = preprocess_pipeline.fit_transform(X)
preprocessed_data.shape

(891, 8)

## Classifier creation and training

In [168]:
import xgboost as xgb
model_xgb = xgb.XGBClassifier(n_estimators=1000, max_depth=6, learning_rate=0.1)

In [169]:
model_xgb.fit(preprocessed_data, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

## Testing

In [183]:
preprocessed_test = preprocess_pipeline.fit_transform(X_test)

In [184]:
y_test_pred = model_xgb.predict(preprocessed_test)

In [185]:
# Fonctionne sur des classifications binaire

from sklearn.metrics import precision_score, recall_score
precision_score(y_test, y_test_pred) # == 4096 / (4096 + 1522)
recall_score(y_test, y_test_pred) # == 4096 / (4096 + 1325)

0.5507246376811594

## Final prediction for kaggle

In [177]:
final_dataframe = pd.read_csv("../kaggle/kaggle_titanic/test.csv")

In [178]:
x_final = preprocess_pipeline.fit_transform(final_dataframe)
x_final.shape

(418, 8)

In [181]:
pred = model_xgb.predict(x_final)

In [182]:
solution = pd.DataFrame({"PassengerId":test_dataframe.PassengerId, "Survived":pred})
solution.to_csv("sol.csv", index = False)