### Titanic classifier
The aim of this project is to predict which passengers survived and which died on the Titanic.
First, we will browse through the data. Second, we will clean the data.
Finally, we will try some models and analyse the outcome.

In [1]:
# Load the Titanic train and test CSV files through the project helper.
# NOTE(review): `load_housing_data` is defined in common_functions (not shown here);
# presumably it joins PATH with the file name and returns a DataFrame — confirm.
from common_functions import load_housing_data
import pandas as pd
PATH = "datasets/Titanic"

# The recorded output of this cell is a FileNotFoundError for
# 'datasets/Titanic\\train.csv', so the CSV files must exist under PATH
# before the notebook can run.
train_dataset = load_housing_data(PATH, "train.csv")
titanic_test = load_housing_data(PATH, "test.csv")

FileNotFoundError: [Errno 2] File datasets/Titanic\train.csv does not exist: 'datasets/Titanic\\train.csv'

In [None]:
# Split the labels and the features
# X_train_prepared = X_train.drop("Survived", axis=1)
# y_train_labels = X_train["Survived"]

In [None]:
# Summarise the training data (columns, dtypes, non-null counts) to spot missing values.
train_dataset.info()
# X_train.head()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    features_names : str or list of str
        Column label(s) handed to ``DataFrame.drop(..., axis=1)``.
    """

    def __init__(self, features_names):
        # BaseEstimator.get_params() / clone() / the estimator repr read every
        # __init__ argument back from an attribute with the *same* name, so the
        # value is stored unchanged under the public name (not `_features_names`).
        self.features_names = features_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the configured columns (``X`` itself is not modified)."""
        return X.drop(self.features_names, axis=1)
    

First of all, a good idea is to drop the columns containing the name, the passenger id, the ticket and the cabin. Afterwards, we
have to encode several columns.
### pClass
- 1 = 1st
- 2 = 2nd
- 3 = 3rd

### Port of embarkation
- C = Cherbourg
- Q = Queenstown
- S = Southampton

### Sex
- Male
- Female

We also have to standardise the Age, Fare, SibSp and Parch columns.

In [None]:
# Remove the training rows whose port of embarkation is missing (rows are dropped, not imputed).
train_dataset = train_dataset.dropna(axis=0, subset=["Embarked"])

In [None]:
# Separate the target column ("Survived") from the feature columns.
X_train = train_dataset.drop("Survived", axis=1)
y_train = train_dataset["Survived"]

In [None]:
# Data cleaning: build the preprocessing pipeline and apply it to the training features.
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Column groups: columns to discard, categories to one-hot encode, numbers to scale.
dropped_columns = ["Name", "PassengerId", "Ticket", "Cabin"]
categorical_features = ['Embarked', 'Sex', 'Pclass']
numeric_features = ["Age", "Fare", "SibSp", "Parch"]
# NOTE(review): only referenced by the commented-out 'drop' entry further down.
dropped_features = ["Pclass", "Sex", "Embarked", "Age", "Fare"]

# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: replace missing values with the literal 'missing', then one-hot
# encode; categories not seen during fit are ignored instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# NOTE(review): only referenced by the commented-out 'drop' entry below; the active
# pipeline instantiates FeatureDropper directly.
dropping_transformer = Pipeline([
    ('dropping', FeatureDropper(dropped_columns))
])

# Output column order follows the transformer order: categorical block first, numeric block second.
col_transformer = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numeric_transformer, numeric_features),
        # ('drop', dropping_transformer, dropped_features)
    ]
)

# Full preprocessing: drop the unused columns, then encode/scale the remaining ones.
prepare_data = Pipeline([
    ('preprocessing_dropping', FeatureDropper(dropped_columns)),
    ('col', col_transformer),
])
# Unfinished FeatureUnion experiment, kept for reference:
# union_pipeline = Pipeline([
#     ('feats', FeatureUnion([
#         ('ngram', )
#     ]))
# ])

# Fit the preprocessing on the training features and transform them in one step.
X_train_pipeline = prepare_data.fit_transform(X_train)


In [None]:
# Convert the transformed array to a DataFrame with readable column names.
# The names are hard-coded and must follow the ColumnTransformer output order:
# the one-hot columns (Embarked, Sex, Pclass) first, then the four scaled numeric columns.
# NOTE(review): the one-hot labels assume the encoder orders categories as
# C/Q/S, female/male, 1/2/3 — confirm against the fitted encoder.
encoder_columns = ['C', 'Q', 'S', 'Female', 'Male', '1st', '2nd', '3rd', "Age", "Fare", "SibSp", "Parch"]
X_train_prepared = pd.DataFrame(data = X_train_pipeline, columns=encoder_columns)

In [None]:
# Concat the dataframe -> change to feature union
# X_train.reset_index(drop=True, inplace=True)
# X_train_transformed.reset_index(drop=True, inplace=True)
# X_train_transformed = pd.concat([X_train, X_train_transformed], axis=1)
# X_train_transformed

In [None]:
# Switch estimator display to the diagram representation, then show the preprocessing pipeline.
from sklearn import set_config
set_config(display='diagram')
prepare_data

In [None]:
# Test set
# titanic_test.dropna(axis=0, subset=["Embarked"])
# X_test_prepared = prepare_data.fit_transform(titanic_test)
# X_test_prepared
# X_test_transformed = pd.DataFrame(data = X_test_prepared, columns=encoder_columns)
# X_test_transformed

Classifier

In [None]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier

# Boolean target: True where the passenger survived (Survived == 1).
y_train_survived = (y_train == 1)

# Fixed random_state so that repeated runs train the same model.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train_prepared, y_train_survived)


In [None]:
# Sanity check: predict for a single training row (position 3, kept as a one-row DataFrame).
some_people = X_train_prepared.iloc[[3]]
sgd_clf.predict(some_people)

Measuring Accuracy Using Cross-Validation

In [None]:
# Accuracy of the SGD classifier on each of 5 cross-validation folds.
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train_prepared, y_train_survived, cv=5, scoring="accuracy")

Confusion matrix

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Out-of-fold prediction for every training row (3-fold cross-validation).
y_train_pred = cross_val_predict(sgd_clf, X_train_prepared, y_train_survived, cv=3)

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_train_survived, y_train_pred)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score


def display_scores(y_train, y_pred):
    """Print precision, recall and F1 of ``y_pred`` measured against ``y_train``.

    Parameters
    ----------
    y_train : array-like
        True binary labels.
    y_pred : array-like
        Predicted binary labels, aligned with ``y_train``.
    """
    labelled_metrics = (
        ("Precision : ", precision_score),
        ("Recall : ", recall_score),
        ("f1 : ", f1_score),
    )
    for label, metric in labelled_metrics:
        print(label, metric(y_train, y_pred))


# Scores of the SGD classifier's out-of-fold predictions.
display_scores(y_train_survived, y_train_pred)



RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters; fixed random_state for repeatable results.
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train_prepared, y_train_survived)

Measuring Accuracy Using Cross-Validation

In [None]:
# Accuracy of the random forest on each of 5 cross-validation folds.
from sklearn.model_selection import cross_val_score
cross_val_score(forest_clf, X_train_prepared, y_train_survived, cv=5, scoring="accuracy")

Confusion matrix

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Out-of-fold prediction for every training row (3-fold cross-validation).
# Note: this rebinds y_train_pred, which previously held the SGD predictions.
y_train_pred = cross_val_predict(forest_clf, X_train_prepared, y_train_survived, cv=3)

# A notebook cell only displays its *last* expression, so the matrix must be
# printed explicitly; as a bare expression before display_scores() it was
# computed and silently discarded.
print(confusion_matrix(y_train_survived, y_train_pred))
display_scores(y_train_survived, y_train_pred)

In [None]:
# List the hyper-parameters of forest_clf, i.e. the values a grid search can tune.
forest_clf.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the random forest.
param_grid = {
    # Real booleans: the strings 'True' and 'False' are both truthy, so the
    # previous grid never actually trained a forest with bootstrap disabled
    # (and recent scikit-learn versions reject non-bool values outright).
    'bootstrap': [True, False],
    # 'auto' is left out: for classifiers it was an alias of 'sqrt', so it only
    # duplicated work, and it is no longer accepted by recent scikit-learn.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [3, 10, 30, 100, 300, 1000],
}

# 5-fold cross-validated search over every combination in param_grid.
CV_rfc = GridSearchCV(estimator=forest_clf, param_grid=param_grid, cv=5)

### Best parameters for the RandomForestClassifier
{'bootstrap': 'True',
 'criterion': 'entropy',
 'max_depth': 7,
 'max_features': 'auto',
 'n_estimators': 30}

In [None]:
# CV_rfc.fit(X_train_prepared, y_train_survived)

In [None]:
# CV_rfc.best_params_

In [None]:
# Rebuild the forest with the hyper-parameters selected by the grid search.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
#   no longer accepted by recent scikit-learn versions.
# - random_state=42 matches the other estimators in this notebook so that the
#   fitted model (and every score derived from it) is reproducible.
best_rfc = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    max_depth=7,
    max_features='sqrt',
    n_estimators=30,
    random_state=42,
)
best_rfc.fit(X_train_prepared, y_train_survived)

# These scores are measured on the very rows the model was fitted on, so they
# are optimistic compared with cross-validated or held-out scores.
y_pred = best_rfc.predict(X_train_prepared)
display_scores(y_train_survived, y_pred)

Try to adjust the features

In [None]:
# Out-of-fold decision-function scores of the SGD classifier (instead of hard labels),
# used below to study the effect of the decision threshold.
y_scores = cross_val_predict(sgd_clf, X_train_prepared, y_train_survived, cv=3, method="decision_function")

In [None]:
from sklearn.metrics import precision_recall_curve
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np

# Precision and recall of the SGD classifier for each candidate threshold on y_scores.
precisions, recalls, thresholds = precision_recall_curve(y_train_survived, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    The last element of ``precisions`` and ``recalls`` is left out so that both
    curves are indexed like ``thresholds``. Every position where the sign of
    (precision - recall) changes is marked with a red dot.
    """
    precision_curve = precisions[:-1]
    recall_curve = recalls[:-1]

    curves = (
        (precision_curve, "b--", "Precision"),
        (recall_curve, "g-", "Recall"),
    )
    for values, line_style, name in curves:
        plt.plot(thresholds, values, line_style, label=name)

    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    # To zoom on the curves:
    # plt.xlim([-400, -200])

    # Indices where precision and recall cross each other
    gap_sign = np.sign(precision_curve - recall_curve)
    crossings = np.argwhere(np.diff(gap_sign)).flatten()
    plt.plot(thresholds[crossings], precision_curve[crossings], 'ro')


plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# plt.show()


In [None]:
def plot_recall_vs_precision(precisions, recalls):
    """Plot precision (y axis) against recall (x axis), leaving out the final curve point."""
    recall_axis, precision_axis = recalls[:-1], precisions[:-1]
    plt.plot(recall_axis, precision_axis, "b-")
    plt.xlabel("Recall")
    plt.ylabel("Precision")


plot_recall_vs_precision(precisions, recalls)
# plt.show()



In [None]:
from sklearn.metrics import roc_curve

# False/true positive rates of the SGD classifier for each threshold on y_scores.
fpr, tpr, thresholds = roc_curve(y_train_survived, y_scores)


def plot_roc_curve(fpr, trp, label=None):
    """Plot a ROC curve together with the diagonal of a random classifier.

    Parameters
    ----------
    fpr : array-like
        False positive rates (x axis).
    trp : array-like
        True positive rates (y axis). The parameter keeps its original spelling
        so that existing callers are unaffected.
    label : str, optional
        Legend label for the curve.
    """
    # Plot the argument that was passed in. The body used to read the
    # notebook-level `tpr` instead, so the second argument was silently ignored
    # and any other curve handed to this function was drawn incorrectly.
    plt.plot(fpr, trp, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')


plot_roc_curve(fpr, tpr)
# plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# roc_auc_score expects (y_true, y_score): the true labels come first.
# The score is computed from the SGD decision scores used for the ROC curve,
# not from y_train_pred, which at this point holds the random forest's hard
# 0/1 predictions.
roc_auc_score(y_train_survived, y_scores)

Try to test the model

In [None]:
# Drop test rows without a port of embarkation, mirroring the training-set cleaning.
# NOTE(review): a row removed here is not removed from the labels loaded from
# gender_submission.csv, which would misalign predictions and labels —
# confirm that no test row is actually dropped.
X_test = titanic_test.dropna(axis=0, subset=["Embarked"])
len(X_test)

In [None]:
# Apply the preprocessing exactly as it was fitted on the training set.
# Calling fit_transform here would re-learn the imputation values, scaling
# statistics and one-hot categories from the test data, so the test features
# would no longer be encoded the way the trained models expect.
X_test_prepared = prepare_data.transform(X_test)


In [None]:
# Load the reference labels for the test passengers and build the boolean "survived" target.
# NOTE(review): gender_submission.csv is presumably Kaggle's sample submission rather
# than the real outcomes — confirm before reading the scores below as true test scores.
y_test = load_housing_data(PATH, "gender_submission.csv")
# Not the last expression of the cell, so this length is computed but not displayed.
len(y_test)
y_test_survived = (y_test['Survived'] == 1)


In [None]:
# Predict the test passengers with the tuned random forest and print precision, recall and F1.
y_test_pred = best_rfc.predict(X_test_prepared)
display_scores(y_test_survived, y_test_pred)



In [None]:
# Confusion matrix of the tuned forest on the test passengers (rows: reference labels, columns: predictions).
confusion_matrix(y_test_survived, y_test_pred)

