# Titanic

The goal of this kaggle challenge is to predict which passengers (`PassengerId`) survived (`Survived`) the Titanic disaster.

## Imports

In [1]:
# Scipy stuff
import pandas as pd
import numpy as np

# Jupyter/IPython stuff
from IPython.display import display, HTML

## Utils

In [2]:
def DF_DISP(df):
    """Pretty print a dataframe"""
    display(HTML(df.to_html()))

def LIST_DISP(lst):
    """Pretty print columns of a dataframe"""
    display(", ".join(lst))

## Dataset

In [3]:
# Read data
train = pd.read_csv("train.csv")
holdout = pd.read_csv("test.csv")

In [4]:
display(HTML("<h3>First 10 of {} training entries</h3>".format(len(train.index))))
DF_DISP(train.head(10))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


Column meanings:

* **Survived** Survived (1) or died (0)
* **Pclass** Passenger's class
* **Name** Passenger's name
* **Sex** Passenger's sex
* **Age** Passenger's age
* **SibSp** Number of siblings/spouses aboard
* **Parch** Number of parents/children aboard
* **Ticket** Ticket number
* **Fare** Fare
* **Cabin** Cabin
* **Embarked** Port of embarkation

In [5]:
display(HTML("<h3>Columns with missing values</h3>"))
LIST_DISP(train.columns[train.isna().any()])
display(HTML("<h3>Column types</h3>"))
display(train.dtypes)
display(HTML("<h3>Unique entries in integer columns</h3>"))
display(train.select_dtypes(include=['int']).apply(lambda x: len(set(x))))
display(HTML("<h3>Description of numeric columns</h3>"))
DF_DISP(train.describe()) # Numeric
display(HTML("<h3>Description of categorical columns</h3>"))
DF_DISP(train.describe(include=['O'])) # Categoricals

'Age, Cabin, Embarked'

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

PassengerId    891
Survived         2
Pclass           3
SibSp            7
Parch            7
dtype: int64

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",male,347082,B96 B98,S
freq,1,577,7,4,644


## Preprocess data

In [6]:
def process_missing(df):
    """Handle various missing values from the data set"""
    df["Fare"] = df["Fare"].fillna(train["Fare"].mean())
    df["Embarked"] = df["Embarked"].fillna("S")
    return df


def process_age(df):
    """Process the Age column into pre-defined 'bins' """
    df["Age"] = df["Age"].fillna(-0.5)
    cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
    label_names = ["Missing", "Infant", "Child",
                   "Teenager", "Young Adult", "Adult", "Senior"]
    df["Age_categories"] = pd.cut(df["Age"], cut_points, labels=label_names)
    return df


def process_fare(df):
    """Process the Fare column into pre-defined 'bins'"""
    cut_points = [-1, 12, 50, 100, 1000]
    label_names = ["0-12", "12-50", "50-100", "100+"]
    df["Fare_categories"] = pd.cut(df["Fare"], cut_points, labels=label_names)
    return df


def process_cabin(df):
    """Process the Cabin column into pre-defined 'bins'"""
    df["Cabin_type"] = df["Cabin"].str[0]
    df["Cabin_type"] = df["Cabin_type"].fillna("Unknown")
    df = df.drop('Cabin', axis=1)
    return df


def process_titles(df):
    """Extract and categorize the title from the name column"""
    titles = {
        "Mr":         "Mr",
        "Mme":         "Mrs",
        "Ms":          "Mrs",
        "Mrs":        "Mrs",
        "Master":     "Master",
        "Mlle":        "Miss",
        "Miss":       "Miss",
        "Capt":        "Officer",
        "Col":         "Officer",
        "Major":       "Officer",
        "Dr":          "Officer",
        "Rev":         "Officer",
        "Jonkheer":    "Royalty",
        "Don":         "Royalty",
        "Sir":        "Royalty",
        "Countess":    "Royalty",
        "Dona":        "Royalty",
        "Lady":       "Royalty"
    }
    extracted_titles = df["Name"].str.extract(' ([A-Za-z]+)\.', expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df


def process_isalone(df):
    print("ALONE")
    df["isalone"] = (df["Parch"]+df["SibSp"] == 0).astype("int")
    return df


def create_dummies(df, column_name):
    """Create Dummy Columns (One Hot Encoding) from a single Column"""
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    df = pd.concat([df, dummies], axis=1)
    return df

In [7]:
PROCESSES = (
    process_missing, process_age, process_fare, process_titles, process_cabin, process_isalone)
def process_dfs(dfs):
    for process in PROCESSES:
        for df in dfs:
            df = process(df)
    return dfs

In [8]:
train, holdout = process_dfs((train, holdout))

ALONE
ALONE


In [9]:
train = create_dummies(train, "Age_categories")
holdout = create_dummies(holdout, "Age_categories")
train = create_dummies(train, "Fare_categories")
holdout = create_dummies(holdout, "Fare_categories")
train = create_dummies(train, "Title")
holdout = create_dummies(holdout, "Title")
train = create_dummies(train, "Cabin_type")
holdout = create_dummies(holdout, "Cabin_type")
train = create_dummies(train, "Sex")
holdout = create_dummies(holdout, "Sex")
train = create_dummies(train, "Embarked")
holdout = create_dummies(holdout, "Embarked")

In [10]:
print(train.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked' 'Age_categories' 'Fare_categories'
 'Title' 'Cabin_type' 'isalone' 'Age_categories_Missing'
 'Age_categories_Infant' 'Age_categories_Child' 'Age_categories_Teenager'
 'Age_categories_Young Adult' 'Age_categories_Adult'
 'Age_categories_Senior' 'Fare_categories_0-12' 'Fare_categories_12-50'
 'Fare_categories_50-100' 'Fare_categories_100+' 'Title_Master'
 'Title_Miss' 'Title_Mr' 'Title_Mrs' 'Title_Officer' 'Title_Royalty'
 'Cabin_type_A' 'Cabin_type_B' 'Cabin_type_C' 'Cabin_type_D'
 'Cabin_type_E' 'Cabin_type_F' 'Cabin_type_G' 'Cabin_type_T'
 'Cabin_type_Unknown' 'Sex_female' 'Sex_male' 'Embarked_C' 'Embarked_Q'
 'Embarked_S']


In [13]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
def select_features(df):
    df=df._get_numeric_data().dropna(axis=1)
    all_X = df.drop(['PassengerId', 'Survived'], axis=1)
    all_y = df["Survived"]
    rf = RandomForestClassifier(random_state=1, n_estimators=20)
    rfecv=RFECV(rf, cv=10)
    rfecv.fit(all_X, all_y)
    best_columns = list(all_X.columns[rfecv.support_])
    print("Best Columns \n"+"-"*12+"\n{}\n".format(best_columns))    
    return best_columns
best_cols = select_features(train)

Best Columns 
------------
['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'isalone', 'Age_categories_Missing', 'Age_categories_Infant', 'Age_categories_Young Adult', 'Age_categories_Adult', 'Fare_categories_0-12', 'Fare_categories_12-50', 'Fare_categories_50-100', 'Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Cabin_type_D', 'Cabin_type_E', 'Cabin_type_Unknown', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_S']



In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


def select_model(df, features):

    all_X = df[features]
    all_y = df["Survived"]

    # List of dictionaries, each containing a model name,
    # it's estimator and a dict of hyperparameters
    models = [
        {
            "name": "LogisticRegression",
            "estimator": LogisticRegression(),
            "hyperparameters":
                {
                    "solver": ["newton-cg", "lbfgs", "liblinear"]
            }
        },
        {
            "name": "KNeighborsClassifier",
            "estimator": KNeighborsClassifier(),
            "hyperparameters":
                {
                    "n_neighbors": range(1, 40, 2),
                    "weights": ["distance", "uniform"],
                    "algorithm": ["ball_tree", "kd_tree", "brute"],
                    "p": [1, 2]
            }
        },
        {
            "name": "RandomForestClassifier",
            "estimator": RandomForestClassifier(random_state=1),
            "hyperparameters":
                {
                    "n_estimators": [4, 6, 9, 18],
                    "criterion": ["entropy", "gini"],
                    "max_depth": [2, 5, 10, 20],
                    "max_features": ["log2", "sqrt"],
                    "min_samples_leaf": [1, 5, 8],
                    "min_samples_split": [2, 3, 5]

            }
        }
    ]

    for model in models:
        print(model['name'])
        print('-'*len(model['name']))

        grid = GridSearchCV(model["estimator"],
                            param_grid=model["hyperparameters"],
                            cv=10)
        grid.fit(all_X, all_y)
        model["best_params"] = grid.best_params_
        model["best_score"] = grid.best_score_
        model["best_model"] = grid.best_estimator_

        print("Best Score: {}".format(model["best_score"]))
        print("Best Parameters: {}\n".format(model["best_params"]))

    return models


result = select_model(train, best_cols)

LogisticRegression
------------------




Best Score: 0.8226711560044894
Best Parameters: {'solver': 'newton-cg'}

KNeighborsClassifier
--------------------




Best Score: 0.7800224466891134
Best Parameters: {'algorithm': 'ball_tree', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}

RandomForestClassifier
----------------------
Best Score: 0.8417508417508418
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 18}



In [15]:
def save_submission_file(model, cols, filename="submission.csv"):
    holdout_data = holdout[cols]
    predictions = model.predict(holdout_data)

    holdout_ids = holdout["PassengerId"]
    submission_df = {"PassengerId": holdout_ids,
                     "Survived": predictions}
    submission = pd.DataFrame(submission_df)

    submission.to_csv(filename, index=False)


best_rf_model = result[2]["best_model"]
save_submission_file(best_rf_model, cols)