<a href="https://colab.research.google.com/github/tlcuzick/data-science-projects/blob/main/predicting-titanic-fatalities/predicting_titanic_fatalities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
train = pd.read_csv('train.csv')
holdout = pd.read_csv('test.csv')

In [3]:
holdout.iloc[0:5]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


**Data Processing Functions**

In [4]:
def process_missing(df):
    """Handle various missing values from the data set

    Usage
    ------

    holdout = process_missing(holdout)
    """
    df["Fare"] = df["Fare"].fillna(train["Fare"].mean())
    df["Embarked"] = df["Embarked"].fillna("S")
    return df

def process_age(df):
    """Process the Age column into pre-defined 'bins' 

    Usage
    ------

    train = process_age(train)
    """
    df["Age"] = df["Age"].fillna(-0.5)
    cut_points = [-1,0,5,12,18,35,60,100]
    label_names = ["Missing","Infant","Child","Teenager","Young Adult","Adult","Senior"]
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels=label_names)
    return df

def process_fare(df):
    """Process the Fare column into pre-defined 'bins' 

    Usage
    ------

    train = process_fare(train)
    """
    cut_points = [-1,12,50,100,1000]
    label_names = ["0-12","12-50","50-100","100+"]
    df["Fare_categories"] = pd.cut(df["Fare"],cut_points,labels=label_names)
    return df

def process_cabin(df):
    """Process the Cabin column into pre-defined 'bins' 

    Usage
    ------

    train process_cabin(train)
    """
    df["Cabin_type"] = df["Cabin"].str[0]
    df["Cabin_type"] = df["Cabin_type"].fillna("Unknown")
    df = df.drop('Cabin',axis=1)
    return df

def process_titles(df):
    """Extract and categorize the title from the name column 

    Usage
    ------

    train = process_titles(train)
    """
    titles = {
        "Mr" :         "Mr",
        "Mme":         "Mrs",
        "Ms":          "Mrs",
        "Mrs" :        "Mrs",
        "Master" :     "Master",
        "Mlle":        "Miss",
        "Miss" :       "Miss",
        "Capt":        "Officer",
        "Col":         "Officer",
        "Major":       "Officer",
        "Dr":          "Officer",
        "Rev":         "Officer",
        "Jonkheer":    "Royalty",
        "Don":         "Royalty",
        "Sir" :        "Royalty",
        "Countess":    "Royalty",
        "Dona":        "Royalty",
        "Lady" :       "Royalty"
    }
    extracted_titles = df["Name"].str.extract(' ([A-Za-z]+)\.',expand=False)
    df["Title"] = extracted_titles.map(titles)
    return df

def create_dummies(df,column_name):
    """Create Dummy Columns (One Hot Encoding) from a single Column

    Usage
    ------

    train = create_dummies(train,"Age")
    """
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

**Engineer "isalone" feature**

This will categorize passengers by whether they were accompanied by anyone else on board.

In [5]:
def is_alone(df):
    fam_size = df['SibSp'] + df['Parch']
    df['isalone'] = fam_size.apply(lambda x: 0 if x > 0 else 1)
    return df

**Clean and Process DataFrame**

Employ data processing functions above to clean and transform the DataFrame, creating dummy columns as necessary from categorical features.

In [6]:
def process_dataframe(df):
    df = process_missing(df)
    df = process_age(df)
    df = process_fare(df)
    df = process_titles(df)
    df = process_cabin(df)
    df = create_dummies(df, 'Age_categories')
    df = create_dummies(df, 'Fare_categories')
    df = create_dummies(df, 'Title')
    df = create_dummies(df, 'Cabin_type')
    df = create_dummies(df, 'Sex')
    df = is_alone(df)
    return df

**Select Features**

Select features for training that perform best in a Random Forest model.

In [7]:
def select_features(df):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df = df.select_dtypes(include=numerics)
    df = df.dropna(axis=1)
    drop_cols = ['PassengerId', 'Survived']
    X_cols = [x for x in df.columns if x not in drop_cols]
    all_X = df[X_cols]
    all_y = df['Survived']
    rf = RandomForestClassifier(random_state=1)
    selector = RFECV(rf, cv=10)
    selector.fit(all_X, all_y)
    best_column_indices = selector.get_support(indices=True)
    best_columns = [X_cols[x] for x in best_column_indices]
    #print(best_columns)
    return best_columns

**Select and Tune Model**

Tune Logistic Regression, KNN, and Random Forest models across a range of parameter values, printing the name of the best performing model and returning the tuned model itself.

In [8]:
def select_model(df, features, target):
    all_X = df[features]
    all_y = df[target]
    
    models = [
        {
            'name': 'LogisticRegression',
            'estimator': LogisticRegression(),
            'hyperparameters': 
                {
                    "solver": ["newton-cg", "lbfgs", "liblinear"]
                }
        },
        {
            'name': 'KNeighborsClassifier',
            'estimator': KNeighborsClassifier(),
            'hyperparameters': 
                {
                    "n_neighbors": range(1,20,2),
                    "weights": ["distance", "uniform"],
                    "algorithm": ["ball_tree", "kd_tree", "brute"],
                    "p": [1,2]
                }
        },
        {
            'name': 'RandomForestClassifier',
            'estimator': RandomForestClassifier(),
            'hyperparameters': 
                {
                    "n_estimators": [4, 6, 9],
                    "criterion": ["entropy", "gini"],
                    "max_depth": [2, 5, 10],
                    "max_features": ["log2", "sqrt"],
                    "min_samples_leaf": [1, 5, 8],
                    "min_samples_split": [2, 3, 5]
                }
        }        
    ]
    
    best_score = 0
    best_model = None
    best_model_name = None
    
    for m in models:
        grid = GridSearchCV(m['estimator'], param_grid=m['hyperparameters'], cv=10)
        grid.fit(all_X, all_y)
        
        score = grid.best_score_
        params = grid.best_params_
        
        if score > best_score:
            best_model = grid.best_estimator_
            best_model_name = m['name']        
        
        print("{} - best score: {}".format(m['name'], score))
        print("{} - best params: {}".format(m['name'], params))
        
    print("Best overall model: {}".format(best_model_name))
        
    return best_model

**Put it All Together!**

In [10]:
train = process_dataframe(train)
holdout = process_dataframe(holdout)

In [11]:
best_columns = select_features(train)
best_model = select_model(train, best_columns, 'Survived')

LogisticRegression - best score: 0.7206242197253434
LogisticRegression - best params: {'solver': 'newton-cg'}
KNeighborsClassifier - best score: 0.6981897627965045
KNeighborsClassifier - best params: {'algorithm': 'brute', 'n_neighbors': 17, 'p': 1, 'weights': 'distance'}
RandomForestClassifier - best score: 0.7408489388264671
RandomForestClassifier - best params: {'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 3, 'n_estimators': 9}
Best overall model: RandomForestClassifier
