In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from string import punctuation

from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [48]:
pd.options.display.max_rows = 100
pd.options.mode.chained_assignment = None
plt.style.use('ggplot')

## Import Data

In [49]:
train_data = pd.read_csv('train.csv')
train_data.shape

(891, 12)

In [50]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [51]:
test_data.shape

(418, 11)

## Explore Dataset
Check nulls, class balance and look at some value counts for different columns.

In [52]:
# Count nulls in each column.
[[col, train_data[col].isnull().sum()] for col in train_data.columns]

[['PassengerId', 0],
 ['Survived', 0],
 ['Pclass', 0],
 ['Name', 0],
 ['Sex', 0],
 ['Age', 177],
 ['SibSp', 0],
 ['Parch', 0],
 ['Ticket', 0],
 ['Fare', 0],
 ['Cabin', 687],
 ['Embarked', 2]]

Look at the class balance. Makes sense to balance classes for model input data.
Maybe look at bagging techniques.

In [53]:
train_data['Survived'].value_counts(dropna=False)

0    549
1    342
Name: Survived, dtype: int64

In [54]:
train_data['Pclass'].value_counts(dropna=False)

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [55]:
train_data['Sex'].value_counts(dropna=False)

male      577
female    314
Name: Sex, dtype: int64

Notably, `Age` is missing a significant number of values. Maybe we can later
impute values using averages across other fields.

In [56]:
train_data['Age'].value_counts(dropna=False).head(10)

NaN     177
24.0     30
22.0     27
18.0     26
28.0     25
19.0     25
30.0     25
21.0     24
25.0     23
36.0     22
Name: Age, dtype: int64

In [57]:
train_data['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

Looking at each of the ports passengers left from:
- There are significantly different class sizes
- There are different survival rates

In [58]:
# pd.get_dummies(df, columns=['Cabin'], prefix='dang')

In [59]:
def embarkment_port_pivot(input_df):
    '''Create a pivot counting the survival/deaths of passengers embarking
    from each separate port.
    '''
    
    df = input_df.pivot_table(
        index=['Embarked'],
        columns=['Survived'],
        values=['PassengerId'],
        aggfunc=len
    )
    
    df.columns = df.columns.droplevel()

    return df

df = embarkment_port_pivot(train_data)
df
df[1] / (df[0] + df[1])

Survived,0,1
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,75,93
Q,47,30
S,427,217


Embarked
C    0.553571
Q    0.389610
S    0.336957
dtype: float64

In [60]:
train_data['SibSp'].value_counts(dropna=False)

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [61]:
train_data['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [62]:
train_data['Name'].head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

## Feature Engineering
Select and preprocess some features before modelling.

In [63]:
def featurize_column(input_df, input_col, index_col=None):
    '''Convert a column of text labels into a dataframe of binary
    columns in each unique label value.
    
    Args:
    input_df (DataFrame):
    index_col (str):
    input_col (str):
    
    Returns:
    feature_piv (DataFrame):
    
    Example:
    
    User | Job               User | Carpenter | Electrician   
    Joe  | Carpenter    -->  Joe  |    1      |     0
    Jane | Electrician       Jane |    0      |     1

    '''
                    
    feature_piv = input_df[[input_col]]
    feature_piv['count'] = 1
    
    feature_piv = feature_piv.pivot_table(
        index=feature_piv.index,
        columns=[input_col],
        values=['count']
    )
    
    feature_piv.columns = feature_piv.columns.droplevel()
    feature_piv.columns = [input_col + '_' + col for col in feature_piv.columns]
    
    feature_piv = feature_piv.fillna(0)
    
    return feature_piv

def replace_with_featurized_column(input_df, input_col, index_col=None):
    '''Replace a column of text labels in a dataframe with a set of binary
    columns in each unique label value.
    
    Args:
    input_df (DataFrame):
    index_col (str):
    input_col (str):
    
    Returns:
    output_df (DataFrame):

    '''
                    
    feature_piv = featurize_column(input_df, input_col, index_col)
    
    output_df = input_df.drop(input_col, axis=1)
    output_df = output_df.join(feature_piv, how='left')
    
    return output_df


In [64]:
def select_features(input_df):
    '''Select features to be input and preprocessed for model.
    '''
    
    output_df = input_df.set_index('PassengerId')
    
    features = [
        'Survived',
        'Name',
        'Pclass',
        'Sex',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
        'Cabin',
        'Embarked'
    ]
    
    output_df = output_df[features]
    
    return output_df

In [65]:
def strip_punctuation(input_str):
    '''Remove all punctuation characters from string. Leaves
    whitespace characters as is.
    '''
    stripped = ''.join(c for c in input_str if c not in punctuation)
    return stripped

def get_unique_name_words(input_df):
    '''Find the set of unique words in all names across the dataset.
    Useful for exploring bag of words analysis.
    '''
    s = input_df['Name'].apply(strip_punctuation).str.split(' ')
    s = s.apply(pd.Series).values
    
    s = pd.Series(np.concatenate(s)).dropna()
    s = s.value_counts(dropna=False)
    
    return s

In [66]:
def get_married_female_column(input_df):
    ''' Find if a female is married by checking if her husband's name is
    contained in the 'Name' column, which seems to be demarcated by round
    brackets.
    '''
    output_df = input_df.copy()
    is_woman = output_df['Sex'] == 'female'
    is_married = ((output_df['Name'].str.split(' \(').str.len() + 1) % 2).astype(bool)
    
    output_df['IsMarriedWoman'] = (is_woman & is_married).astype(int)
    
    output_df = output_df.drop('Name', axis=1)
    
    return output_df

In [67]:
def get_num_words_in_name(input_df):
    '''Find the number of words/terms in a passenger's name.
    '''
    
    num_words = input_df['Name'].apply(strip_punctuation).str.split(' ').str.len()
    
    return num_words



In [68]:
def get_cabin_label(input_df):
    cabin = input_df['Cabin'].str[0].fillna('N')
    
    cabin_counts = cabin.value_counts().index
    num_cabins = len(cabin_counts)
    
    cabin_remap = {cabin_counts[i]: i for i in range(num_cabins)}
    
    cabin_label = cabin.map(cabin_remap)
    
    return cabin_label

In [69]:
def get_mean_age(input_df):
    '''
    '''
    group_cols = ['Pclass', 'SibSp', 'Parch']
    mean_age = input_df.groupby(group_cols, as_index=False)['Age'].mean()
    
    mean_age = mean_age.rename(columns={'Age': 'mean_age'})
    
    df = input_df.merge(mean_age, on=group_cols, how='left')
    
    age_data = df[['Age', 'mean_age']].values
    imputed_age = [age[0] if age[0] == np.nan else age[1] for age in age_data]
    
    return imputed_age

In [70]:
def build_model_df(input_df, nulls='drop'):
    '''Preprocess features and rescale the data.
    '''
    
    model_df = select_features(input_df)
    
    model_df['num_words_in_name'] = get_num_words_in_name(model_df)
    
    model_df = get_married_female_column(model_df)
    
    # Turn sex into single binary column.
    model_df['Sex'] = (model_df['Sex'] == 'female').astype(int)
    
    # Turn embarkment port column into binary columns.
    for col in ['Embarked']:
        model_df = replace_with_featurized_column(model_df, col)
        
    model_df['Age'] = get_mean_age(model_df)
    
    model_df['Cabin'] = get_cabin_label(model_df)
      
    # Store columns before dataframe gets turned into array by scaler.
    model_cols = model_df.columns
        
    # Rescale the data.
    if nulls == 'drop':
        model_df = model_df.dropna()
    else:
        for col in model_df.columns:
            model_df[col] = model_df[col].fillna(model_df[col].mean())
    scaler = MinMaxScaler(feature_range=(0, 1))
    model_df = pd.DataFrame(
        data=scaler.fit_transform(model_df),
        columns=model_cols
    )
        
    return model_df

model_df = build_model_df(train_data)
model_df.shape

(882, 13)

In [71]:
model_df.shape

(882, 13)

In [72]:
model_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,num_words_in_name,IsMarriedWoman,Embarked_C,Embarked_Q,Embarked_S
0,0.0,1.0,0.0,0.367968,0.2,0.0,0.014151,0.0,0.090909,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.566577,0.2,0.0,0.139136,0.125,0.363636,1.0,1.0,0.0,0.0
2,1.0,1.0,1.0,0.421599,0.0,0.0,0.015469,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.566577,0.2,0.0,0.103644,0.125,0.363636,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.421599,0.0,0.0,0.015713,0.0,0.090909,0.0,0.0,0.0,1.0


Examine the feature correlations in the model data:

In [73]:
model_df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,num_words_in_name,IsMarriedWoman,Embarked_C,Embarked_Q,Embarked_S
Survived,1.0,-0.332108,0.546314,0.123749,0.010423,0.097486,0.261231,0.247305,0.279949,0.332046,0.167518,0.00262,-0.148514
Pclass,-0.332108,1.0,-0.129719,-0.636811,0.04688,0.003756,-0.555888,-0.497466,-0.223917,-0.15418,-0.243455,0.223263,0.072733
Sex,0.546314,-0.129719,1.0,-0.050359,0.135569,0.250013,0.180173,0.093541,0.377056,0.558504,0.085594,0.075973,-0.12292
Age,0.123749,-0.636811,-0.050359,1.0,-0.517809,-0.323701,0.2488,0.29937,0.06556,0.081526,0.15762,-0.089575,-0.081733
SibSp,0.010423,0.04688,0.135569,-0.517809,1.0,0.390955,0.151287,-0.038452,0.173225,0.102293,-0.042829,-0.01271,0.045558
Parch,0.097486,0.003756,0.250013,-0.323701,0.390955,1.0,0.209362,0.030153,0.218377,0.239465,-0.00397,-0.077966,0.052617
Fare,0.261231,-0.555888,0.180173,0.2488,0.151287,0.209362,1.0,0.239122,0.152135,0.106443,0.274493,-0.115145,-0.168082
Cabin,0.247305,-0.497466,0.093541,0.29937,-0.038452,0.030153,0.239122,1.0,0.129417,0.08536,0.116328,-0.105265,-0.035645
num_words_in_name,0.279949,-0.223917,0.377056,0.06556,0.173225,0.218377,0.152135,0.129417,1.0,0.673214,0.021113,-0.148157,0.074861
IsMarriedWoman,0.332046,-0.15418,0.558504,0.081526,0.102293,0.239465,0.106443,0.08536,0.673214,1.0,0.056002,-0.092524,0.009213


Separate train and test datasets.

In [74]:
# model_df = model_df[['Survived', 'Pclass', 'Sex', 'Fare', 'num_words_in_name', 'IsMarriedWoman']]

features = model_df[model_df.columns[1:]]
targets = model_df[model_df.columns[0]]

X_train, X_test, y_train, y_test = train_test_split(
    features, 
    targets, 
    test_size=0.40, 
    random_state=1
)

## Build Model

### Logistic Regression
Since the target is binary let's go with the logisitic regression estimator. This appears to be performing
better than an SGD classifier with these input features.

First GridSearch over `C` and `penalty` params to find optimal model.

In [75]:
logistic_reg = LogisticRegression(
    max_iter=100, 
    tol=1e-3, 
    solver='liblinear'
)

pipe = Pipeline(
    steps=[('logistic', logistic_reg)]
)

# Set pipeline parameters and their ranges.
param_grid = {
    'logistic__C': np.arange(0.1, 1.1, 0.1),
    'logistic__penalty': ['l1', 'l2'],
}

search = GridSearchCV(pipe, param_grid, iid=True, cv=5)
search.fit(X_train, y_train)
"Best parameter (CV score=%0.3f):" % search.best_score_
search.best_params_

'Evaluate logistic regression model using optimal params:'

best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

'Mean prediction accuracy: ' + str(best_model.score(X_test, y_test))
'Precision score: ' + str(precision_score(y_test, y_pred, average='binary'))
'Recall score: ' + str(recall_score(y_test, y_pred, average='binary'))

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('logistic',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                                           max_iter=100,
                                                           multi_class='warn',
                                                           n_jobs=None,
                                                           penalty='l2',
                                                           random_state=None,
                            

'Best parameter (CV score=0.809):'

{'logistic__C': 0.1, 'logistic__penalty': 'l2'}

'Evaluate logistic regression model using optimal params:'

'Mean prediction accuracy: 0.7960339943342776'

'Precision score: 0.7931034482758621'

'Recall score: 0.6571428571428571'

In [76]:
# Generate output using linear regressor

output_df = test_data.copy()
output_df['Survived'] = 1
output_df = build_model_df(output_df, nulls='mean')
output_df = output_df[output_df.columns[1:]]

output_y_pred = best_model.predict(output_df)

final = test_data[['PassengerId']]
final['Survived'] = output_y_pred.astype(int)

In [77]:
now = pd.to_datetime('now').strftime('%Y%m%dT%H%M%S')
# final.to_csv(now + 'titanic_submission.csv', index=False)

### Random Forest Classifier
The performance of the logistic regression is not great. Let's look at a random forest
classifier.

In [78]:
rfc = RandomForestClassifier()

pipe = Pipeline(
    steps=[('rfc', rfc)]
)

# Set pipeline parameters and their ranges.
param_grid = {
    'rfc__bootstrap': [True, False],
    'rfc__n_estimators': np.arange(15, 46, 5),
    'rfc__criterion': ['gini', 'entropy'],
}

search = GridSearchCV(pipe, param_grid, iid=True, cv=5)
search.fit(X_train, y_train)
"Best parameter (CV score=%0.3f):" % search.best_score_
search.best_params_

'Evaluate Random Forest Classifier model:'

best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

'Mean prediction accuracy: ' + str(best_model.score(X_test, y_test))
'Precision score: ' + str(precision_score(y_test, y_pred, average='binary'))
'Recall score: ' + str(recall_score(y_test, y_pred, average='binary'))

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('rfc',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                                               min_samples_leaf=1,
                                                               min_samples_split=2,
                      

'Best parameter (CV score=0.767):'

{'rfc__bootstrap': True, 'rfc__criterion': 'gini', 'rfc__n_estimators': 15}

'Evaluate Random Forest Classifier model:'

'Mean prediction accuracy: 0.7677053824362606'

'Precision score: 0.7132352941176471'

'Recall score: 0.6928571428571428'

Let's try upsampling the minority class (survivors).

In [79]:
def upsample_minority(X_train, y_train):
    '''Artificially create more samples in the minority class using
    k-nearest neighbours.
    '''

    # Concatenate training data back together.
    X = pd.concat([X_train, y_train], axis=1)

    # Separate minority and majority classes.
    died = X[X['Survived'] == 0]
    survived = X[X['Survived'] == 1]

    # Upsample minority class.
    survived_upsampled = resample(
        survived,
        replace=True,
        n_samples=died.shape[0],
        random_state=2
    )

    # Replace minority data with upsampled data.
    upsampled = pd.concat([died, survived_upsampled])

    upsamp_y_train = upsampled['Survived']
    upsamp_X_train = upsampled.drop('Survived', axis=1)
    
    return upsamp_X_train, upsamp_y_train

upsamp_X_train, upsamp_y_train = upsample_minority(X_train, y_train)

# upsampled_model = LogisticRegression(solver='liblinear')
upsampled_model = RandomForestClassifier(n_estimators=20)
upsampled_model.fit(upsamp_X_train, upsamp_y_train)

upsampled_pred = upsampled_model.predict(X_test)

'Mean prediction accuracy: ' + str(upsampled_model.score(X_test, y_test))
'Precision score: ' + str(precision_score(y_test, upsampled_pred, average='binary'))
'Recall score: ' + str(recall_score(y_test, upsampled_pred, average='binary'))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

'Mean prediction accuracy: 0.7478753541076487'

'Precision score: 0.6808510638297872'

'Recall score: 0.6857142857142857'