In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from string import punctuation

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
pd.options.display.max_rows = 100

## Import Data

In [3]:
train_data = pd.read_csv('train.csv')
train_data.shape

(891, 12)

In [2]:
test_data = pd.read_csv('test.csv')
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
test_data.shape

(418, 11)

## Explore Dataset
Check nulls, class balance and look at some value counts for different columns.

In [25]:
# Count nulls in each column.
[[col, train_data[col].isnull().sum()] for col in train_data.columns]

[['PassengerId', 0],
 ['Survived', 0],
 ['Pclass', 0],
 ['Name', 0],
 ['Sex', 0],
 ['Age', 177],
 ['SibSp', 0],
 ['Parch', 0],
 ['Ticket', 0],
 ['Fare', 0],
 ['Cabin', 687],
 ['Embarked', 2]]

Look at the class balance. Makes sense to balance classes for model input data.
Maybe look at bagging techniques.

In [4]:
train_data['Survived'].value_counts(dropna=False)

0    549
1    342
Name: Survived, dtype: int64

In [5]:
train_data['Pclass'].value_counts(dropna=False)

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [6]:
train_data['Sex'].value_counts(dropna=False)

male      577
female    314
Name: Sex, dtype: int64

Notably, `Age` is missing a significant number of values. Maybe we can later
impute values using averages across other fields.

In [204]:
train_data['Age'].value_counts(dropna=False)

NaN      177
24.00     30
22.00     27
18.00     26
28.00     25
19.00     25
30.00     25
21.00     24
25.00     23
36.00     22
29.00     20
32.00     18
26.00     18
35.00     18
27.00     18
16.00     17
31.00     17
34.00     15
23.00     15
33.00     15
20.00     15
39.00     14
17.00     13
42.00     13
40.00     13
45.00     12
38.00     11
50.00     10
2.00      10
4.00      10
44.00      9
48.00      9
47.00      9
54.00      8
9.00       8
1.00       7
51.00      7
37.00      6
14.00      6
49.00      6
52.00      6
3.00       6
41.00      6
15.00      5
43.00      5
58.00      5
11.00      4
8.00       4
60.00      4
62.00      4
56.00      4
5.00       4
46.00      3
65.00      3
7.00       3
6.00       3
61.00      3
55.00      2
71.00      2
28.50      2
63.00      2
0.83       2
30.50      2
70.00      2
57.00      2
0.75       2
13.00      2
59.00      2
10.00      2
64.00      2
40.50      2
45.50      2
32.50      2
20.50      1
24.50      1
0.67       1
70.50      1

In [8]:
train_data['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

Looking at each of the ports passengers left from:
- There are significantly different class sizes
- There are different survival rates

In [131]:
def embarkment_port_pivot(input_df):
    '''Create a pivot counting the survival/deaths of passengers embarking
    from each separate port.
    '''
    
    df = input_df.pivot_table(
        index=['Embarked'],
        columns=['Survived'],
        values=['PassengerId'],
        aggfunc=len
    )
    
    df.columns = df.columns.droplevel()

    return df

df = embarkment_port_pivot(train_data)
df
df[1] / (df[0] + df[1])

Survived,0,1
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,75,93
Q,47,30
S,427,217


Embarked
C    0.553571
Q    0.389610
S    0.336957
dtype: float64

In [10]:
train_data['SibSp'].value_counts(dropna=False)

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [11]:
train_data['Embarked'].value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

In [77]:
train_data['Name'].head()

0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object

## Feature Engineering
Select and preprocess some features before modelling.

In [12]:
def featurize_column(input_df, input_col, index_col=None):
    '''Convert a column of text labels into a dataframe of binary
    columns in each unique label value.
    
    Args:
    input_df (DataFrame):
    index_col (str):
    input_col (str):
    
    Returns:
    feature_piv (DataFrame):
    
    Example:
    
    User | Job               User | Carpenter | Electrician   
    Joe  | Carpenter    -->  Joe  |    1      |     0
    Jane | Electrician       Jane |    0      |     1

    '''
                    
    feature_piv = input_df[[input_col]]
    feature_piv['count'] = 1
    
    feature_piv = feature_piv.pivot_table(
        index=feature_piv.index,
        columns=[input_col],
        values=['count']
    )
    
    feature_piv.columns = feature_piv.columns.droplevel()
    feature_piv.columns = [input_col + '_' + col for col in feature_piv.columns]
    
    feature_piv = feature_piv.fillna(0)
    
    return feature_piv

def replace_with_featurized_column(input_df, input_col, index_col=None):
    '''Replace a column of text labels in a dataframe with a set of binary
    columns in each unique label value.
    
    Args:
    input_df (DataFrame):
    index_col (str):
    input_col (str):
    
    Returns:
    output_df (DataFrame):

    '''
                    
    feature_piv = featurize_column(input_df, input_col, index_col)
    
    output_df = input_df.drop(input_col, axis=1)
    output_df = output_df.join(feature_piv, how='left')
    
    return output_df


In [39]:
def select_features(input_df):
    '''Select features to be input and preprocessed for model.
    '''
    
    output_df = input_df.set_index('PassengerId')
    
    features = [
        'Survived',
        'Name',
        'Pclass',
        'Sex',
        'Age',
        'SibSp',
        'Parch',
        'Fare',
        'Embarked'
    ]
    
    output_df = output_df[features]
    
    return output_df

In [123]:
def strip_punctuation(input_str):
    '''Remove all punctuation characters from string. Leaves
    whitespace characters as is.
    '''
    stripped = ''.join(c for c in input_str if c not in punctuation)
    return stripped

def get_unique_name_words(input_df):
    '''Find the set of unique words in all names across the dataset.
    Useful for exploring bag of words analysis.
    '''
    s = input_df['Name'].apply(strip_punctuation).str.split(' ')
    s = s.apply(pd.Series).values
    
    s = pd.Series(np.concatenate(s)).dropna()
    s = s.value_counts(dropna=False)
    
    return s

Mr              521
Miss            182
Mrs             129
William          64
John             44
Master           40
Henry            35
James            24
George           24
Charles          23
Thomas           22
Mary             20
Edward           18
Anna             17
Joseph           16
Johan            15
Frederick        15
Elizabeth        15
Richard          14
Arthur           13
Samuel           13
Margaret         12
Alfred           12
Alexander        11
Maria            11
Peter            11
Jr               10
Robert           10
Andersson         9
Ernest            9
Karl              9
Leonard           9
Annie             8
Victor            8
Martin            8
J                 8
H                 8
Albert            8
Alice             8
Helen             7
Frank             7
de                7
David             7
Kate              7
Catherine         7
Dr                7
Sage              7
Carter            6
Edith             6
Rev               6


In [52]:
def get_married_female_column(input_df):
    ''' Find if a female is married by checking if her husband's name is
    contained in the 'Name' column, which seems to be demarcated by round
    brackets.
    '''
    output_df = input_df.copy()
    is_woman = output_df['Sex'] == 'female'
    is_married = ((output_df['Name'].str.split(' \(').str.len() + 1) % 2).astype(bool)
    
    output_df['IsMarriedWoman'] = (is_woman & is_married).astype(int)
    
    output_df = output_df.drop('Name', axis=1)
    
    return output_df

In [111]:
def get_num_words_in_name(input_df):
    '''Find the number of words/terms in a passenger's name.
    '''
    
    num_words = input_df['Name'].apply(strip_punctuation).str.split(' ').str.len()
    
    return num_words



In [114]:
def build_model_df(input_df):
    '''Preprocess features and rescale the data.
    '''
    
    model_df = select_features(input_df)
    
    model_df['num_words_in_name'] = get_num_words_in_name(model_df)
    
    model_df = get_married_female_column(model_df)
    
    # Turn sex into single binary column.
    model_df['Sex'] = (model_df['Sex'] == 'female').astype(int)
    
    # Turn embarkment port column into binary columns.
    for col in ['Embarked']:
        model_df = replace_with_featurized_column(model_df, col)
      
    # Store columns before dataframe gets turned into array by scaler.
    model_cols = model_df.columns
        
    # Rescale the data.
    model_df = model_df.dropna()
    scaler = MinMaxScaler(feature_range=(0, 1))
    model_df = pd.DataFrame(
        data=scaler.fit_transform(model_df),
        columns=model_cols
    )
        
    return model_df

model_df = build_model_df(train_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [132]:
model_df.shape

(712, 12)

In [115]:
model_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,num_words_in_name,IsMarriedWoman,Embarked_C,Embarked_Q,Embarked_S
0,0.0,1.0,0.0,0.271174,0.2,0.0,0.014151,0.090909,0.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.472229,0.2,0.0,0.139136,0.363636,1.0,1.0,0.0,0.0
2,1.0,1.0,1.0,0.321438,0.0,0.0,0.015469,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,0.434531,0.2,0.0,0.103644,0.363636,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.434531,0.0,0.0,0.015713,0.090909,0.0,0.0,0.0,1.0


Examine the feature correlations in the model data:

In [133]:
model_df.corr()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,num_words_in_name,IsMarriedWoman,Embarked_C,Embarked_Q,Embarked_S
Survived,1.0,-0.356462,0.536762,-0.082446,-0.015523,0.095265,0.2661,0.257572,0.332121,0.195673,-0.048966,-0.159015
Pclass,-0.356462,1.0,-0.150826,-0.365902,0.065187,0.023666,-0.552893,-0.188996,-0.137312,-0.279194,0.131989,0.197831
Sex,0.536762,-0.150826,1.0,-0.099037,0.106296,0.249543,0.182457,0.375334,0.56836,0.103611,0.027256,-0.109078
Age,-0.082446,-0.365902,-0.099037,1.0,-0.307351,-0.187896,0.093143,0.061806,0.160059,0.038268,-0.021693,-0.025431
SibSp,-0.015523,0.065187,0.106296,-0.307351,1.0,0.383338,0.13986,0.158096,0.083013,-0.046227,0.051331,0.018968
Parch,0.095265,0.023666,0.249543,-0.187896,0.383338,1.0,0.206624,0.209213,0.240322,-0.009523,-0.009417,0.013259
Fare,0.2661,-0.552893,0.182457,0.093143,0.13986,0.206624,1.0,0.132568,0.081096,0.301337,-0.062346,-0.250994
num_words_in_name,0.257572,-0.188996,0.375334,0.061806,0.158096,0.209213,0.132568,1.0,0.678194,0.034772,-0.110123,0.019183
IsMarriedWoman,0.332121,-0.137312,0.56836,0.160059,0.083013,0.240322,0.081096,0.678194,1.0,0.037413,-0.047113,-0.012746
Embarked_C,0.195673,-0.279194,0.103611,0.038268,-0.046227,-0.009523,0.301337,0.034772,0.037413,1.0,-0.095623,-0.884986


## Build Model
Since the target is binary let's go with the logisitic regression estimator. This appears to be performing
better than an SGD classifier with these input features.

First GridSearch over `C` and `penalty` params to find optimal model.

In [202]:
logistic_reg = LogisticRegression(
    max_iter=100, 
    tol=1e-3, 
    solver='liblinear'
)

pipe = Pipeline(
    steps=[('logistic', logistic_reg)]
)

X_digits = model_df[model_df.columns[1:]]
y_digits = model_df[model_df.columns[0]]

# Set pipeline parameters and their ranges.
param_grid = {
    'logistic__C': np.arange(0.1, 1.1, 0.1),
    'logistic__penalty': ['l1', 'l2'],
}
search = GridSearchCV(pipe, param_grid, iid=True, cv=5)
search.fit(X_digits, y_digits)
"Best parameter (CV score=%0.3f):" % search.best_score_
search.best_params_

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('logistic',
                                        LogisticRegression(C=1.0,
                                                           class_weight=None,
                                                           dual=False,
                                                           fit_intercept=True,
                                                           intercept_scaling=1,
                                                           l1_ratio=None,
                                                           max_iter=100,
                                                           multi_class='warn',
                                                           n_jobs=None,
                                                           penalty='l2',
                                                           random_state=None,
                            

'Best parameter (CV score=0.781):'

{'logistic__C': 1.0, 'logistic__penalty': 'l1'}

Evaluate logistic regression model using optimal params.

In [203]:
best_model = search.best_estimator_
y_pred = best_model.predict(X_digits)

'Mean prediction accuracy: ' + str(best_model.score(X_digits, y_digits))
'Precision score: ' + str(precision_score(y_digits, y_pred, average='binary'))
'Recall score: ' + str(recall_score(y_digits, y_pred, average='binary'))

'Mean prediction accuracy: 0.8033707865168539'

'Precision score: 0.7781954887218046'

'Recall score: 0.71875'