# Trees
Use decision trees to solve the challenge.

In [58]:
import dataset as ds

# Load the training and evaluation splits from the project's dataset helper.
train_df, eval_df = ds.get_formatted_splits()
# Show the column dtypes: the 'category' columns must be encoded as numbers before
# they can be fed to a tree, the int/float ones can be used directly.
train_df.dtypes
# Swap the line above for train_df.head() to preview the first rows instead.

PassengerId        int64
Survived           int64
Pclass          category
Sex             category
Age              float64
SibSp              int64
Parch              int64
Fare             float64
Embarked        category
Title           category
TicketNumber     float64
Floor           category
dtype: object

## Approach 1: likelihood on gender
Let's try something naive: predict the probability of surviving given a binary variable like gender.

In [46]:
def get_prob_of_surviving_based_on_gender(df):
    """Summarise survival per gender.

    Rows whose 'Sex' value is missing are ignored; the input frame is not modified.

    Args:
        df: DataFrame with at least the columns 'Sex' and 'Survived'.

    Returns:
        DataFrame indexed by 'Sex' with the columns
        'Prob_surviving' (mean of 'Survived' within the group),
        'Total' (count of 'Survived' values within the group) and
        'Fraction' ('Total' divided by the sum of 'Total' over all groups).
    """
    survived_by_gender = df.dropna(subset=['Sex']).groupby('Sex')['Survived']
    # Aggregating a single column yields flat column labels, ready to be renamed.
    summary = survived_by_gender.agg(['mean', 'count'])
    summary = summary.rename(columns={'mean': 'Prob_surviving', 'count': 'Total'})
    summary['Fraction'] = summary['Total'] / summary['Total'].sum()
    return summary

In [47]:
# Per-gender survival statistics estimated on the training split only;
# the evaluation split is kept aside for scoring.
prob_survival_on_gender_train = get_prob_of_surviving_based_on_gender(train_df)
prob_survival_on_gender_train

Unnamed: 0_level_0,Prob_surviving,Total,Fraction
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
male,0.196078,408,0.654896
female,0.753488,215,0.345104


Let's try to apply these probabilities to the evaluation data: if the probability to survive is higher than 0.5, then we classify that passenger as "survived".

In [48]:
# A gender is predicted to survive when its survival rate on the training split is
# strictly greater than 0.5.
prob_survival_on_gender_train['Prediction'] = prob_survival_on_gender_train['Prob_surviving'].gt(0.5)
prob_survival_on_gender_train

Unnamed: 0_level_0,Prob_surviving,Total,Fraction,Prediction
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
male,0.196078,408,0.654896,False
female,0.753488,215,0.345104,True


In [49]:
def evaluate_gender_likelihood_on_set(df):
    """Score the gender-only rule on a data set.

    Each passenger's prediction is looked up by gender in the 'Prediction' column of
    the module-level prob_survival_on_gender_train table.

    Args:
        df: DataFrame with the columns 'Sex' and 'Survived'.

    Returns:
        The fraction of rows whose prediction equals 'Survived'.
    """
    def _predict(gender):
        return prob_survival_on_gender_train.loc[gender]['Prediction']

    predicted = df['Sex'].apply(_predict)
    hits = predicted == df['Survived']
    return hits.mean()

# Score the gender rule on both splits, then report the two accuracies.
train_accuracy, eval_accuracy = [evaluate_gender_likelihood_on_set(split_df)
                                 for split_df in (train_df, eval_df)]
print('Accuracy on training set: {}'.format(train_accuracy))
print('Accuracy on evaluation set: {}'.format(eval_accuracy))

Accuracy on training set: 0.7865168539325843
Accuracy on evaluation set: 0.7873134328358209


## Approach 2: decision tree on gender
Just to double check: we should get the same results as approach 1.

In [50]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

def _encoded_gender(df):
    """Return the label-encoded 'Sex' column of df as a single-column 2-D array."""
    return gender_encoder.transform(df['Sex']).reshape(-1, 1)


# The encoder is fitted on the training split and reused for the evaluation split.
gender_encoder = LabelEncoder().fit(train_df['Sex'])
tree_classifier = DecisionTreeClassifier().fit(_encoded_gender(train_df), train_df['Survived'])

accuracy = tree_classifier.score(_encoded_gender(eval_df), eval_df['Survived'])
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.7873134328358209


## Approach 3: decision tree on single features
This cannot possibly be better than combining all the attributes, but I am curious to see which attribute is the most significant on its own.

In [51]:
def tree_classification_on_attribute(attribute_name):
    """Fit a decision tree on one categorical attribute and score it on the evaluation split.

    The label encoder is fitted on the categories declared by the training column's
    dtype and then applied to both the training and the evaluation column.
    Reads the module-level train_df and eval_df.

    Args:
        attribute_name: name of a categorical column present in both splits.

    Returns:
        The value of DecisionTreeClassifier.score on the evaluation split.
    """
    def _as_feature_matrix(column):
        # The classifier is given a 2-D array: one row per sample, one column.
        return encoder.transform(column).reshape(-1, 1)

    encoder = LabelEncoder().fit(train_df[attribute_name].dtype.categories)
    train_features = _as_feature_matrix(train_df[attribute_name])
    classifier = DecisionTreeClassifier().fit(train_features, train_df['Survived'])
    return classifier.score(_as_feature_matrix(eval_df[attribute_name]), eval_df['Survived'])

# Start with the categorical attributes, one single-attribute tree each.
categorical_columns = ['Sex', 'Pclass', 'Embarked', 'Title', 'Floor']
for attribute_name in categorical_columns:
    attribute_accuracy = tree_classification_on_attribute(attribute_name)
    print('Accuracy on {}: {}'.format(attribute_name, attribute_accuracy))

Accuracy on Sex: 0.7873134328358209
Accuracy on Pclass: 0.6940298507462687
Accuracy on Embarked: 0.6417910447761194
Accuracy on Title: 0.7910447761194029
Accuracy on Floor: 0.7126865671641791


## Approach 4: tree combining the categorical attributes

In [52]:
# DecisionTreeClassifier cannot handle categorical data, so every categorical column is
# replaced by its integer category codes before fitting and scoring.
# random_state is fixed because scikit-learn permutes the features at every split and
# breaks ties between equally good splits at random; without a seed the accuracy
# printed below can change from one run to the next.
dtc = DecisionTreeClassifier(random_state=0).fit(
    train_df[categorical_columns].apply(lambda x: x.cat.codes),
    train_df['Survived'])
acc = dtc.score(
    eval_df[categorical_columns].apply(lambda x: x.cat.codes),
    eval_df['Survived'])
print('Accuracy combining categorical attributes: {}'.format(acc))

Accuracy combining categorical attributes: 0.8097014925373134


In [53]:
import numpy as np
from bokeh.plotting import figure, show, output_notebook

# Configure bokeh to render figures inside the notebook when show() is called.
output_notebook()

# Quick experiment on tree parameters.
def _compute_dtc_score(criterion, max_depth, class_weight, random_state=0):
    """Train a tree on the categorical attributes and score it on the evaluation split.

    Reads the module-level train_df, eval_df and categorical_columns. Each categorical
    column is replaced by its integer category codes before being passed to the tree.

    Args:
        criterion: split quality criterion forwarded to DecisionTreeClassifier.
        max_depth: maximum tree depth forwarded to DecisionTreeClassifier.
        class_weight: class weighting forwarded to DecisionTreeClassifier.
        random_state: seed forwarded to DecisionTreeClassifier. It is fixed by default
            because the classifier permutes features at each split and breaks ties at
            random, which would otherwise make the sweep differ between runs.

    Returns:
        The value of DecisionTreeClassifier.score on the evaluation split.
    """
    def _encode(df):
        return df[categorical_columns].apply(lambda column: column.cat.codes)

    classifier = DecisionTreeClassifier(class_weight=class_weight,
                                        criterion=criterion,
                                        max_depth=max_depth,
                                        random_state=random_state)
    classifier.fit(_encode(train_df), train_df['Survived'])
    return classifier.score(_encode(eval_df), eval_df['Survived'])

criteria = ['entropy', 'gini']
max_allowed_depth = 20
depths = range(1, max_allowed_depth + 1)

# accuracies[w, c, d]: w = 0 unweighted / 1 balanced classes, c indexes `criteria`,
# d is the zero-based position of the max depth in `depths`.
accuracies = np.zeros((2, len(criteria), max_allowed_depth))
for weight_idx, class_weight in enumerate([None, 'balanced']):
    for criterion_idx, criterion in enumerate(criteria):
        accuracies[weight_idx, criterion_idx, :] = [
            _compute_dtc_score(criterion, depth, class_weight) for depth in depths
        ]

# One line per (class weighting, criterion) pair: (weight index, criterion index,
# colour, legend template).
line_specs = [
    (0, 0, 'blue', 'Unbalanced {}'),
    (0, 1, 'orange', 'Unbalanced {}'),
    (1, 0, 'red', 'Balanced {}'),
    (1, 1, 'green', 'Balanced {}'),
]
dt_plot = figure()
for weight_idx, criterion_idx, line_color, label_template in line_specs:
    dt_plot.line(depths,
                 accuracies[weight_idx, criterion_idx, :],
                 color=line_color,
                 legend_label=label_template.format(criteria[criterion_idx]))
dt_plot.xaxis.axis_label = 'Max depth'
dt_plot.yaxis.axis_label = 'DTC accuracy'
show(dt_plot)

print('Max accuracy: {}'.format(accuracies.max()))

Max accuracy: 0.8134328358208955


## Approach 5: tree on a single numerical attribute

In [54]:
def tree_classification_on_attribute(attribute_name):
    """Fit a decision tree on a single attribute and score it on the evaluation split.

    This definition replaces the earlier categorical-only function of the same name.
    To keep the categorical cells working when they are re-run after this cell,
    categorical columns are still label-encoded the way the earlier definition did;
    every other column is passed to the tree with its raw values.
    Reads the module-level train_df and eval_df.

    Args:
        attribute_name: name of a column present in both splits.

    Returns:
        The value of DecisionTreeClassifier.score on the evaluation split.
    """
    train_column = train_df[attribute_name]
    eval_column = eval_df[attribute_name]
    if train_column.dtype.name == 'category':
        # Same encoding as the categorical version: fit on the declared categories.
        encoder = LabelEncoder().fit(train_column.dtype.categories)
        train_values = encoder.transform(train_column)
        eval_values = encoder.transform(eval_column)
    else:
        train_values = train_column.values
        eval_values = eval_column.values
    # The classifier is given a 2-D array: one row per sample, one column.
    classifier = DecisionTreeClassifier().fit(train_values.reshape(-1, 1),
                                              train_df['Survived'])
    return classifier.score(eval_values.reshape(-1, 1), eval_df['Survived'])

# One single-attribute tree per numerical column.
numerical_columns = ['SibSp', 'Parch', 'Fare']
for attribute_name in numerical_columns:
    attribute_accuracy = tree_classification_on_attribute(attribute_name)
    print('Accuracy on {}: {}'.format(attribute_name, attribute_accuracy))

Accuracy on SibSp: 0.6231343283582089
Accuracy on Parch: 0.6492537313432836
Accuracy on Fare: 0.7164179104477612


## Approach 6: tree on all the attributes

In [60]:
# Feature columns used by this approach: every column shown by train_df.dtypes
# except PassengerId and the Survived target.
attributes = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
    'Title',
    'TicketNumber',
    'Floor',
]

# Drop every row that has a missing value in any column. The names are rebound to new
# frames instead of using inplace=True, so the objects returned by
# ds.get_formatted_splits() are never mutated behind the helper's back.
train_df = train_df.dropna(axis='index', how='any')
eval_df = eval_df.dropna(axis='index', how='any')

# Replace each categorical column by its integer category codes, again on new frames.
# NOTE: from here on train_df / eval_df no longer hold categorical dtypes, so the
# earlier cells that rely on `.dtype.categories` or `.cat.codes` need the loading cell
# to be re-run before they are executed again.
categorical_columns = train_df.select_dtypes('category').columns
train_df = train_df.assign(**{name: train_df[name].cat.codes for name in categorical_columns})
eval_df = eval_df.assign(**{name: eval_df[name].cat.codes for name in categorical_columns})

In [61]:
# Quick experiment on tree parameters.
def _compute_dtc_score(criterion, max_depth, class_weight, random_state=0):
    """Train a tree on all the selected attributes and score it on the evaluation split.

    Reads the module-level train_df, eval_df and attributes; the frames are used as
    they are, so they must already be numeric and free of missing values.

    Args:
        criterion: split quality criterion forwarded to DecisionTreeClassifier.
        max_depth: maximum tree depth forwarded to DecisionTreeClassifier.
        class_weight: class weighting forwarded to DecisionTreeClassifier.
        random_state: seed forwarded to DecisionTreeClassifier. It is fixed by default
            because the classifier permutes features at each split and breaks ties at
            random, which would otherwise make the sweep differ between runs.

    Returns:
        The value of DecisionTreeClassifier.score on the evaluation split.
    """
    classifier = DecisionTreeClassifier(class_weight=class_weight,
                                        criterion=criterion,
                                        max_depth=max_depth,
                                        random_state=random_state)
    classifier.fit(train_df[attributes], train_df['Survived'])
    return classifier.score(eval_df[attributes], eval_df['Survived'])

criteria = ['entropy', 'gini']
max_allowed_depth = 20
depths = range(1, max_allowed_depth + 1)

# accuracies[w, c, d]: w = 0 unweighted / 1 balanced classes, c indexes `criteria`,
# d is the zero-based position of the max depth in `depths`.
accuracies = np.zeros((2, len(criteria), max_allowed_depth))
for weight_idx, class_weight in enumerate([None, 'balanced']):
    for criterion_idx, criterion in enumerate(criteria):
        accuracies[weight_idx, criterion_idx, :] = [
            _compute_dtc_score(criterion, depth, class_weight) for depth in depths
        ]

# One line per (class weighting, criterion) pair: (weight index, criterion index,
# colour, legend template).
line_specs = [
    (0, 0, 'blue', 'Unbalanced {}'),
    (0, 1, 'orange', 'Unbalanced {}'),
    (1, 0, 'red', 'Balanced {}'),
    (1, 1, 'green', 'Balanced {}'),
]
dt_plot = figure()
for weight_idx, criterion_idx, line_color, label_template in line_specs:
    dt_plot.line(depths,
                 accuracies[weight_idx, criterion_idx, :],
                 color=line_color,
                 legend_label=label_template.format(criteria[criterion_idx]))
dt_plot.xaxis.axis_label = 'Max depth'
dt_plot.yaxis.axis_label = 'DTC accuracy'
show(dt_plot)

print('Max accuracy: {}'.format(accuracies.max()))

Max accuracy: 0.8571428571428571
