# Trees
Use decision trees to solve the challenge.

In [25]:
import dataset as ds

# Load the training and evaluation splits from the project-local `dataset` module.
train_df, eval_df = ds.get_formatted_splits()
# NOTE: a cell only displays its last expression, so this `dtypes` lookup is evaluated and discarded.
train_df.dtypes
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,TicketNumber,Floor
857,858,1,1,male,51.0,0,0,26.55,S,Mr,113055.0,E
52,53,1,1,female,49.0,1,0,76.7292,C,Mrs,17572.0,D
386,387,0,3,male,1.0,5,2,46.9,S,Master,2144.0,Unknown
124,125,0,1,male,54.0,0,1,77.2875,S,Mr,35281.0,D
578,579,0,3,female,,1,0,14.4583,C,Mrs,2689.0,Unknown


## Approach 1: likelihood on gender
Let's try something naive: predict the probability of surviving given a binary variable like gender.

In [13]:
def get_prob_of_surviving_based_on_gender(df):
    """Summarise survival per gender.

    Rows with a missing 'Sex' are ignored. The result is indexed by 'Sex' and
    has three columns:

    * 'Prob_surviving': mean of 'Survived' within the group,
    * 'Total': number of non-null 'Survived' values in the group,
    * 'Fraction': the group's share of the summed 'Total'.
    """
    survived_by_gender = df.dropna(subset=['Sex']).groupby('Sex')['Survived']
    summary = survived_by_gender.agg(Prob_surviving='mean', Total='count')
    summary['Fraction'] = summary['Total'] / summary['Total'].sum()
    return summary

In [14]:
# Per-gender survival statistics computed on the training split only.
prob_survival_on_gender_train = get_prob_of_surviving_based_on_gender(train_df)
prob_survival_on_gender_train

Unnamed: 0_level_0,Prob_surviving,Total,Fraction
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
male,0.196078,408,0.654896
female,0.753488,215,0.345104


Let's try to apply these probabilities to the evaluation data: if the probability to survive is higher than 0.5, then we classify that passenger as "survived".

In [15]:
# Add a boolean 'Prediction' column to the table in place: True where the
# training survival rate of that gender is above 0.5.
prob_survival_on_gender_train['Prediction'] = prob_survival_on_gender_train['Prob_surviving'] > 0.5
prob_survival_on_gender_train

Unnamed: 0_level_0,Prob_surviving,Total,Fraction,Prediction
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
male,0.196078,408,0.654896,False
female,0.753488,215,0.345104,True


In [16]:
def evaluate_gender_likelihood_on_set(df):
    """Return the accuracy of the gender-only rule on `df`.

    Each passenger's 'Sex' is looked up in the notebook-level
    `prob_survival_on_gender_train` table to fetch its 'Prediction' flag. The
    returned value is the fraction of rows whose flag equals 'Survived'.
    """
    def _predict(gender):
        # Row lookup by gender, then the 'Prediction' entry of that row.
        return prob_survival_on_gender_train.loc[gender]['Prediction']

    is_correct = df['Sex'].apply(_predict) == df['Survived']
    return is_correct.mean()

# Score the gender-only rule on the split it was derived from...
train_accuracy = evaluate_gender_likelihood_on_set(train_df)
print('Accuracy on training set: {}'.format(train_accuracy))

# ...and on the evaluation split, using the same training-derived table.
eval_accuracy = evaluate_gender_likelihood_on_set(eval_df)
print('Accuracy on evaluation set: {}'.format(eval_accuracy))

Accuracy on training set: 0.7865168539325843
Accuracy on evaluation set: 0.7873134328358209


## Approach 2: decision tree on gender
Just to double check: we should get the same results as approach 1.

In [17]:
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Encode the gender labels as integers; the encoder is fitted on the training split only.
gender_encoder = LabelEncoder().fit(train_df['Sex'])

# The tree expects a 2-D feature matrix, hence the reshape to a single column.
train_gender_codes = gender_encoder.transform(train_df['Sex']).reshape(-1, 1)
tree_classifier = DecisionTreeClassifier().fit(train_gender_codes, train_df['Survived'])

eval_gender_codes = gender_encoder.transform(eval_df['Sex']).reshape(-1, 1)
accuracy = tree_classifier.score(eval_gender_codes, eval_df['Survived'])
print('Accuracy: {}'.format(accuracy))

Accuracy: 0.7873134328358209


## Approach 3: decision tree on single features
This cannot possibly be better than combining all the attributes, but I am curious to see which one is more significant.

In [18]:
def tree_classification_on_attribute(attribute_name):
    """Train a decision tree on one categorical column and return its evaluation accuracy.

    The label encoder is fitted on the categories declared by the training
    column's categorical dtype. The tree is fitted on the notebook-level
    `train_df` and scored on `eval_df`.
    """
    encoder = LabelEncoder().fit(train_df[attribute_name].dtype.categories)

    def _as_feature_column(frame):
        # Encode the attribute and shape it as a single-column feature matrix.
        return encoder.transform(frame[attribute_name]).reshape(-1, 1)

    classifier = DecisionTreeClassifier().fit(_as_feature_column(train_df), train_df['Survived'])
    return classifier.score(_as_feature_column(eval_df), eval_df['Survived'])

# Try on categorical attributes first.
# `categorical_columns` is reused by later cells that combine these attributes.
categorical_columns = ['Sex', 'Pclass', 'Embarked', 'Title', 'Floor']
# Fit and score one single-feature tree per attribute.
for attribute_name in categorical_columns:
    print('Accuracy on {}: {}'.format(attribute_name, tree_classification_on_attribute(attribute_name)))

Accuracy on Sex: 0.7873134328358209
Accuracy on Pclass: 0.6940298507462687
Accuracy on Embarked: 0.6417910447761194
Accuracy on Title: 0.7910447761194029
Accuracy on Floor: 0.7126865671641791


## Approach 4: tree combining the categorical attributes

In [27]:
# DecisionTreeClassifier cannot handle categorical data, so every categorical
# column is replaced by its integer category codes before fitting and scoring.
train_category_codes = train_df[categorical_columns].apply(lambda column: column.cat.codes)
eval_category_codes = eval_df[categorical_columns].apply(lambda column: column.cat.codes)

dtc = DecisionTreeClassifier().fit(train_category_codes, train_df['Survived'])
acc = dtc.score(eval_category_codes, eval_df['Survived'])
print('Accuracy combining categorical attributes: {}'.format(acc))

In [46]:
import numpy as np
from bokeh.plotting import figure, show, output_notebook

# Configure Bokeh so that `show(...)` renders figures inline in the notebook.
output_notebook()

# Quick experiment on tree parameters.
def _compute_dtc_score(criterion, max_depth, class_weight):
    """Fit a tree on the categorical columns of `train_df` and return its accuracy on `eval_df`.

    `criterion`, `max_depth` and `class_weight` are passed straight through to
    `DecisionTreeClassifier`.
    """
    def _category_codes(frame):
        # Integer category codes for every column in `categorical_columns`.
        return frame[categorical_columns].apply(lambda x: x.cat.codes)

    classifier = DecisionTreeClassifier(class_weight=class_weight,
                                        criterion=criterion,
                                        max_depth=max_depth)
    fitted = classifier.fit(_category_codes(train_df), train_df['Survived'])
    return fitted.score(_category_codes(eval_df), eval_df['Survived'])

criteria = ['entropy', 'gini']
max_allowed_depth = 20

# accuracies[class-weight index, criterion index, depth index]
accuracies = np.zeros((2, len(criteria), max_allowed_depth))
for idx0, class_weight in enumerate([None, 'balanced']):
    for idx1, criterion in enumerate(criteria):
        for idx2, max_depth in enumerate(range(1, max_allowed_depth + 1)):
            accuracies[idx0, idx1, idx2] = _compute_dtc_score(criterion, max_depth, class_weight)

# One line per (class weight, criterion) pair, drawn in a fixed order and colour.
line_specs = [
    (0, 0, 'blue', 'Unbalanced {}'),
    (0, 1, 'orange', 'Unbalanced {}'),
    (1, 0, 'red', 'Balanced {}'),
    (1, 1, 'green', 'Balanced {}'),
]
dt_plot = figure()
for weight_idx, criterion_idx, line_color, label_template in line_specs:
    dt_plot.line(range(1, max_allowed_depth + 1),
                 accuracies[weight_idx, criterion_idx, :],
                 color=line_color,
                 legend_label=label_template.format(criteria[criterion_idx]))
dt_plot.xaxis.axis_label = 'Max depth'
dt_plot.yaxis.axis_label = 'DTC accuracy'
show(dt_plot)

print('Max accuracy: {}'.format(np.max(accuracies)))

Max accuracy: 0.8208955223880597


In [22]:
# NOTE(review): the saved output of this cell shows integer-coded Pclass/Sex/Embarked/Title/Floor
# values, unlike the first preview at the top of the notebook. No visible cell performs that
# conversion, so this output presumably comes from leftover kernel state - confirm with
# Restart & Run All.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,TicketNumber,Floor
857,858,1,1,0,51.0,0,0,26.55,0,2,113055.0,3
52,53,1,1,1,49.0,1,0,76.7292,1,11,17572.0,1
386,387,0,0,0,1.0,5,2,46.9,0,15,2144.0,4
124,125,0,1,0,54.0,0,1,77.2875,0,2,35281.0,1
578,579,0,0,1,,1,0,14.4583,1,11,2689.0,4


## Approach 3: combine gender and class

In [None]:
def get_gender_and_class_based_samples_and_labels(df):
    """Build the (Pclass, Sex) feature matrix and the 'Survived' label vector.

    'male' is replaced by 1 and 'female' by 0. Returns `(samples, labels)`:
    `samples` holds the 'Pclass' and 'Sex' columns, `labels` holds 'Survived'.
    """
    encoded = df[['Pclass', 'Sex', 'Survived']].replace({'male': 1, 'female': 0})
    matrix = encoded.values
    samples, labels = matrix[:, :2], matrix[:, 2]

    # Optional experiment: see what happens to the accuracy if one of the
    # variables is set to random values.
    # samples[:, 0] = np.random.randint(3, size=len(samples))
    # samples[:, 1] = np.random.randint(2, size=len(samples))

    return samples, labels

# Fit a tree on (Pclass, Sex) from the training split.
train_samples, train_labels = get_gender_and_class_based_samples_and_labels(train_df)
# Use the class imported via `from sklearn.tree import DecisionTreeClassifier`;
# no `tree` module name is imported in this notebook, so `tree.DecisionTreeClassifier`
# would raise NameError.
tree_classifier_3 = DecisionTreeClassifier().fit(train_samples, train_labels)

# Score it on the evaluation split.
eval_samples, eval_labels = get_gender_and_class_based_samples_and_labels(eval_df)
accuracy_3 = tree_classifier_3.score(eval_samples, eval_labels)
print('Accuracy 3: {}'.format(accuracy_3))

We get the same accuracy as using the gender only. This means that all the classification power is carried by the gender only: the class does not add any information.
Let's draw a matrix to correlate gender and class.

In [None]:
# Sex (rows) x Pclass (columns) table holding, for each cell, the count and the mean of 'Survived'.
class_vs_gender_df = train_df.pivot_table(index='Sex', columns='Pclass', values='Survived', aggfunc=['count', 'mean'])
class_vs_gender_df

If we threshold the classification on 0.5, all the women would be classified as "Survived" and all the men as "Not survived".
All the information is "contained" in the gender.

## Approach 4: use the continuous variable "Age"

Remove rows with a missing age, keep only passengers older than 1 year, and round fractional (e.g. half-year) ages to whole years.

In [None]:
def extract_age_df(df):
    """Return the 'Survived' and 'Age' columns, cleaned for the age experiments.

    Rows with a missing value in either column are dropped, only rows with
    'Age' strictly greater than 1 are kept, and 'Age' is rounded to zero
    decimals.
    """
    known = df[['Survived', 'Age']].dropna(axis='index', how='any')
    older_than_one = known[known['Age'] > 1]
    return older_than_one.round({'Age': 0})

# Apply the same age clean-up to both splits.
age_train_df = extract_age_df(train_df)
age_eval_df = extract_age_df(eval_df)

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, FactorRange

# Scatter plot of survival (0/1) against age on the cleaned training ages.
source = ColumnDataSource(data=age_train_df)

output_notebook()

age_plot = figure()
age_plot.circle(x='Age', y='Survived', source=source)
age_plot.xaxis.axis_label = 'Age (years)'
age_plot.yaxis.axis_label = 'Survived'
show(age_plot)

In [None]:
# `pd.cut` is used below, but pandas is not imported by any earlier cell of this
# notebook, so it is imported here to keep the cell runnable on a fresh kernel.
import pandas as pd

# Bin edges for the age histogram: 0, 10, ..., 100.
min_bin = 0
max_bin = 100
num_bins = 10
step = (max_bin - min_bin) // num_bins
bins = range(min_bin, max_bin + 1, step)

# For each age interval: number of passengers and fraction that survived.
hist_train_age_df = age_train_df.groupby([pd.cut(age_train_df.Age, bins)])['Survived'].agg(['count', 'mean'])
hist_train_age_df

In [None]:
# Bar chart of the per-bin survival rate: each bar spans one age interval
# (left/right edges taken from the interval categories of the index).
age_hist = figure()
age_hist.quad(bottom=0,
              top=hist_train_age_df['mean'],
              left=hist_train_age_df.index.categories.left,
              right=hist_train_age_df.index.categories.right,
              fill_color='red',
              line_color='black')
age_hist.xaxis.axis_label = 'Age (years)'
age_hist.yaxis.axis_label = 'Prob of surviving'
show(age_hist)

The probability of surviving does not seem linear with the age: we exclude Logistic Regression. What about SVM?

In [None]:
from sklearn.svm import SVC

# Single-feature training matrix (age) and labels, taken from the cleaned age frame.
train_ages = np.array(age_train_df['Age']).reshape(-1, 1)
train_labels = np.array(age_train_df['Survived'])

# SVC hyper-parameters.
# NOTE: per the scikit-learn SVC documentation, `degree` is only used by the
# 'poly' kernel, so it has no effect with kernel='rbf'.
C = 1.0 # default is 1.0
kernel = 'rbf' # default is "rbf"
degree = 5 # default is 3
max_iter = -1 # default is -1
gamma = 'scale'
svm_classifier = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, max_iter=max_iter).fit(train_ages, train_labels)

In [None]:
# Score the age-only SVM on the cleaned evaluation ages.
eval_ages = np.array(age_eval_df['Age']).reshape(-1, 1)
eval_labels = np.array(age_eval_df['Survived'])
accuracy_4 = svm_classifier.score(eval_ages, eval_labels)
accuracy_4

Not great. The best we get is 0.6298076923076923 using:
```
C = 1.0
kernel = 'rbf'
max_iter = -1
gamma = 'scale'
```

## Approach 5: maybe the fare alone does better?

In [None]:
# Scatter plot of survival (0/1) against fare on the full training split.
# Rebinds `source`, which earlier held the age data.
source = ColumnDataSource(data=train_df)

fare_plot = figure()
fare_plot.circle(x='Fare', y='Survived', source=source)
fare_plot.xaxis.axis_label = 'Fare ($)'
fare_plot.yaxis.axis_label = 'Survived'
show(fare_plot)

There is a slight correlation: shall we give Logistic Regression a try?

In [None]:
from sklearn.linear_model import LogisticRegression

# Single-feature training matrix (fare) and labels from the full training split.
# Rebinds `train_labels`, previously the labels of the age-filtered rows.
train_fares = np.array(train_df['Fare']).reshape(-1, 1)
train_labels = np.array(train_df['Survived'])

lr_fare_classifier = LogisticRegression().fit(train_fares, train_labels)

# Score on the evaluation split; `eval_labels` is rebound as well.
eval_fares = np.array(eval_df['Fare']).reshape(-1, 1)
eval_labels = np.array(eval_df['Survived'])
accuracy_5 = lr_fare_classifier.score(eval_fares, eval_labels)
accuracy_5

Does SVM perform as well?

In [None]:
# SVC on the fare feature, reusing `train_fares`/`eval_fares` and the labels from the previous cell.
# NOTE: per the scikit-learn SVC documentation, `degree` is only used by the
# 'poly' kernel, so it has no effect with kernel='rbf'.
C = 0.1 # default is 1.0
kernel = 'rbf' # default is "rbf"
degree = 5 # default is 3
max_iter = -1 # default is -1
gamma = 'scale'
svm_fare_classifier = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, max_iter=max_iter).fit(train_fares, train_labels)
accuracy_5B = svm_fare_classifier.score(eval_fares, eval_labels)
accuracy_5B

## Approach 6: age and fare together

In [None]:
# Split the training rows by outcome so each class gets its own marker and colour.
survived_source = ColumnDataSource(data=train_df[train_df['Survived'] == 1])
not_survived_source = ColumnDataSource(data=train_df[train_df['Survived'] == 0])

# Fare against age: blue circles for survivors, orange squares for non-survivors.
fare_vs_age_plot = figure()
fare_vs_age_plot.circle(x='Age', y='Fare', alpha=0.5, color='blue', source=survived_source)
fare_vs_age_plot.square(x='Age', y='Fare', alpha=0.5, color='orange', source=not_survived_source)
fare_vs_age_plot.xaxis.axis_label = 'Age (years)'
fare_vs_age_plot.yaxis.axis_label = 'Fare ($)'
show(fare_vs_age_plot)

In [None]:
# Select the columns this experiment uses BEFORE dropping NaNs, so that a row is
# discarded only when its Age, Fare or Survived is missing - not when an unrelated
# column is. This mirrors extract_age_df / extract_family_df and avoids recomputing
# the same dropna for features and labels.
age_fare_columns = ['Age', 'Fare', 'Survived']
train_age_fare_df = train_df[age_fare_columns].dropna(axis='index', how='any')
eval_age_fare_df = eval_df[age_fare_columns].dropna(axis='index', how='any')

train_ages_fares = np.array(train_age_fare_df[['Age', 'Fare']])
train_labels = np.array(train_age_fare_df['Survived'])

eval_ages_fares = np.array(eval_age_fare_df[['Age', 'Fare']])
eval_labels = np.array(eval_age_fare_df['Survived'])

# NOTE: per the scikit-learn SVC documentation, `degree` is only used by the
# 'poly' kernel, so it has no effect with kernel='rbf'.
C = 0.1 # default is 1.0
kernel = 'rbf' # default is "rbf"
degree = 5 # default is 3
max_iter = -1 # default is -1
gamma = 'scale'
svm_age_fare_classifier = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, max_iter=max_iter).fit(train_ages_fares, train_labels)
accuracy_6 = svm_age_fare_classifier.score(eval_ages_fares, eval_labels)
accuracy_6

We do worse than using only the fare. My hypothesis is that the fare contains all the information and removing the samples with invalid age makes the accuracy drop because we lose those fare samples too.

However, we can recognize some regions populated mostly by blue or orange dots. KNN usually does well in these cases.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def _use_knn(n_neighbors, weights):
    """Fit a KNN classifier on the (age, fare) training samples and return its evaluation accuracy.

    Reads the notebook-level `train_ages_fares`, `train_labels`,
    `eval_ages_fares` and `eval_labels` arrays.
    """
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
    knn = knn.fit(train_ages_fares, train_labels)
    return knn.score(eval_ages_fares, eval_labels)

# Sweep the neighbour count (1..max_num_neighbours) for both weighting schemes.
max_num_neighbours = 20
weights_str = ['uniform', 'distance']
# knn_accuracies[weighting index, neighbour-count index]
knn_accuracies = np.zeros((len(weights_str), max_num_neighbours))
for row_idx, weights in enumerate(weights_str):
    for col_idx, n_neighbors in enumerate(range(1, max_num_neighbours + 1)):
        knn_accuracies[row_idx, col_idx] = _use_knn(n_neighbors, weights)

# One accuracy curve per weighting scheme.
knn_plot = figure()
knn_plot.line(range(1, max_num_neighbours + 1), knn_accuracies[0, :], color='blue', legend_label=weights_str[0])
knn_plot.line(range(1, max_num_neighbours + 1), knn_accuracies[1, :], color='orange', legend_label=weights_str[1])
knn_plot.xaxis.axis_label = 'Num neighbours'
knn_plot.yaxis.axis_label = 'KNN accuracy'
show(knn_plot)

print('Max KNN accuracy: {}'.format(np.max(knn_accuracies)))

## Approach 7: use the number of family members

In [None]:
def extract_family_df(df):
    """Return the 'Survived', 'SibSp' and 'Parch' columns, without rows that have a missing value in any of them."""
    family_columns = ['Survived', 'SibSp', 'Parch']
    return df[family_columns].dropna(axis='index', how='any')

# One row per (SibSp, Parch) combination in the training split, with the group
# size, the number of survivors and the survival rate.
train_family_df = (
    extract_family_df(train_df)
    .groupby(['SibSp', 'Parch'])['Survived']
    .agg(Total='count', NumSurvived='sum', ProbSurviving='mean')
    .reset_index()
)
train_family_df

In [None]:
# Overwrite, in place, the value at row 0 / column position 2 with 0.
# NOTE(review): after the reset_index() in the previous cell, position 2 is presumably the
# 'Total' of the first (SibSp, Parch) group, zeroed so that it does not dominate the marker
# sizes in the plot below - TODO confirm. Re-running this cell is harmless, but the table no
# longer reflects the real count for that group.
train_family_df.iloc[0, 2] = 0
train_family_df

In [None]:
from bokeh.transform import linear_cmap

output_notebook()

family_source = ColumnDataSource(data=train_family_df)

# Map the survival rate (0..1) of each group onto the Turbo256 palette.
color_mapper = linear_cmap(field_name='ProbSurviving', palette='Turbo256' ,low=0, high=1)
# color_mapper = LinearColorMapper(palette='Magma256', low=0, high=1)

# One circle per (SibSp, Parch) group: size taken from 'Total', colour from 'ProbSurviving'.
family_plot = figure()
family_plot.circle(x='SibSp', y='Parch', alpha=0.5, size='Total', color=color_mapper, source=family_source)
family_plot.xaxis.axis_label = '# siblings/spouses'
family_plot.yaxis.axis_label = '# parents/children'

show(family_plot)

In [None]:
# Rebind `train_family_df` from the aggregated per-group table back to one row per passenger.
train_family_df = extract_family_df(train_df)
# Features are (SibSp, Parch); `train_samples`/`train_labels` reuse names from earlier cells.
train_samples = train_family_df[['SibSp', 'Parch']].values
train_labels = train_family_df['Survived'].values

# Same extraction for the evaluation split.
eval_family_df = extract_family_df(eval_df)
eval_samples = eval_family_df[['SibSp', 'Parch']].values
eval_labels = eval_family_df['Survived'].values

def _use_dt(criterion, max_depth, class_weight):
    """Fit a decision tree on the family features and return its evaluation accuracy.

    `criterion`, `max_depth` and `class_weight` are passed straight through to
    `DecisionTreeClassifier`. Reads the notebook-level `train_samples`,
    `train_labels`, `eval_samples` and `eval_labels`.
    """
    # Use the class imported via `from sklearn.tree import DecisionTreeClassifier`;
    # no `tree` name is imported in this notebook, so `tree.DecisionTreeClassifier`
    # would raise NameError.
    classifier = DecisionTreeClassifier(class_weight=class_weight, criterion=criterion, max_depth=max_depth)
    return classifier.fit(train_samples, train_labels).score(eval_samples, eval_labels)

criterions = ['entropy', 'gini']
max_allowed_depth = 10

# dt_accuracies[class-weight index, criterion index, depth index]
dt_accuracies = np.zeros((2, len(criterions), max_allowed_depth))
for ch_idx, class_weight in enumerate([None, 'balanced']):
    for row_idx, criterion in enumerate(criterions):
        for col_idx, max_depth in enumerate(range(1, max_allowed_depth + 1)):
            dt_accuracies[ch_idx, row_idx, col_idx] = _use_dt(criterion, max_depth, class_weight)

# The x values must cover the depths actually evaluated (1..max_allowed_depth).
# The previous range was copied from the KNN cell and used `max_num_neighbours`,
# giving 20 x values for 10 accuracies per line.
depths = range(1, max_allowed_depth + 1)

dt_plot = figure()
dt_plot.line(depths, dt_accuracies[0, 0, :], color='blue', legend_label='Unbalanced {}'.format(criterions[0]))
dt_plot.line(depths, dt_accuracies[0, 1, :], color='orange', legend_label='Unbalanced {}'.format(criterions[1]))
dt_plot.line(depths, dt_accuracies[1, 0, :], color='red', legend_label='Balanced {}'.format(criterions[0]))
dt_plot.line(depths, dt_accuracies[1, 1, :], color='green', legend_label='Balanced {}'.format(criterions[1]))
dt_plot.xaxis.axis_label = 'Max depth'
dt_plot.yaxis.axis_label = 'DT accuracy'
show(dt_plot)

print('Max DT accuracy: {}'.format(np.max(dt_accuracies)))

In [None]:
# SVC on the (SibSp, Parch) features prepared in the decision-tree cell above.
# NOTE: per the scikit-learn SVC documentation, `degree` is only used by the
# 'poly' kernel, so it has no effect with kernel='rbf'.
C = 0.1 # default is 1.0
kernel = 'rbf' # default is "rbf"
degree = 5 # default is 3
max_iter = -1 # default is -1
gamma = 'scale'
family_svm_classifier = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, max_iter=max_iter).fit(train_samples, train_labels)
accuracy_7B = family_svm_classifier.score(eval_samples, eval_labels)
print('Accuracy 7B: {}'.format(accuracy_7B))

Not great, but classes are highly imbalanced. Let's try to weight the samples.