In [31]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from mlxtend.evaluate import bias_variance_decomp
from sklearn.model_selection import train_test_split

import os
print(os.listdir("../data/titanic"))

['gender_submission.csv', 'test.csv', 'train.csv']


In [14]:
train_set = pd.read_csv('../data/titanic/train.csv')
test_set = pd.read_csv('../data/titanic/test.csv')

In [15]:
# Drop Features
train_set = train_set.drop(['Cabin', 'Ticket'], axis=1)
test_set = test_set.drop(['Cabin', 'Ticket'], axis=1)

# Create Features
train_set['Title'] = train_set.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
test_set['Title'] = test_set.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

combine=[train_set,test_set]
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train_set['Title'] = train_set['Title'].map(title_mapping)
train_set['Title'] = train_set['Title'].fillna(0)
test_set['Title'] = test_set['Title'].map(title_mapping)
test_set['Title'] = test_set['Title'].fillna(0)

train_set = train_set.drop(['Name', 'PassengerId'], axis=1)
test_set = test_set.drop(['Name', 'PassengerId'], axis=1)

combine = [train_set, test_set]
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

guess_ages = np.zeros((2,3))

for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) & \
                                  (dataset['Pclass'] == j+1)]['Age'].dropna()

            age_guess = guess_df.median()

            # Convert random age float to nearest .5 age
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
            
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
                    'Age'] = guess_ages[i,j].astype(int)

    dataset['Age'] = dataset['Age']

train_set['AgeBand'] = pd.cut(train_set['Age'], 5)
train_set[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)


for dataset in combine:    
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age']

train_set = train_set.drop(['AgeBand'], axis=1)
combine = [train_set, test_set]

combine = [train_set, test_set]
for dataset in combine:
    dataset['FamilySize'] = dataset['Parch'] + dataset['SibSp'] + 1
train_set[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived')

for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

train_set = train_set.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_set = test_set.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [train_set, test_set]

for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass

freq_port = train_set.Embarked.dropna().mode()[0]
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
test_set['Fare'].fillna(test_set['Fare'].dropna().median(), inplace=True)
train_set['FareBand'] = pd.qcut(train_set['Fare'], 4)
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)

train_set = train_set.drop(['FareBand'], axis=1)
combine = [train_set, test_set]
    
X_train = train_set.drop("Survived", axis=1)
Y_train = train_set["Survived"]

X_train.shape, Y_train.shape

((891, 8), (891,))

### Train-test split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((712, 8), (179, 8), (712,), (179,))

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

rf = GradientBoostingClassifier(n_estimators=150, max_depth=5, min_samples_split=2)
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))


avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.8721910112359551
0.8268156424581006
Average expected loss: 0.197
Average bias: 0.179
Average variance: 0.086


In [34]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=150, max_depth=5, min_samples_split=2)
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))


avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.8441011235955056
0.8156424581005587
Average expected loss: 0.176
Average bias: 0.168
Average variance: 0.054


In [36]:
from sklearn.linear_model import LogisticRegression

rf = LogisticRegression()
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))


avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.7963483146067416
0.7988826815642458
Average expected loss: 0.195
Average bias: 0.196
Average variance: 0.040


In [37]:
from sklearn.svm import SVC

rf = SVC()
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.7823033707865169
0.7821229050279329
Average expected loss: 0.206
Average bias: 0.218
Average variance: 0.036


In [39]:
from sklearn.neighbors import KNeighborsClassifier

rf = KNeighborsClassifier()
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.8539325842696629
0.7988826815642458
Average expected loss: 0.199
Average bias: 0.190
Average variance: 0.101


In [49]:
from sklearn.ensemble import AdaBoostClassifier

rf = AdaBoostClassifier()
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.8174157303370787
0.7877094972067039
Average expected loss: 0.208
Average bias: 0.207
Average variance: 0.060


In [40]:
from sklearn.tree import DecisionTreeClassifier

rf = DecisionTreeClassifier()
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.8721910112359551
0.8268156424581006
Average expected loss: 0.201
Average bias: 0.173
Average variance: 0.088


In [41]:
from sklearn.naive_bayes import GaussianNB

rf = GaussianNB()
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.7275280898876404
0.7150837988826816
Average expected loss: 0.356
Average bias: 0.291
Average variance: 0.131


In [43]:
from sklearn.naive_bayes import BernoulliNB

rf = BernoulliNB()
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.7851123595505618
0.7821229050279329
Average expected loss: 0.226
Average bias: 0.218
Average variance: 0.018


In [44]:
from sklearn.naive_bayes import MultinomialNB

rf = MultinomialNB()
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)

0.7359550561797753
0.7486033519553073
Average expected loss: 0.250
Average bias: 0.251
Average variance: 0.026


In [53]:
import xgboost as xgb

rf = xgb.XGBClassifier()
rf.fit(X_train, y_train)
rf.predict(X_test)
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        rf, X_train.values, y_train.values, X_test.values, y_test.values, 
        loss='0-1_loss',
        random_seed=123)

print('Average expected loss: %.3f' % avg_expected_loss)
print('Average bias: %.3f' % avg_bias)
print('Average variance: %.3f' % avg_var)



0.8693820224719101
0.8324022346368715














Average expected loss: 0.196
Average bias: 0.173
Average variance: 0.082


In [56]:
Y_train.value_counts()

0    549
1    342
Name: Survived, dtype: int64