In [2]:
import pandas as pd # obliczenia
import numpy as np # obliczenia
import matplotlib.pyplot as plt # grafika
import seaborn as sns # grafika
import csv
import re
%matplotlib inline

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [3]:
# Import data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
# Define helper functions
def impute_emb(cols):

    if pd.isnull(cols):
        return 1
    else:
        if cols == 'S':
            return 1
        elif cols == 'C':
            return 2
        elif cols == 'Q':
            return 3

def impute_sex(cols):
    if cols == 'male':
        return 1
    elif cols == 'female':
        return 0

# Function based on results of correlation:
# train['Age'].corr(train['Pclass']) <- max cor
# train['Age'].corr(train['Fare'])
# train['Age'].corr(train['Parch'])
# train['Age'].corr(train['SibSp']) <- 2nd max cor
# train['Age'].corr(train['Embarked'])
# train[['Age','Pclass','SibSp']].groupby(['Pclass','SibSp']).median()
# With the results in mind I decided to calculate age based on 2 most correlated factors 'Pclass' and 'SibSp'
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    sibsp = cols[2]

    if pd.isnull(Age):

        if Pclass == 1:
            if sibsp == 0:
                return 37
            elif sibsp == 1:
                return 38
            elif sibsp == 2:
                return 44
            else:
                return 23
        elif Pclass == 2:
            if sibsp == 0:
                return 30
            elif sibsp == 1:
                return 29
            elif sibsp == 2:
                return 24
            else:
                return 30
        else:
            if sibsp == 0:
                return 26
            elif sibsp == 1:
                return 25
            elif sibsp == 2:
                return 20
            elif sibsp == 3:
                return 6
            elif sibsp == 4:
                return 7
            else:
                return 11

    else:
        return Age


In [5]:
# INITIAL CLEAN UP
# Delete Cabins
train.drop('Cabin',axis=1,inplace=True)
# As fare/pclass is strongly correlated, I will drop fare
train.drop('Fare',axis=1,inplace=True)
# I will drop ticket as well, it doesn't seem to mean anything
train.drop('Ticket',axis=1,inplace=True)
# Drop passengerid, completely useless
train.drop('PassengerId',axis=1,inplace=True)
# Apply helper functions to Emabarked, Sex
train['Embarked'] = train['Embarked'].apply(impute_emb)
train['Sex'] = train['Sex'].apply(impute_sex)

In [6]:
# AGE clean up
# Impute Age
train['Age'] = train[['Age','Pclass','SibSp']].apply(impute_age,axis=1)
# Convert into age categories
train['Age'] = pd.cut(train['Age'], bins=[0,17,64,100],labels=[1,2,3])

In [7]:
# NAME clean up - extract title from name
# Idea and solution taken from https://www.kaggle.com/sinakhorami/titanic-best-working-classifier
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

train['Title'] = train['Name'].apply(get_title)

train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')

# pd.crosstab(train['Title'], train['Sex'])
train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

# Mapping
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train['Title'] = train['Title'].map(title_mapping)
train['Title'] = train['Title'].fillna(0)

# Drop Name column
train.drop('Name',axis=1,inplace=True)

In [8]:
# Split into data and target
target = train['Survived']
train.drop('Survived',axis=1,inplace=True)

In [9]:
# Check correlations. Sex, Title and Pclass are the most correlated
for col in list(train):
    print("{}, {}".format(col, target.corr(train[col])))

Pclass, -0.33848103596101475
Sex, -0.543351380657755
Age, -0.10426452420931441
SibSp, -0.03532249888573559
Parch, 0.08162940708348365
Embarked, 0.10681138570891938
Title, 0.467332589152995


In [10]:
# Pick classifiers
clf4 = RidgeClassifier(tol=1e-4, solver="saga",
                               fit_intercept=True, alpha=0.05)
clf6 = GradientBoostingClassifier(n_estimators=50, max_features='auto', loss='exponential',max_depth=3)
clf8 = ExtraTreesClassifier(n_estimators=10,criterion='entropy')
clf13 = LogisticRegression(solver='newton-cg')
clf14 = GaussianNB()

In [None]:
# GRIDSEARCH classifiers to tune parameters
params = {
#clf8
#     'n_estimators':[3, 5, 10],
#     'criterion':['gini','entropy'],
#     'max_features':['auto'],
#clf6
#     'n_estimators':[10,20,30,40,50,60,70],
#     'max_features':['auto',None],
#     'max_depth':[2,3,4],
#     'loss':['deviance','exponential']
#clf4
#     'tol':[1e-4,1e-5],
#     'solver':['saga'],
#     'fit_intercept':[True],
#     'alpha':[0.05,0.1,0.5]
#clf13
#     'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#     'C':[1,2,5],
#     'tol':[1e-3,1e-4,1e-5],
}
clf = GridSearchCV(clf4, param_grid=params, n_jobs=7, verbose=10, cv=10)
clf.fit(train, target)
print("BEST:")
print(clf.best_params_)
print(clf.best_score_)

In [11]:
# Make lists for the Voting Classifier
list = [
    clf14,
    clf13,
#     clf4,
#     clf6,
    clf8,
]
list_names = [
    'naive Gaussian',
    'logistic',
#     'Rigde',
#     'GBC',
    'ExtraTrees',
]

In [12]:
# Check performance of each classifier and see results of Voting
k = 8
# Set up classifiers, see if I can get something. I will derive some stuff later.
for clf, label in zip(list, list_names):
        # scores = cross_val_score(clf, x_tfidf, y, cv=8, scoring='accuracy', n_jobs=-1)
        # print("Accuracy: %0.3f (+/- %0.3f) [%s]" % (scores.mean(), scores.std(), label))
    pred = cross_val_predict(
            clf, train,
            y=target,
            cv=k, n_jobs=-1, verbose=20
    )
    print(label)
    cm = confusion_matrix(target, pred)
    print(cm)
    accuracy = accuracy_score(target, pred)
    print(accuracy)

eclf = VotingClassifier(estimators=[(i, j) for i, j in zip(list_names, list)], voting='hard')
pred = cross_val_predict(
            eclf, train, y=target, cv=k, n_jobs=-1, verbose=20
        )
print("VOTING CLASSIFIER hard")
cr = classification_report(target, pred, target_names=['alive','dead'])
cm = confusion_matrix(target, pred)
accuracy = accuracy_score(target, pred)
# print(pred)
print(cr)
print(cm)
print(accuracy)

eclf = VotingClassifier(estimators=[(i, j) for i, j in zip(list_names, list)], voting='soft')
pred = cross_val_predict(
            eclf, train, y=target, cv=k, n_jobs=-1, verbose=20
        )
print("VOTING CLASSIFIER soft")
cr = classification_report(target, pred, target_names=['alive','dead'])
cm = confusion_matrix(target, pred)
accuracy = accuracy_score(target, pred)
# print(pred)
print(cr)
print(cm)
print(accuracy)

[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.7s remaining:    5.3s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    2.3s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    2.9s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    3.4s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    3.9s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.8s finished


naive Gaussian
[[481  68]
 [ 92 250]]
0.8204264870931538


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.7s remaining:    5.4s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    2.3s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    3.0s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    3.5s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    4.1s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.3s finished


logistic
[[484  65]
 [ 92 250]]
0.8237934904601572


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.7s remaining:    5.4s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    2.2s remaining:    3.8s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    2.8s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    3.3s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    3.9s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.9s finished


ExtraTrees
[[501  48]
 [125 217]]
0.8058361391694725


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.8s remaining:    5.6s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    2.4s remaining:    4.1s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    3.0s remaining:    3.0s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    3.6s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    4.2s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.3s finished


VOTING CLASSIFIER hard
             precision    recall  f1-score   support

      alive       0.84      0.90      0.87       549
       dead       0.81      0.73      0.77       342

avg / total       0.83      0.83      0.83       891

[[492  57]
 [ 91 251]]
0.8338945005611672


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:    1.9s remaining:    6.0s
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    2.6s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:    3.2s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    3.8s remaining:    2.2s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:    4.5s remaining:    1.4s


VOTING CLASSIFIER soft
             precision    recall  f1-score   support

      alive       0.84      0.88      0.86       549
       dead       0.80      0.73      0.76       342

avg / total       0.82      0.82      0.82       891

[[485  64]
 [ 92 250]]
0.8249158249158249


[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    5.6s finished


In [15]:
# Ultimately pick the HARD version of VotingClassifier
list = [
    clf14,
    clf13,
    clf8,
]
list_names = [
    'naive Gaussian',
    'logistic',
    'ExtraTrees',
]
my_classifier = VotingClassifier(estimators=[(i, j) for i, j in zip(list_names, list)], voting='hard')
my_classifier.fit(train,target)

VotingClassifier(estimators=[('naive Gaussian', GaussianNB(priors=None)), ('logistic', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
    ...timators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [16]:
# Make prediction for test - first clean up :)
test.drop('Cabin',axis=1,inplace=True)
# As fare/pclass is strongly correlated, I will drop fare
test.drop('Fare',axis=1,inplace=True)
# I will drop ticket as well, it doesn't seem to mean anything
test.drop('Ticket',axis=1,inplace=True)
# Drop passengerid, completely useless
test.drop('PassengerId',axis=1,inplace=True)
# Apply helper functions to Emabarked, Sex
test['Embarked'] = test['Embarked'].apply(impute_emb)
test['Sex'] = test['Sex'].apply(impute_sex)
# Impute Age
test['Age'] = test[['Age','Pclass','SibSp']].apply(impute_age,axis=1)
# Convert into age categories
test['Age'] = pd.cut(test['Age'], bins=[0,17,64,100],labels=[1,2,3])
test['Title'] = test['Name'].apply(get_title)

test['Title'] = test['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

# Mapping
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
test['Title'] = test['Title'].map(title_mapping)
test['Title'] = test['Title'].fillna(0)

# Drop Name column
test.drop('Name',axis=1,inplace=True)

In [17]:
# Predict
my_pred = my_classifier.predict(test)
# Check score
sub = pd.read_csv('gender_submission.csv')
print(accuracy_score(my_pred,sub['Survived']))
# This is currently tied for 22nd in Kaggle leaderboard :)

0.9425837320574163


  if diff:
