In [188]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split

In [255]:
titanic = pd.read_csv('Datasets/Kaggle/titanic/train.csv')

In [190]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


First, I'll clean the data. Both my training and test set will need uniform preprocessing before any predictions are made.
I'll make functions that will be applicable to both sets of data

In [247]:
#fill null ages
def guess_age(df):
    avg_age = df['Age'].sum()/len(df['Age'])
    df['Age'] = df['Age'].fillna(avg_age)
    return df

#fill null fares
def guess_fare(df):
    avg_fare = df['Fare'].sum()/len(df['Fare'])
    df['Fare'] = df['Fare'].fillna(avg_fare)
    return df

#mapping sex 
def sex_map(df):
    df['Sex'] = df['Sex'].map(lambda x: 1 if x == 'female' else 0)
    return df

#making pclass columns
def pclass3(df):
    df['Pclass_3'] = df['Pclass'].map(lambda x: 1 if x == 3 else 0)
    return df

def pclass2(df):
    df['Pclass_2'] = df['Pclass'].map(lambda x: 1 if x == 2 else 0)
    return df

#embarked_dummies
def embarked_dummies(df):
    return pd.get_dummies(df, columns=['Embarked'], drop_first = True)

#droping unused columns
def dropping_columns(df):
    for col in ['PassengerId', 'Name', 'Cabin', 'Ticket', 'Pclass']:
        try:
            df.drop(col, axis=1, inplace=True)
        except:
            pass
    return df

In [256]:
titanic = guess_age(titanic)

In [257]:
titanic = guess_fare(titanic)

In [258]:
titanic = sex_map(titanic)

In [259]:
titanic = pclass2(titanic)

In [260]:
titanic = pclass3(titanic)

In [261]:
titanic = embarked_dummies(titanic)

In [262]:
titanic = dropping_columns(titanic)

Perfect. Now that I have only the columns I need for my training set. 

In [199]:
titanic.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,0,0,22.0,1,0,7.25,0,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0,0
2,1,1,26.0,0,0,7.925,0,1,0,1
3,1,1,35.0,1,0,53.1,0,0,0,1
4,0,0,35.0,0,0,8.05,0,1,0,1


I have to do the all the same procedures to my test set.

In [263]:
test = pd.read_csv('Datasets/Kaggle/titanic/test.csv')

In [201]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [264]:
test = guess_age(test)

In [265]:
test = guess_fare(test)

In [266]:
test = sex_map(test)

In [267]:
test = pclass2(test)

In [268]:
test = pclass3(test)

In [269]:
test = embarked_dummies(test)

In [270]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,0,1,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,0,1,0,1
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,1,0,1,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,0,1,0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,0,1,0,1


I'll be constructing a pipeline. I love pipelines

In [210]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [150]:
#First I'll try a Logistic Regression
lr_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('lr', LogisticRegression())
])

In [113]:
# SVM pipe
svm_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('svm', svm.SVC())
])

In [271]:
#XGBoost Classifier
xgb_pipe = Pipeline([
    ('ss', StandardScaler()),
    ('xgb', XGBClassifier())
])

Establish my target and feature variables. Define X and y on my training set

In [272]:
features = ['Sex','Age','SibSp','Parch','Fare','Pclass_2','Pclass_3','Embarked_Q','Embarked_S']
target = 'Survived'

y = titanic[target]
X = titanic[features]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

Fitting and Scoring each of my Pipeline Models

In [273]:
lr_pipe.fit(X, y)
lr_pipe.score(X, y)

0.80359147025813693

In [274]:
svm_pipe.fit(X, y)
svm_pipe.score(X, y)

0.84287317620650959

In [275]:
xgb_pipe.fit(X, y)
xgb_pipe.score(X, y)

0.87654320987654322

My XGBoost performed the best so I'll submit the predictions and see what I get

In [240]:
xgb_pipe.predict(test[features])

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [276]:
test['Survived'] = xgb_pipe.predict(test[features])

In [277]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,Survived
0,892,3,"Kelly, Mr. James",0,34.500000,0,0,330911,7.8292,,0,1,1,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.000000,1,0,363272,7.0000,,0,1,0,1,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.000000,0,0,240276,9.6875,,1,0,1,0,0
3,895,3,"Wirz, Mr. Albert",0,27.000000,0,0,315154,8.6625,,0,1,0,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.000000,1,1,3101298,12.2875,,0,1,0,1,0
5,897,3,"Svensson, Mr. Johan Cervin",0,14.000000,0,0,7538,9.2250,,0,1,0,1,0
6,898,3,"Connolly, Miss. Kate",1,30.000000,0,0,330972,7.6292,,0,1,1,0,1
7,899,2,"Caldwell, Mr. Albert Francis",0,26.000000,1,1,248738,29.0000,,1,0,0,1,0
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",1,18.000000,0,0,2657,7.2292,,0,1,0,0,1
9,901,3,"Davies, Mr. John Samuel",0,21.000000,2,0,A/4 48871,24.1500,,0,1,0,1,0


In [278]:
submission = test[['PassengerId', 'Survived']]
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [279]:
submission.to_csv('submission.csv', index=False)