In [42]:
#import required library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier



In [43]:
#reading data


train = pd.read_csv('titanictrain.csv')
test = pd.read_csv('titanictest.csv')
sample = pd.read_csv('titanicgender_submission.csv')

In [44]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [45]:

train.shape

(891, 12)

In [46]:
#checking for nulls


train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [47]:
# Going to drop those two Embarked rows since they won't affect outcome much
# Going to drop the cabin column as well. There are too many missing to make dependable predictions.
# What to do about those missing 'Age' values? I'll impute them with the mean for now.

train.dropna(subset=['Embarked'], how='any', inplace=True)

train.drop('Cabin', axis=1, inplace=True)

train.Age.fillna(train.Age.mean(), inplace=True)

In [48]:
# Encoding the 'Sex' column so we can feed it into a model.

sex_encoded = {
    'male': 1,
    'female': 0
}

train['Sex'] = train.Sex.map(sex_encoded)

In [49]:
# Let's take a look at the rest of the dtypes and see what we want to use

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Name           889 non-null object
Sex            889 non-null int64
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Ticket         889 non-null object
Fare           889 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(6), object(3)
memory usage: 83.3+ KB


In [50]:
# So I think out of those, the only object one that seems useful is maybe Embarked?
# Let's go ahead and make dummy variables of those

train['Embarked'] = train['Embarked'].astype('category')
train = pd.get_dummies(train, columns = ['Embarked'], drop_first = True)



In [51]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Name           889 non-null object
Sex            889 non-null int64
Age            889 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Ticket         889 non-null object
Fare           889 non-null float64
Embarked_Q     889 non-null uint8
Embarked_S     889 non-null uint8
dtypes: float64(2), int64(6), object(2), uint8(2)
memory usage: 78.1+ KB


In [52]:
# And then for the last one I'll go ahead and make a combined "Family_Count" by summing "SibSp" and "Parch"

train['Family_Count'] = train.SibSp + train.Parch

In [53]:
train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S,Family_Count
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,0,1,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,0,0,1
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,0,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,0,1,1
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,0,1,0


In [54]:
#Going to do the same cleaning and everything to the test set

test.shape

(418, 11)

In [55]:
test.head()


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [56]:

test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [57]:
test.Fare.fillna(test.Fare.median(), inplace= True)

test.Age.fillna(test.Age.mean(), inplace=True)

test.drop('Cabin', axis=1, inplace=True)


In [58]:
test['Sex'] = test.Sex.map(sex_encoded)

test['Embarked'] = test['Embarked'].astype('category')
test = pd.get_dummies(test, columns = ['Embarked'], drop_first = True)


test['Family_Count'] = test.SibSp + test.Parch

In [None]:
#setting our model

In [59]:
# Selecting our features and our target variable

features = [col for col in train.columns if col not in ['Survived', 'PassengerId', 'Name', 'Ticket']]
X = train[features]
y = train['Survived']

In [60]:

# Looking to see what our baseline is, in this case the majority class is about 62%

train.Survived.value_counts(normalize=True)

0    0.617548
1    0.382452
Name: Survived, dtype: float64

In [84]:
# Setting up train, test, split. Setting random_state for reproducibility

X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, stratify = y, random_state=42)

In [85]:
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [86]:

# Let's look at a DecisionTree and optimize it

dtc = DecisionTreeClassifier(random_state = 42)
dtc_params = {
    'max_depth':[None,1,2,3,4],
    'max_features':[None,'log2','sqrt',2,3,4,5],
    'min_samples_split':[2,3,4,5,10,15,20,25,30,40,50],
    'criterion':['gini', 'entropy']
}
gs_dtc = GridSearchCV(dtc, dtc_params)
gs_dtc.fit(X_train, y_train)

print(gs_dtc.best_score_)
print(gs_dtc.best_params_)
print(gs_dtc.score(X_holdout, y_holdout))

0.8213213213213213
{'criterion': 'entropy', 'max_depth': None, 'max_features': 4, 'min_samples_split': 20}
0.8071748878923767


In [87]:
dt_opti = gs_dtc.best_estimator_

In [88]:
# Let's look at a RandomForest and optimize it

rf = RandomForestClassifier(random_state=42)

rf_params = {
    'n_estimators': [5, 10, 15, 20, 25],
    'max_depth': [None, 1,2,3],
    'max_features': ['auto', 'log2']
}
gs_rf = GridSearchCV(rf, rf_params)
gs_rf.fit(X_train, y_train)

print (gs_rf.best_score_)
print (gs_rf.best_params_)
print (gs_rf.score(X_holdout, y_holdout))

0.8258258258258259
{'max_depth': None, 'max_features': 'auto', 'n_estimators': 10}
0.8116591928251121


In [89]:
rf_opti = gs_rf.best_estimator_

In [90]:
# Let's look at an ExtraTrees and optimize it with Gridsearch

extr = ExtraTreesClassifier(random_state=42)

extr_params = {
    'n_estimators': [5, 10, 15, 20, 25],
    'max_depth': [None, 1,2,3],
    'max_features': ['auto', 'log2']
}
gs_extr = GridSearchCV(extr, extr_params)
gs_extr.fit(X_train, y_train)

print (gs_extr.best_score_)
print(gs_extr.best_params_)
print(gs_extr.score(X_holdout, y_holdout))

0.8108108108108109
{'max_depth': 3, 'max_features': 'auto', 'n_estimators': 15}
0.8026905829596412


In [91]:
ext_opti = gs_extr.best_estimator_

In [92]:
# Let's look at an AdaBoost Classifier and optimize it with GridSearch

ada_clf = AdaBoostClassifier(random_state=42)

ada_params = {
#     'DecisionTreeClassifier__max_depth': [None, 1,2,3,4,5],
    'n_estimators': [5, 10, 15, 20, 25],
    'algorithm': ['SAMME', 'SAMME.R'],
    'learning_rate': [.5, .75, 1.0]
}
gs_ada = GridSearchCV(ada_clf, ada_params)
gs_ada.fit(X_train, y_train)

print (gs_ada.best_score_)
print(gs_ada.best_params_)
print(gs_ada.score(X_holdout, y_holdout))

0.8168168168168168
{'algorithm': 'SAMME.R', 'learning_rate': 0.75, 'n_estimators': 25}
0.7802690582959642


In [93]:
ada_opti = gs_ada.best_estimator_

In [104]:
# Now let's take all those models together and ensemble them for (hopefully) the best results

voting_clf = VotingClassifier(estimators=[('rf', rf_opti), ('extr', ext_opti), ('dtc', dt_opti), ('ada_clf', ada_opti)],
                              voting='soft')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_we...hm='SAMME.R', base_estimator=None,
          learning_rate=0.75, n_estimators=25, random_state=42))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [105]:
voting_clf.score(X_holdout, y_holdout)

  if diff:


0.8161434977578476

In [106]:
# So far it looks like the Random Forest model performed best on the holdout set, so let's score that on the test data

X_test = test[features]

In [107]:
predictions = rf_opti.predict(X_test)

In [108]:
# Now let's add those to the test data set and prepare it for submission

test['Survived'] = predictions

test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked_Q,Embarked_S,Family_Count,Survived
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,1,0,0,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,0,1,1,0
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,1,0,0,0
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,0,1,0,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,0,1,2,1


In [109]:
sample.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [111]:
test.shape

(418, 13)

In [113]:
test[['PassengerId', 'Survived']].to_csv('titanicresult_submission.csv', index=False)

In [114]:
submissions = pd.read_csv('titanicresult_submission.csv')
submissions.shape

(418, 2)