In [108]:
# Load in our libraries
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# import plotly.tools as tls

# Going to use these 5 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold;

In [109]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

passenger_ids = df_train["PassengerId"]

print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [110]:
print(df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [111]:
# Age categorization
for df in [df_train, df_test]:
    avg_age = df["Age"].mean()
    std_age = df["Age"].std()
    null_count = df["Age"].isnull().sum()
    
    age_null_list = np.random.randint(avg_age - std_age, avg_age + std_age, size=null_count)
    df["Age"][np.isnan(df["Age"])] = age_null_list
    df["Age"] = df["Age"].astype(int)
    
    df["CategoricalAge"] = pd.cut(df["Age"], 6, labels=range(0, 6))
    
print("max age: %d"%(df_train["Age"].max()))
print("min age: %d"%(df_train["Age"].min()))
print(df_train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())

max age: 80
min age: 0
  CategoricalAge  Survived
0              0  0.591549
1              1  0.339564
2              2  0.402985
3              3  0.350877
4              4  0.348837
5              5  0.142857


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [112]:
#Fare categorization
for df in [df_train, df_test]:
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())
    df["CategoricalFare"] = pd.qcut(df["Fare"], 5, labels=range(0, 5))

print(df_train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean())

  CategoricalFare  Survived
0               0  0.217877
1               1  0.201087
2               2  0.424419
3               3  0.444444
4               4  0.642045


In [113]:
#Embarked preprocessing and mapping
for df in [df_train, df_test]:
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)

print(df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())


   Embarked  Survived
0         0  0.553571
1         1  0.389610
2         2  0.339009


In [114]:
#Create a new feature called HasCabin

for df in [df_train, df_test]:
    df['HasCabin'] = df['Cabin'].apply(lambda x: 0 if type(x) == float else 1)

In [115]:
#Create a new feature called FamilySize as a combination of siblings on board and parents/children
#Also make a feature called isAlone if FamilySize == 1 then 1, else 0

for df in [df_train, df_test]:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 0
    df['IsAlone'] = df['FamilySize'].apply(lambda x: 1 if x == 1 else 0)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,CategoricalAge,CategoricalFare,HasCabin,FamilySize,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,2,1,0,0,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,0,2,4,1,2,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,2,1,1,0,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,2,2,4,1,2,0
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,2,2,1,0,1,1


In [116]:
#Final data processing incl sex
for df in [df_train, df_test]:
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)

# drop_fields = ['PassengerId','Name', 'Age', 'Ticket', 'Fare', 'Cabin']
drop_fields = ['PassengerId','Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Parch']

df_train = df_train.drop(drop_fields, axis=1)

In [117]:
#Training the classifier
clf = GradientBoostingClassifier(random_state=0, learning_rate=0.18, max_depth=3)
# clf = SVC()
X_train = df_train.drop(['Survived'], axis=1)
y_train = df_train['Survived']

clf.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(clf.score(X_train, y_train)))
# print("Accuracy on training set: {:.3f}".format(clf.score(X_test, y_test)))

Accuracy on training set: 0.871


In [85]:
importances = clf.fit(X_train,y_train).feature_importances_
for i, val in enumerate(list(X_train.columns.values)):
    print("%s: %f"%(val, importances[i]))

Pclass: 0.099695
Sex: 0.100131
SibSp: 0.077252
Parch: 0.043602
Embarked: 0.078126
CategoricalAge: 0.144615
CategoricalFare: 0.218078
HasCabin: 0.040082
FamilySize: 0.186264
IsAlone: 0.012155


In [86]:
X_test = df_test.drop(drop_fields, axis=1)

res = clf.predict(X_test)
df_result = pd.DataFrame()
df_result["PassengerId"] = df_test["PassengerId"]
df_result["Survived"] = res.ravel()
df_result.to_csv("results.csv", sep=",", index=False)