In [88]:
# Load in our libraries
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# import plotly.tools as tls

# Going to use these 5 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold;

In [89]:
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

passenger_ids = df_train["PassengerId"]

print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


In [90]:
print(df_train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())

   Pclass  Survived
0       1  0.629630
1       2  0.472826
2       3  0.242363


In [91]:
# Age categorization
for df in [df_train, df_test]:
    avg_age = df["Age"].mean()
    std_age = df["Age"].std()
    null_count = df["Age"].isnull().sum()
    
    age_null_list = np.random.randint(avg_age - std_age, avg_age + std_age, size=null_count)
    df["Age"][np.isnan(df["Age"])] = age_null_list
    df["Age"] = df["Age"].astype(int)

df_train["CategoricalAge"] = pd.cut(df_train["Age"], 6, labels=range(0, 6))
print("max age: %d"%(df_train["Age"].max()))
print("min age: %d"%(df_train["Age"].min()))
print(df_train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean())

max age: 80
min age: 0
  CategoricalAge  Survived
0              0  0.591549
1              1  0.338608
2              2  0.390533
3              3  0.387931
4              4  0.348837
5              5  0.142857


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [92]:
#Fare categorization
for df in [df_train, df_test]:
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

df_train["CategoricalFare"] = pd.qcut(df_train["Fare"], 5, labels=range(0, 5))

print(df_train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean())

  CategoricalFare  Survived
0               0  0.217877
1               1  0.201087
2               2  0.424419
3               3  0.444444
4               4  0.642045


In [93]:
#Embarked preprocessing and mapping
for df in [df_train, df_test]:
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)

print(df_train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean())


  Embarked  Survived
0        C  0.553571
1        Q  0.389610
2        S  0.339009


In [94]:
#Final data processing incl sex
df_train['Sex'] = df_train['Sex'].map({'male': 0, 'female': 1}).astype(int)

drop_fields = ['Name', 'Age', 'Ticket', 'Fare', 'Cabin']
df_train = df_train.drop(drop_fields, axis=1)

In [75]:
#Feature importance helper function
def plot_feature_importances(model, X_train):
    n_features = X_train.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), list(X_train))
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

In [79]:
#Training the classifier
clf = GradientBoostingClassifier(random_state=0, learning_rate=0.18, max_depth=3)
X_train = df_train.drop(['Survived'], axis=1)
y_train = df_train['Survived']

clf.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(clf.score(df_train.drop(['Survived'], axis=1), y_train)))
# print("Accuracy on training set: {:.3f}".format(clf.score(X_test, y_test)))

Accuracy on training set: 0.920
