In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

# Show which files are available in the input directory.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

# Read in data

In [None]:
# Load the training and test sets from their CSV files.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)
# Keep both frames together so they can be iterated over as a pair.
combine = [train_df, test_df]

In [None]:
# Show the first five rows of the raw training data.
train_df.head(5)

# Preprocessing 
### Remove survived column

In [None]:
# Separate the target ("Survived") from the features.
# `drop` returns a new frame, so `train_df` keeps its "Survived" column and
# this cell can be re-run safely. (Previously `train_X` was only an alias of
# `train_df`, so `del train_X["Survived"]` mutated the raw frame and raised a
# KeyError when the cell was executed a second time.)
train_Y = train_df["Survived"]
train_X = train_df.drop(columns=["Survived"])

# preview the data
train_X.head()

### Replace sex with numerical labels

In [None]:
# replace sex attribute to numerical: female=0, male=1
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `train_X['Sex']`: an in-place call on a
# selected column is not guaranteed to update the parent frame (it is a no-op
# under pandas Copy-on-Write). `astype(int)` pins an integer dtype so the
# column is always treated as numeric, whatever dtype `replace` returns.
# Re-running the cell is harmless: already-encoded values are left untouched.
train_X['Sex'] = train_X['Sex'].replace({'female': 0, 'male': 1}).astype(int)

# preview the data
train_X.head()

### Make age categories 

In [None]:
# Impute missing ages with the mean age. The result is assigned back rather
# than using fillna(..., inplace=True) on the selected column, which is not
# guaranteed to modify `train_X` (no-op under pandas Copy-on-Write).
train_X["Age"] = train_X["Age"].fillna(train_X["Age"].mean())

# Age to category of age.
# pd.cut uses right-closed bins by default: (0, 5] -> 0, (5, 12] -> 1, ...,
# (60, 120] -> 6. Ages outside (0, 120] would become NaN.
interval = (0, 5, 12, 18, 25, 35, 60, 120)
cats = [0,1,2,3,4,5,6]
train_X["Age_cat"] = pd.cut(train_X["Age"], interval, labels=cats)
train_X["Age_cat"].head()

### Categorize embarked attribute

In [None]:
# Fill missing ports with the most frequent one.
freq_port = train_X.Embarked.dropna().mode()[0]
print ("Replacing NaN values of embarked with most frequent port:", freq_port)
train_X['Embarked'] = train_X['Embarked'].fillna(freq_port)

# we have three types of ports
print(pd.unique(train_X['Embarked']))

# Replace port letters with a numeric category ID: S=1, C=2, Q=3.
# The new column is built in a single assignment with `map` instead of copying
# the column and calling replace(..., inplace=True) on the selection, which is
# not guaranteed to update `train_X` and could leave `Embarked_cat` as strings.
# NOTE: any port other than S/C/Q would map to NaN.
train_X["Embarked_cat"] = train_X["Embarked"].map({'S': 1, 'C': 2, 'Q': 3})
print(pd.unique(train_X['Embarked_cat']))

# preview the data
train_X.head()

### Put fares into numerical categories

In [None]:
# Fill in NaN values with the mean fare. Assigned back rather than using
# fillna(..., inplace=True) on the selected column, which is not guaranteed to
# modify `train_X` (no-op under pandas Copy-on-Write).
train_X["Fare"] = train_X["Fare"].fillna(train_X["Fare"].mean())

# Put fares into 4 bands (see https://www.kaggle.com/polarhut/titanic-data-science-solutions)
#   0: fare <= 7.91
#   1: 7.91   < fare <= 14.454
#   2: 14.454 < fare <= 31
#   3: fare > 31
# pd.cut bins the untouched `Fare` column in one step (right-closed bins,
# labels=False yields the integer bin index). The earlier approach overwrote
# `Fare_cat` band by band while still comparing against it, which only worked
# because the band ids happen to be smaller than the first threshold.
fare_bin_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
train_X['Fare_cat'] = pd.cut(train_X['Fare'], bins=fare_bin_edges, labels=False).astype(int)

train_X['Fare_cat'].head()

### Create new features: FamilySize and IsAlone

In [None]:
# family size = the passenger + siblings/spouses + parents/children
train_X['FamilySize'] = 1 + train_X['SibSp'] + train_X['Parch']

# IsAlone is 1 when the passenger travels without family, otherwise 0
travels_alone = train_X['FamilySize'] == 1
train_X['IsAlone'] = travels_alone.astype('int64')

# preview the data
train_X.head()

### Create a new feature: Pclass * age

In [None]:
# Interaction feature: age band multiplied by passenger class.
# `Age_cat` comes from pd.cut with labels and therefore has categorical dtype,
# which does not support arithmetic; cast the integer labels to int first.
# (astype(int) raises if `Age_cat` contains NaN, i.e. an age outside the bins.)
train_X['Age*Class'] = train_X['Age_cat'].astype(int) * train_X['Pclass']

train_X.loc[:, ['Age*Class', 'Age_cat', 'Pclass']].head(10)

### Remove attributes that are redundant or have no predictive power

In [None]:
# Drop the identifier column and the raw columns that the engineered
# features (FamilySize, Age_cat, Fare_cat) were derived from.
obsolete_columns = ['PassengerId', 'SibSp', 'Parch', 'Age', 'Fare']
train_X = train_X.drop(columns=obsolete_columns)

# preview the data
train_X.head()

# Evaluation 

In [None]:
runs = 100 # Do it over multiple runs to get estimate of "true" accuracy distribution
accus = np.zeros((runs,1))
base_seed = 0 # run i uses seed base_seed + i, so the whole experiment is reproducible

# Only numerical attributes are used (including the categorical age band,
# whose labels are integers). Selecting by included dtypes rather than
# "not object" keeps text columns out even when pandas stores strings in a
# dedicated, non-object string dtype.
numeric_variables = list(train_X.select_dtypes(include=['number', 'category']).columns)
print ("Using variables:", numeric_variables)

print ("Running..")
for run in range(runs):
    run_seed = base_seed + run

    # generate a random (but seeded) train / test split
    cv_X_train, cv_X_test, cv_Y_train, cv_Y_test = train_test_split(
        train_X, train_Y, test_size=0.3, random_state=run_seed)

    # Reference results noted by the author:
    #   decision tree (DecisionTreeClassifier()) -> 80.0 acc
    #   random forest -> 81.2 acc
    model = RandomForestClassifier(n_estimators=250, random_state=run_seed)
    model.fit(cv_X_train[numeric_variables], cv_Y_train)
    # calc mean accuracy over predicted test set
    accus[run] = model.score(cv_X_test[numeric_variables], cv_Y_test)

print("Mean test accuracy:", str(np.mean(accus)), "| Std:", str(np.std(accus)) + ". in", str(runs),"runs.")