# Kaggle-Titanic 
### Predicting survival rates for passengers on the Titanic

The competition can be found on [Kaggle](https://www.kaggle.com/c/titanic)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import tensorflow as tf

## Data Preprocessing
First, read the data from the .csv files in the local `data/` directory.

In [223]:
# Load the three CSV files from the local data/ directory, in this order:
# the training split, the test split, and the submission file whose second
# column is later used as the labels for the test rows.
train, test, ys = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train.csv",
        "data/test.csv",
        "data/gender_submission.csv",
    )
)

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Next, drop unwanted columns from our data, replace NaN values with zero, and factorize columns with categorical variables.

In [None]:
def process(d):
    """Clean a Titanic DataFrame in place and return it.

    Drops the identifier / free-text columns, replaces NaN values with 0 and
    integer-encodes every remaining column except the continuous ``Age`` and
    ``Fare`` columns.

    Args:
        d: DataFrame containing at least the ``PassengerId``, ``Name``,
            ``Ticket`` and ``Cabin`` columns. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If one of the columns to drop is missing.
    """
    # Drop these variables because they do not contain valuable information
    d.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    for col in d.columns.values:
        # If contains NaN values
        if d[col].isnull().values.any():
            d[col] = d[col].fillna(0)
        # Age and Fare are continuous variables and keep their raw values.
        # The previous test `not col == 'Age' or col == 'Fare'` parsed as
        # `(not col == 'Age') or (col == 'Fare')` and therefore factorized
        # Fare as well.
        if col not in ('Age', 'Fare'):
            # NOTE(review): factorize numbers values in order of first
            # appearance, so frames processed separately may receive
            # different codes for the same category — confirm acceptable.
            d[col] = pd.factorize(d[col])[0]
    return d
            
# Clean both splits with the same routine; process() edits the frame it is
# given and returns that same frame, so the names are simply rebound.
train, test = process(train), process(test)

Now that the data is factorized, we can compute the **min** and **max** of each column for normalization. We will use the 'Survived' column as the y-labels, so we do not normalize it.

In [None]:
# Min-max scale every feature column with the TRAINING set's statistics:
# training values land in [-0.5, 0.5] and the test set goes through the
# identical transform. 'Survived' holds the labels and is left untouched.
for col in train.columns.values:
    if col == 'Survived':
        continue
    # Capture the statistics BEFORE train[col] is overwritten. Reading them
    # afterwards yields min = -0.5 and max = 0.5, which turns the test
    # transform into the identity and leaves the test set unscaled.
    col_min = train[col].min()
    col_max = train[col].max()
    # A constant column has zero span; divide by 1 instead so it maps to
    # -0.5 rather than NaN.
    col_span = (col_max - col_min) or 1
    train[col] = (train[col] - col_min) / col_span - 0.5
    test[col] = (test[col] - col_min) / col_span - 0.5

From the factorized and normalized training and testing sets, we now define a label-encoding helper and two simple samplers that generate random batches from these sets.

In [None]:
def one_hot(x):
    """Encode a binary label as a two-element list.

    Args:
        x: Label value; it must compare equal to 0 or to 1.

    Returns:
        ``[0, 1]`` when ``x`` equals 0 and ``[1, 0]`` when ``x`` equals 1.

    Raises:
        Exception: If ``x`` equals neither 0 nor 1.
    """
    # The candidate table is rebuilt on every call, so each caller receives
    # its own fresh list.
    for label, encoding in ((0, [0, 1]), (1, [1, 0])):
        if x == label:
            return encoding
    raise Exception("Input is neither 0 nor 1 which should be the case.")

class get_batch():
    """Namespace for random mini-batch samplers over the global data frames.

    Both samplers draw ``batch_size`` row positions uniformly at random, with
    replacement. They are static methods, so they can be called on the class
    itself, e.g. ``get_batch.training(200)``.
    """

    @staticmethod
    def training(batch_size):
        """Sample a labelled batch from the global ``train`` frame.

        Args:
            batch_size: Number of rows to draw.

        Returns:
            Tuple ``(features, labels)`` of NumPy arrays: the sampled rows
            without the 'Survived' column, and the ``one_hot`` encoding of
            each sampled 'Survived' value.
        """
        # One vectorized draw and one positional lookup replace a Python
        # loop that performed a separate iloc call per sampled row.
        positions = np.random.randint(len(train.index), size=batch_size)
        rows = train.iloc[positions]
        features = rows.drop('Survived', axis=1).values
        labels = [one_hot(label) for label in rows['Survived']]
        return (features, np.array(labels))

    @staticmethod
    def testing(batch_size):
        """Sample a labelled batch from the global ``test`` and ``ys`` frames.

        Args:
            batch_size: Number of rows to draw.

        Returns:
            Tuple ``(features, labels)`` of NumPy arrays: the sampled rows of
            ``test``, and the ``one_hot`` encoding of the second column of
            ``ys`` at the same row positions.
        """
        positions = np.random.randint(len(test.index), size=batch_size)
        features = test.iloc[positions].values
        # Select the label column by position through iloc. The former
        # `ys.iloc[i][1]` relied on an integer key falling back to positional
        # access on a label-indexed Series, which pandas has deprecated.
        labels = [one_hot(label) for label in ys.iloc[positions, 1]]
        return (features, np.array(labels))

## Designing the Network Graph
The architecture will be a simple feed-forward neural network with two hidden layers, implemented in TensorFlow.

In [None]:
hls = 28 # The size of the first hidden layer, 2nd hidden layer is simply 2*hls

# Inits
# x is the input placeholder: any number of rows, 7 float32 values per row.
# NOTE(review): 7 presumably matches the number of feature columns left after
# preprocessing — confirm whenever the dropped columns change.
x = tf.placeholder(tf.float32, [None, 7])
# Weights are drawn from a truncated normal (mean 0, stddev 1); every bias
# starts at one. Layer widths are 7 -> hls -> 2*hls -> 2.
W1 = tf.Variable(tf.truncated_normal([7,hls], mean=0.0, stddev=1.0))
b1 = tf.Variable(tf.ones([hls]))
W2 = tf.Variable(tf.truncated_normal([hls,hls*2], mean=0.0, stddev=1.0))
b2 = tf.Variable(tf.ones([hls*2]))
W3 = tf.Variable(tf.truncated_normal([hls*2,2], mean=0.0, stddev=1.0))
b3 = tf.Variable(tf.ones([2]))

# Network graph
# Two ReLU hidden layers followed by a linear output layer. y holds the raw
# output (passed below as logits); no softmax is applied here.
l1 = tf.nn.relu(tf.matmul(x,  W1) + b1)
l2 = tf.nn.relu(tf.matmul(l1, W2) + b2)
y  =            tf.matmul(l2, W3) + b3
# Despite its name, yhat is the placeholder for the target labels (two values
# per row); it is fed as `labels` below, while y is the network output.
yhat = tf.placeholder(tf.float32, [None,2])

# Computations
# Softmax cross-entropy between the labels (yhat) and the logits (y).
# NOTE(review): the result is handed to minimize() without an explicit
# reduction such as tf.reduce_mean — confirm this is intended.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=yhat,logits=y)
# NOTE(review): a learning rate of 0.5 looks very large for Adam — confirm it
# is intentional.
train_step = tf.train.AdamOptimizer(0.5).minimize(cross_entropy)

# A row counts as correct when the largest logit and the largest label entry
# sit at the same index; accuracy is the mean of those matches over the batch.
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(yhat,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

## Running the network

In [None]:
# Create the module-level session that run_network() uses for every
# sess.run call, then run the global variable initializer once before any
# training or evaluation step.
# NOTE(review): the session is never closed in the visible code — confirm that
# is acceptable for this notebook.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

def run_network(mode="test", episodes=1000, batch_size=200):
    """Train or evaluate the network on randomly sampled batches.

    Args:
        mode: ``"train"`` runs one optimizer step per episode and prints the
            accuracy on the current batch every 100 episodes. ``"test"``
            measures the accuracy of one test batch per episode, prints every
            10th value and finally the average over all episodes.
        episodes: Number of batches to process.
        batch_size: Number of rows sampled per batch.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If ``mode`` is neither ``"train"`` nor ``"test"``.
    """
    # Reject unknown modes up front; previously a typo in `mode` made the
    # call a silent no-op.
    if mode not in ("train", "test"):
        raise ValueError(
            "mode must be 'train' or 'test', got {!r}".format(mode))

    if mode == "train":
        for i in range(episodes):
            train_x, train_y = get_batch.training(batch_size)
            feed = {x: train_x, yhat: train_y}
            if i % 100 == 0:
                # Accuracy is measured before the optimizer step that uses
                # this same batch.
                train_accuracy = sess.run(accuracy, feed_dict=feed)
                print("Step: {}, Training accuracy: {}".format(i, train_accuracy))
            sess.run(train_step, feed_dict=feed)
        return

    accuracies = []
    for i in range(episodes):
        test_x, test_y = get_batch.testing(batch_size)
        test_accuracy = sess.run(
            accuracy,
            feed_dict={x: test_x, yhat: test_y})
        if i % 10 == 0:
            print("Step: {}, Testing accuracy: {}".format(i, test_accuracy))
        accuracies.append(test_accuracy)

    # With zero episodes there is nothing to average; the unguarded division
    # used to raise ZeroDivisionError.
    if not accuracies:
        print("No testing episodes were run, so there is no average accuracy.")
        return
    avg_accuracy = sum(accuracies) / float(len(accuracies))
    print("The average accuracy after {} episodes of testing is {}".format(episodes, avg_accuracy))
        
# Train for 1000 episodes first (default batch size of 200) so that the
# evaluation below runs against the updated weights.
run_network(mode="train",episodes=1000)
# Then print the accuracy over 100 randomly sampled test batches.
run_network(mode="test", episodes=100)