In [None]:
import keras
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.models import Sequential
from keras.utils import to_categorical



In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from matplotlib import pyplot
%matplotlib inline  

# 1. Let's import some data

https://www.kaggle.com/c/titanic/data

In [None]:
# Load the Titanic training set from the workshop's GitHub copy.
# read_csv already returns a fresh DataFrame, so no empty placeholder frame
# needs to be created beforehand.
train_dataset = pd.read_csv('https://raw.githubusercontent.com/thousandoaks/AOMDeepLearningWorkshop/master/titanictrain.csv')

In [None]:
# Preview the first ten raw rows right after loading.
train_dataset.head(n=10)

### We want to predict Survived, using the following features: Pclass, Sex, Age, SibSp, Parch, Fare, Embarked

### We have to transform the features Sex and Embarked into numbers (a.k.a. label encoding)

In [None]:
# Label-encode the two categorical features. The nested-dict form of replace()
# limits each mapping to its own column, so a cell in another column that
# happens to equal "S", "C", "Q", "male" or "female" is left untouched.
# Re-running the cell is harmless: already-encoded values match nothing.
train_dataset = train_dataset.replace({
    "Sex": {"male": 0, "female": 1},
    "Embarked": {"S": 0, "C": 1, "Q": 2},
})

# Zero-fill every remaining missing value.
# NOTE(review): this makes a missing Embarked indistinguishable from "S" (0)
# and a missing Age look like age 0 -- confirm that is acceptable or impute.
train_dataset = train_dataset.fillna(0)

# Cast the encoded columns explicitly instead of relying on pandas silently
# downcasting object columns to numbers (deprecated behaviour in pandas 2.2).
train_dataset = train_dataset.astype({"Sex": "int64", "Embarked": "int64"})


In [None]:
# Preview the first ten rows again to check the encoded / zero-filled values.
train_dataset.head(n=10)

In [None]:
# Split the frame into the seven predictor columns and the label column.
# Both selections use a list of labels, so each result is a DataFrame.
features = train_dataset.loc[:, ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]
target = train_dataset.loc[:, ["Survived"]]

In [None]:
# Convert to NumPy arrays for the neural network.
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely; `.values` would raise AttributeError on the second run
# because `features` / `target` are already ndarrays by then.
features = np.asarray(features)
target = np.asarray(target)

In [None]:
#target = to_categorical(train_dataset['Survived'])

In [None]:
# First ten labels.
target[:10]

In [None]:
# First ten feature rows.
features[:10]

In [None]:
# Earlier experiment, superseded by the `features` selection above: Pclass, Sex, Age, SibSp, Parch, Fare
#predictors=train_dataset[['Pclass','Sex','Age','SibSp','Parch','Fare']].values


In [None]:
#predictors

In [None]:
# Width of the 2-D feature matrix; used below as the input dimension of the networks.
numberofcolumns = features.shape[-1]
numberofcolumns

## 2. Let's understand the data

In [None]:
# Pairwise correlations between the numeric columns only.
# The frame still holds text columns; since pandas 2.0 DataFrame.corr() no
# longer drops those silently and raises instead, so they are filtered out
# explicitly here (this also works on older pandas versions).
correlation_matrix = train_dataset.select_dtypes(include="number").corr()
correlation_matrix

In [None]:
# Render the correlation matrix as a heatmap with thin separators between cells.
ax = sns.heatmap(data=correlation_matrix, linewidths=0.5)

# 3. Specify the architecture of the model (very basic one)

In [None]:
# Baseline network, declared in one go:
#   two fully connected ReLU layers of 100 units each, followed by a single
#   sigmoid unit that outputs the survival probability.
model = Sequential([
    Dense(100, activation='relu', input_shape=(numberofcolumns,)),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
])



# 4. Let's compile the model

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported as the benchmark metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


# 5. Let's fit the model

In [None]:
# Train the baseline model for 100 epochs. validation_split=0.1 holds out 10%
# of the samples for validation; the returned History object is kept for plotting.
model_training = model.fit(
    x=features,
    y=target,
    epochs=100,
    validation_split=0.1,
    verbose=2,
)


# 6. Let's evaluate our model

In [None]:
# Validation loss of the baseline model, one point per epoch (red line).
pyplot.plot(model_training.history['val_loss'], color='r')
pyplot.ylabel('Validation score')
pyplot.xlabel('Epochs')
pyplot.show()

# 7. Let's improve our model

In [None]:
# Set up the model
#model2 = Sequential()
# Add the first layer
#model2.add(Dense(100,activation='relu',input_shape=(numberofcolumns,)))
# Add a second layer


#model2.add(Dense(100, activation='relu'))
#model2.add(Dense(10, activation='relu'))

# Add the output layer
#model2.add(Dense(1,activation='sigmoid'))

In [None]:
# Regularised network, declared as a single layer list. Each stage is
# Dense -> BatchNormalization -> Activation -> Dropout(0.4).
model2 = Sequential([
    # input stage: 100 units, ReLU
    Dense(100, input_shape=(numberofcolumns,)),
    BatchNormalization(),
    Activation("relu"),
    Dropout(0.4),

    # hidden stage: 50 units, sigmoid
    Dense(50),
    BatchNormalization(),
    Activation("sigmoid"),
    Dropout(0.4),

    # hidden stage: 10 units, sigmoid
    Dense(10),
    BatchNormalization(),
    Activation("sigmoid"),
    Dropout(0.4),

    # disabled experiment: Dense(2, activation="sigmoid")

    # output stage: one sigmoid unit giving the survival probability
    Dense(1, activation='sigmoid'),
])

In [None]:
# Same training configuration as the baseline: binary cross-entropy loss,
# Adam optimizer, accuracy as the reported metric.
model2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Train model 2 for 100 epochs. validation_split=0.1 holds out 10% of the
# samples for validation; the returned History object is kept for plotting.
model2_training = model2.fit(
    x=features,
    y=target,
    epochs=100,
    validation_split=0.1,
    verbose=2,
)

In [None]:
# Overlay the per-epoch validation loss of both models
# (model 2 in red, model 1 in blue) so they can be compared directly.
pyplot.plot(
    model2_training.history['val_loss'],
    color='r',
    label='performance model 2 on test data',
)
pyplot.plot(
    model_training.history['val_loss'],
    color='b',
    label='performance model 1 on test data',
)
pyplot.ylabel('Validation score')
pyplot.xlabel('Epochs')
pyplot.legend()
pyplot.show()

## Model 2 seems to be performing better in the long run

## How is model 2 performing ?

In [None]:
# Score every row with model 2. These are predictions on the same `features`
# array the model was fitted on, so any metric derived from them is in-sample.
# The (n, 1) prediction array is flattened and assigned positionally; wrapping
# it in a DataFrame would instead align on the index and silently misplace or
# drop values if train_dataset's index were not the default 0..n-1.
train_dataset['Predicted'] = model2.predict(features).ravel()
# Turn the probabilities into discrete outcomes (0/1) by rounding.
train_dataset['Predicted_Discrete'] = train_dataset['Predicted'].round()

In [None]:
# Confusion matrix: actual outcome (rows) against the rounded prediction (columns).
confusion_matrix = pd.crosstab(
    index=train_dataset['Survived'],
    columns=train_dataset['Predicted_Discrete'],
    rownames=['Actual Types'],
    colnames=['Predicted Types'],
)
confusion_matrix

### We observe that model 2 is relatively good at predicting who will perish.
### Moreover, model 2 achieves 83% accuracy (is that good enough?)

### However, model 2 predicted many survivors who did not actually survive (62).
### Model 2 also predicted many fatalities who did in fact survive (98).

#### This gives us some hints for further improvement: a manual analysis of the misclassifications to understand their causes.


In [None]:
# Preview the first ten rows of train_dataset, including any prediction columns added above.
train_dataset.head(n=10)