# Titanic: Machine Learning from Disaster
- **Goal**
You are asked to predict if a passenger survived the sinking of the Titanic or not. 
For each in the test set, we must predict a 0 or 1 value for the variable.
- **Metric**
Your score is the percentage of passengers you correctly predict. This is known simply as "accuracy”.
- **Submission File Format**
You should submit a csv file with exactly 418 entries plus a header row. Your submission will show an error if you have extra columns (beyond PassengerId and Survived) or rows.

The file should have exactly 2 columns:

PassengerId (sorted in any order)
Survived (contains your binary predictions: 1 for survived, 0 for deceased)
~~~
PassengerId,Survived
 892,0
 893,1
 894,0
 Etc.
 ~~~

# 1 - Packages

In [11]:
import numpy as np
import pandas as pd

# 2 - Load Data
First we have to select "the most" relevant features to use.
After quick inspection of the train.csv file. I select `pclass, sex, age, sibsp, parch, fare, embarked`
## 2.1 - Utility functions 

In [297]:
def numbify_data(df):
    # Convert sex to numeric
    df['Sex'].replace('female', 0, inplace=True)
    df['Sex'].replace('male', 1, inplace=True)

    # Convert embarked to numeric
    df['Embarked'].replace('Q', 0, inplace=True)
    df['Embarked'].replace('S', 1, inplace=True)
    df['Embarked'].replace('C', 2, inplace=True)

    # Replace nan age values by the mean of the available ages
    age_mean = np.nanmean(df.Age.values)
    df['Age'].fillna(age_mean, inplace=True)
    
    sibsp_mean = np.nanmean(df.SibSp.values)
    df['SibSp'].fillna(sibsp_mean, inplace=True)
    
    parch_mean = np.nanmean(df.Parch.values)
    df['Parch'].fillna(parch_mean, inplace=True)
    
    fare_mean = np.nanmean(df.Fare.values)
    df['Fare'].fillna(fare_mean, inplace=True)
    
    #For missed values
    df.fillna(0, inplace=True)
    
    
def load_train_set():
    df = pd.read_csv('train.csv', delimiter = ',')
    df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    numbify_data(df)
    train_set_x = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].values.T
    train_set_y = df[['Survived']].values.T
    return train_set_x, train_set_y

def load_test_set():
    df = pd.read_csv('test.csv', delimiter = ',')
    df = df[['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    numbify_data(df)
    test_set_x = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].values.T
    test_set_id = df[['PassengerId']].values.T
    return test_set_x, test_set_id

def sigmoid(z):
    s = 1/(1 + np.exp(-z))
    return s

## 2.2 - Load training set and test set

In [175]:
train_set_x, train_set_y = load_train_set()
test_set_x, test_set_id = load_test_set()

## 2.3 - Overview of the data set

In [176]:
m_train = train_set_x.shape[1]
m_test = test_set_x.shape[1]
print ("Number of training examples: m_train = " + str(m_train))
print ("Number of testing examples: m_test = " + str(m_test))

Number of training examples: m_train = 891
Number of testing examples: m_test = 418


# 3 - Building the parts of our algorithm

## 3.1 - Initializing parameters and help functions

In [218]:

def initialize_with_zeros(dim_features):
    w = np.zeros((dim_features, 1))
    b = 0
    return w, b

In [178]:
dim_features = train_set_x.shape[0]
w, b = initialize_with_zeros(dim_features)
print ("w = " + str(w))
print ("b = " + str(b))

w = [[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]
b = 0


## 3.2 - Forward and BackWard Propagation

In [243]:
def propagate(w, b, X, Y):
    m = X.shape[1]
    #forward propagation
    A = sigmoid(np.dot(w.T, X) + b)
    cost = (-1/m) * np.sum(Y*np.log(A) + (1 - Y)*np.log(1-A))
    
    #backward propagation
    dw = (1/m) * np.dot(X, (A-Y).T)
    db = (1/m) * np.sum(A-Y)
    
    grads = {'dw': dw,
            'db': db}
    return grads, cost
    

## 3.3 - Optimization

In [262]:
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost=False):
    costs = []
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        w = w - learning_rate*grads['dw']
        b = b - learning_rate*grads['db']
        if print_cost and i % 10000 == 0:
            print("cost after iteration", i,":", cost)
        costs.append(cost)
    params = {'w': w,
             'b': b}

    grads = {'dw': grads['dw'],
             'db': grads['db']}
    
    return params, grads, costs

In [273]:
X = train_set_x
Y = train_set_y
np.nan_to_num(X, copy=False)
np.nan_to_num(Y, copy=False)
w, b = initialize_with_zeros(X.shape[0])
params, grads, costs = optimize(w, b, X, Y, num_iterations= 1000000, learning_rate = 0.004, print_cost = True)

print ("w = " + str(params["w"]))
print ("b = " + str(params["b"]))

cost after iteration 0 : 0.6931471805599454
cost after iteration 10000 : 0.4790215030032276
cost after iteration 20000 : 0.4660584107324556
cost after iteration 30000 : 0.46007141327940915
cost after iteration 40000 : 0.4558464863829955
cost after iteration 50000 : 0.4526715716934283
cost after iteration 60000 : 0.4502630388388405
cost after iteration 70000 : 0.4484303263209077
cost after iteration 80000 : 0.4470323144509472
cost after iteration 90000 : 0.44596323120243064
cost after iteration 100000 : 0.4451436478270597
cost after iteration 110000 : 0.4445138188553954
cost after iteration 120000 : 0.44402870218390145
cost after iteration 130000 : 0.44365424849708596
cost after iteration 140000 : 0.44336464284205107
cost after iteration 150000 : 0.44314025447825967
cost after iteration 160000 : 0.4429661111581356
cost after iteration 170000 : 0.44283076106863994
cost after iteration 180000 : 0.4427254214419925
cost after iteration 190000 : 0.4426433395268757
cost after iteration 200000

In [295]:
def predict(w, b, X):
    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    w = w.reshape(X.shape[0], 1)
    
    A = sigmoid(np.dot(w.T,X) + b)
    
    for i in range(A.shape[1]):
        Y_prediction[0, i] = 1 if A[0, i] > .5 else 0 
        pass
    return Y_prediction

In [296]:
X_train = train_set_x
Y_train = train_set_y

np.nan_to_num(X_train, copy=False)
np.nan_to_num(Y_train, copy=False)

X_test = test_set_x
np.nan_to_num(X_test, copy=False)

Y_prediction_test = predict(w,b,X_test)
Y_prediction_train = predict(w,b,X_train)

print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
#print("test accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100))
np.savetxt('submission.csv', np.hstack([test_set_id.T, Y_prediction_test.T]), delimiter=',')

Y_prediction_train

train accuracy: 61.61616161616162 %


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 