# <h1 align = "center">Titanic Survival Prediction</h1>
<div style="width:100%;text-align: center;"> <img align = middle src="https://cdn.wallpapersafari.com/85/81/klWnN6.jpg" style="height:500px"> </div>

# Dataset Fields
- **PassengerId**: Unique Id for each passenger
- **Survived**: Binary value for survival (0 = No, 1 = Yes)
- **Pclass**: Ticket class for each passenger (1 = 1<sup>st</sup> Class, 2 = 2<sup>nd</sup> Class, 3 = 3<sup>rd</sup> Class)
- **Sex**: Gender of each passenger
- **Age**: Age of each passenger in years
- **SibSp**: Number of siblings or spouses aboard the Titanic
- **Parch**: Number of parents or children aboard the Titanic
- **Ticket**: Ticket number for the passenger
- **Fare**: Price of the ticket
- **Cabin**: Cabin number of the passenger
- **Embarked**: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

# Installing the Necessary Libraries (for first time run only)

In [None]:
# !conda install -c anaconda graphviz python-graphviz missingno seaborn

# Importing the Necessary Libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import graphviz
import missingno as msno

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

<h3>Printing out a list of all the files in the directory</h3>

In [None]:
# for dirname, _, filenames in os.walk(os.environ['DSX_PROJECT_DIR']):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Dataset Information

<h3>Reading the data</h3>

In [None]:
df = pd.read_csv(os.environ['DSX_PROJECT_DIR']+'/datasets/train_titanic.csv')
df

<h3>Getting information about our dataset</h3>

In [None]:
df.info()


<h3>Looking at the statistical summary of our data</h3>

In [None]:
df.describe()

<h3>Total missing values in the dataset</h3>

In [None]:
# Report how many values are missing in each column, then the grand total.
missing_per_column = df.isna().sum()
separator = 30 * "-"

print("Count of the missing values")
print(separator)
print(missing_per_column)
print(separator)
print("Total missing values are:", missing_per_column.sum())
print(separator)

# Exploratory Data Analysis (EDA) 

In [None]:
plt.figure(figsize = (15, 10))
sns.heatmap(df.isna(), yticklabels = False, cbar = False, cmap = 'viridis')
plt.title("Visualizing the Missing Data", fontsize = 20)
plt.xticks(rotation = 35, fontsize = 15)
plt.show()

In [None]:
msno.bar(df, color = (0, 0.4, 0.8), sort = "ascending", figsize = (15, 10))
plt.show()

In [None]:
print("Missing Data in the Cabin column =", (df['Cabin'].isna().sum() / len(df['Cabin']) * 100), "%")

Due to the large amount of missing data in the `Cabin` column, it is better to drop the entire column than to try to fill in all the values. Since `Age` and `Embarked` have relatively few missing values, it is possible to fill them.

In [None]:
plt.figure(figsize = (15, 10))
ax = sns.countplot(x = 'Survived', data = df)
plt.title('Survival Rates', fontsize = 20)
plt.xlabel('Survived', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
for p in ax.patches:
        ax.annotate('{:.0f} = {:.2f}%'.format(p.get_height(), (p.get_height() / len(df['Survived'])) * 100), (p.get_x() + 0.33, p.get_height() + 5))
plt.show()

Based on the data in the `Survived` column, we observe that only 342 passengers managed to survive (38.38%).

In [None]:
plt.figure(figsize = (15, 10))
plt.pie([(df.Sex == 'male').sum(), (df.Sex == 'female').sum()], labels = ["Male", "Female"], autopct = "%.2f", startangle = 90, explode = (0.1, 0.0))
plt.title('Percentage of Male and Female Passengers', fontsize = 18)
plt.show()

Majority of the passengers aboard the Titanic were Male (64.76 %).<br>
Let us now take a look at the Survival Rates for Male and Female passengers.

In [None]:
plt.figure(figsize = (15, 10))
ax = sns.countplot(x = 'Survived', hue = 'Sex', data = df)
plt.title('Survival for Male and Female Passengers', fontsize = 20)
plt.xlabel('Survived', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
for p in ax.patches:
        ax.annotate('{:.2f}'.format(p.get_height()), (p.get_x() + 0.17, p.get_height() + 3))
plt.show()

**Observations**<br>
- Most of the Male passengers have not survived.<br>
- Majority of the Female passengers have survived.

In [None]:
plt.figure(figsize = (15, 10))
ax = sns.countplot(x = 'Survived', hue = 'Pclass', data = df)
plt.title('Survival Based on Class', fontsize = 20)
plt.xlabel('Survival', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
for p in ax.patches:
        ax.annotate('{:.2f}'.format(p.get_height()), (p.get_x() + 0.1, p.get_height() + 3))
plt.show()

We notice that `Pclass` of the passenger does affect their survival odds.<br>
Passengers in the 3<sup>rd</sup> class have a much higher mortality rate as compared to the other two classes.<br>
The 1<sup>st</sup> class has a higher number of passengers that survived, probably because they were richer.

In [None]:
# Distribution of passenger ages; rows with a missing age are excluded.
# sns.distplot is deprecated, so use histplot (already used for the Fare plot).
# stat = 'density' keeps the normalised histogram that distplot drew under the KDE curve.
plt.figure(figsize = (15, 10))
sns.histplot(df['Age'].dropna(), color = (0, 0.5, 1), bins = 40, kde = True, stat = 'density')
plt.title('Age Density of the Passengers', fontsize = 20)
plt.xlabel('Age', fontsize = 15)
plt.ylabel('Density', fontsize = 15)
plt.show()

The majority of the passengers on the Titanic were between 20 and 40 years of age.

In [None]:
plt.figure(figsize = (15, 10))
ax = sns.countplot(x = 'SibSp', data = df)
plt.title('Siblings/Spouses on Board', fontsize = 20)
plt.xlabel('SibSp', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
for p in ax.patches:
        ax.annotate('{:.0f} = {:.2f}%'.format(p.get_height(), (p.get_height() / len(df['SibSp'])) * 100), (p.get_x() + 0.15, p.get_height() + 5))
plt.show()

`SibSp` indicates the number of Siblings or Spouses on board.<br>
From the countplot we see that most of the passengers were travelling alone.

In [None]:
# Number of passengers per Parch value; each bar is annotated with its count
# and its share of all passengers.
plt.figure(figsize = (15, 10))
ax = sns.countplot(x = 'Parch', data = df)
plt.title('Parents/Children on Board', fontsize = 20)
plt.xlabel('Parch', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
# Denominator for the percentages: use the plotted column itself rather than
# the unrelated 'SibSp' column.
parch_total = len(df['Parch'])
for p in ax.patches:
        ax.annotate('{:.0f} = {:.2f}%'.format(p.get_height(), (p.get_height() / parch_total) * 100), (p.get_x() + 0.15, p.get_height() + 5))
plt.show()

`Parch` indicates the number of Parents or Children aboard the ship<br>
Just like in the case of `SibSp` we can observe that most of the people are travelling on their own.

In [None]:
plt.figure(figsize = (15, 10))
sns.histplot(df['Fare'], bins = 40, kde = True)
plt.title('Fare Count for the Passengers', fontsize = 20)
plt.xlabel('Fare', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
plt.show()

In [None]:
plt.figure(figsize = (15, 10))
bp = sns.boxplot(x = 'Pclass', y = 'Age', data = df, palette = 'winter')
plt.xlabel('Pclass', fontsize = 15)
plt.ylabel('Age', fontsize = 15)
plt.show()

Looking at the median age of the passengers based on their class.<br>
The median age of the passengers in the 1<sup>st</sup> class is the highest.<br>
The median age of the passengers in the 3<sup>rd</sup> class is the lowest.<br>
Let us fill in the missing values in the `Age` column based on the median age of the `Pclass`.

In [None]:
def transform_columns(column):
    """Return a passenger's age, substituting a class-based value when it is missing.

    Parameters
    ----------
    column : sequence of two values
        ``(Age, Pclass)`` in that order, for example one row of
        ``df[['Age', 'Pclass']]`` as supplied by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The age unchanged when it is present; otherwise 38 for 1st class,
    29 for 2nd class and 23 for any other class value.
    """
    # Unpack by position instead of using column[0] / column[1]: integer keys
    # on a label-indexed Series are deprecated positional lookups in pandas.
    Age, Pclass = column

    if not pd.isna(Age):
        return Age

    # Hard-coded per-class fill values (presumably the class median ages
    # read off the Age-by-Pclass box plot; verify against the data).
    fill_age_by_class = {1: 38, 2: 29}
    return fill_age_by_class.get(Pclass, 23)

df['Age'] = df[['Age', 'Pclass']].apply(transform_columns, axis = 1)
df

# Feature Engineering

Let's create a new feature `IsAlone` that tells us whether the passenger is travelling solo or with family.

In [None]:
df['IsAlone'] = df['SibSp'] + df['Parch']
df

In [None]:
plt.figure(figsize = (15, 10))
ax = sns.countplot(x = 'IsAlone', data = df)
plt.xlabel('IsAlone', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
for p in ax.patches:
        ax.annotate('{:.0f}'.format(p.get_height()), (p.get_x() + 0.30, p.get_height() + 5))
plt.show()

As we can observe from the `IsAlone` feature most passengers are travelling by themselves, without any family.

In [None]:
def convert_IsAlone(df):
    """Replace the numeric family size in ``df['IsAlone']`` with a text label, in place.

    ``df['IsAlone']`` must hold the number of relatives aboard (SibSp + Parch).
    A size of 0 becomes 'Alone', any other size becomes 'Not Alone', and a
    missing size stays ``None``. The frame is modified in place and nothing
    is returned.
    """
    def _label(family_size):
        if pd.isna(family_size):
            return None
        return 'Alone' if family_size == 0 else 'Not Alone'

    # Iterate over the values rather than looking rows up by label 0..n-1, so
    # frames with a non-default index work too; comparing against 0 (instead
    # of a fixed list of sizes) also labels sizes such as 8 or 9.
    df['IsAlone'] = [_label(family_size) for family_size in df['IsAlone']]
    
convert_IsAlone(df)
df

We created 2 groups for the `IsAlone` feature.<br>
- The first group is named `Alone` and contains passengers travelling alone.<br>
- The second group `Not Alone` is for passengers having one or more family members.

In [None]:
plt.figure(figsize = (15, 10))
ax = sns.countplot(x = 'Survived', hue = 'IsAlone', data = df)
plt.title('Survival Count for the IsAlone Feature', fontsize = 20)
plt.xlabel('Survived', fontsize = 15)
plt.ylabel('Count', fontsize = 15)
for p in ax.patches:
        ax.annotate('{:.2f}'.format(p.get_height()), (p.get_x() + 0.17, p.get_height() + 3))
plt.show()

- Passengers that travelled alone have a higher mortality than passengers that travelled with family.
- The survival counts for passengers that travelled alone and those that travelled with family are almost the same.

In [None]:
df

In [None]:
df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis = 1, inplace = True)
df

In [None]:
msno.bar(df, color = (0, 0.4, 0.8), sort = "ascending", figsize = (15, 10))
plt.show()

In [None]:
plt.figure(figsize = (15, 10))
sns.heatmap(df.isna(), yticklabels = False, cbar = False, cmap = 'viridis')
plt.title("Visualizing the Missing Data", fontsize = 20)
plt.xticks(rotation = 35, fontsize = 15)
plt.show()

Our dataset no longer contains any missing values. We can now encode and scale the data to start training our ML models.

# Checking for Correlation

In [None]:
# Pairwise correlations between the numeric columns.
# 'Sex' and 'IsAlone' still hold strings at this point (they are encoded
# further down), and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0, so restrict the frame to numeric dtypes first.
plt.figure(figsize = (15, 10))
sns.heatmap(df.select_dtypes(include = 'number').corr(), cmap = 'Blues', square = True, annot = True)
plt.title("Visualizing Correlations", size = 20)
plt.show()

In [None]:
# Continuous features; this list is reused later when scaling the data.
numeric_features = ['Age', 'Fare']
# `height` is the current name of the per-facet size argument (formerly `size`).
sns.pairplot(df[numeric_features], height = 5)
plt.show()

# Encoding the Categorical Features
The categorical data can be encoded using Label Encoder. It encodes labels with a value between 0 and n_classes - 1, where n_classes is the number of distinct labels. If a label repeats, it is assigned the same value as before. In this way the categorical values are converted into numeric values.

In [None]:
label_encoder = LabelEncoder()  # kept so existing references to this name still resolve

# One fitted encoder per categorical column, filled in on first use.
_column_encoders = {}

def label_encoder_converter(df):
    """Integer-encode the 'Sex' and 'IsAlone' columns of ``df`` in place.

    The first call fits a separate LabelEncoder for each column and caches it.
    Later calls (for example on the test set) reuse those fitted encoders, so
    every frame gets the same label-to-integer mapping instead of a mapping
    refitted on whatever values that frame happens to contain. A value that
    was not present when the encoder was fitted makes ``transform`` raise
    ``ValueError``.
    """
    for column in ('Sex', 'IsAlone'):
        if column not in _column_encoders:
            _column_encoders[column] = LabelEncoder().fit(df[column])
        df[column] = _column_encoders[column].transform(df[column])

label_encoder_converter(df)

# Scaling the Data
StandardScaler standardizes a feature by subtracting the mean and then scaling it to unit variance.
<div style="width:100%;text-align: center;"> <img align = left src="https://cdn-images-1.medium.com/max/800/0*vQEjz0mvylP--30Q.GIF" style="height:150px"></div>

In [None]:
# Standardise the numeric features (subtract the mean, scale to unit variance).
# NOTE(review): the scaler is fitted on the full dataset before it is split
# into train/validation/test sets, so held-out rows influence the scaling
# statistics. Consider fitting on the training split only.
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])
df

In [None]:
# Features are every column except the target. Select them by name rather
# than by position, and sort without mutating a slice of df.
# Columns need to be sorted alphabetically for WSL scoring.
X = df.drop(columns = ['Survived']).sort_index(axis = 1)
y = df['Survived']
print(X, "\n\n\n", y)

In [None]:
df

In [None]:
X

# Splitting the Data into Train, Test, and Validation Sets

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# Target share of the full dataset for each split.
ratio_train = 0.8  # not passed to train_test_split; the training set is what remains after the two splits below
ratio_val = 0.1
ratio_test = 0.1

# First split: hold out ratio_test of all rows as the test set.
x_remaining, X_test, y_remaining, y_test = train_test_split(X, y, test_size=ratio_test, random_state=0)

# ratio_val is expressed relative to the full dataset, so rescale it to the
# rows that remain after the test split.
ratio_remaining = 1 - ratio_test
ratio_val_adjusted = ratio_val / ratio_remaining

# Second split: divide the remaining rows into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(x_remaining, y_remaining, test_size=ratio_val_adjusted, random_state=0)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
X_val.shape

# Logistic Regression
Logistic regression is a supervised learning algorithm used to predict the probability of a target variable. It is used for classification, in this case to predict whether a passenger survived or not.

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
acc_lr = lr.score(X_test, y_test)
print(acc_lr * 100, "%")
cm_lr = confusion_matrix(y_test, y_pred)
print(cm_lr)

# K-Nearest Neighbors
KNN works by finding the distances between a query and all the examples in the data, selecting the specified number of examples (K) closest to the query, and then voting for the most frequent label.

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc_knn = knn.score(X_test, y_test)
print(acc_knn * 100, "%")
cm_knn = confusion_matrix(y_test, y_pred)
print(cm_knn)

# Support Vector Classifier
It is used in classification problems to predict which class the target variable belongs to.

In [None]:
svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
acc_svc = svc.score(X_test, y_test)
print(acc_svc * 100, "%")
cm_svc = confusion_matrix(y_test, y_pred)
print(cm_svc)

# Decision Tree Classifier
Decision trees use multiple algorithms to decide to split a node into two or more sub-nodes. The creation of sub-nodes increases the homogeneity of resultant sub-nodes. In other words, we can say that the purity of the node increases with respect to the target variable.
<div style="width:100%;text-align: center;"> <img align = left src="https://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1545934190/1_r5ikdb.png" style="height:500px"> </div>

In [None]:
# Fit a decision tree and report its accuracy and confusion matrix on the test split.
# random_state is fixed so the fitted tree and its score are reproducible
# between runs, as is already done for the random forest.
dtc = DecisionTreeClassifier(random_state = 0)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
acc_dtc = dtc.score(X_test, y_test)
print(acc_dtc * 100, "%")
cm_dtc = confusion_matrix(y_test, y_pred)
print(cm_dtc)

# Visualizing the Decision Tree Classifier

In [None]:
dot_data = tree.export_graphviz(dtc, out_file = None, feature_names = X.columns, class_names = ["0", "1"], filled = True)
graph = graphviz.Source(dot_data, format = "jpg")
display(graph)

# Random Forest Classifier
The random forest is a classification algorithm consisting of many decision trees. It uses bagging and feature randomness when building each individual tree to try to create an uncorrelated forest of trees whose prediction by committee is more accurate than that of any individual tree.
<div style="width:100%;text-align: center;"> <img align = left src="https://www.freecodecamp.org/news/content/images/2020/08/how-random-forest-classifier-work.PNG" style="height:400px"> </div>

In [None]:
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
acc_rf = rf.score(X_test, y_test)
print(acc_rf * 100, "%")
cm_rf = confusion_matrix(y_test, y_pred)
print(cm_rf)

# AdaBoost Classifier
AdaBoost is an iterative ensemble method that combines multiple classifiers to increase accuracy. It builds a strong classifier by combining multiple poorly performing classifiers, so that the resulting classifier achieves high accuracy.

In [None]:
# Fit AdaBoost and report its accuracy and confusion matrix on the test split.
# random_state is fixed for reproducible results, as with the random forest;
# the accuracy is computed once (the duplicate `acc` assignment was redundant).
adc = AdaBoostClassifier(random_state = 0)
adc.fit(X_train, y_train)
y_pred = adc.predict(X_test)
acc_adc = adc.score(X_test, y_test)
print(acc_adc * 100, "%")
cm_adc = confusion_matrix(y_test, y_pred)
print(cm_adc)

# Model Results

In [None]:
# Collect the test accuracy of every model, best first, and draw them as
# horizontal bars (score on the x axis, model name on the y axis).
data = {'Logistic Regression': acc_lr, 'KNN': acc_knn,
        'Support Vector Classifier': acc_svc, 'Decision Tree Classifier': acc_dtc, 'Random Forest Classifier': acc_rf,
        'Ada Boost Classifier': acc_adc}
data = dict(sorted(data.items(), key = lambda x: x[1], reverse = True))
models = list(data.keys())
score = list(data.values())
fig = plt.figure(figsize = (15, 10))
sns.barplot(x = score, y = models)
# The axis labels follow the data: x carries the scores, y the model names.
plt.xlabel("Score", size = 20)
plt.xticks(size = 15)
plt.ylabel("Models Used", size = 20)
plt.yticks(size = 15)
plt.title("Score for Unoptimized models", size = 25)
plt.show()

# Save Test Data

In [None]:
test_run_df = y_val.to_frame().join(X_val)
test_run_df.head()

In [None]:
test_run_df.to_csv(os.environ['DSX_PROJECT_DIR']+'/datasets/test_df_titanic_labeled.csv', index = False)

In [None]:
X_val.to_csv(os.environ['DSX_PROJECT_DIR']+'/datasets/test_df_titanic_unlabeled.csv', index = False)

# Save Model

In [None]:
model_name = 'Classifier_Titanic_rf'

In [None]:
from dsx_ml.ml import save

saved_model_output = save(name = model_name,
                          model = rf,
                          x_test=pd.DataFrame(X_test),
                          y_test=pd.DataFrame(y_test),
                          labelColumn_json = [{"name": "Survived", "type": "int"}],
                          algorithm_type = 'Classification',
                          source='titanic_survival_classification_models.ipynb',
                          description='Classifier model for a titanic'
                         )
saved_model_output

## Make an online scoring prediction

Upon saving a model, an internal online scoring endpoint is automatically created.

Compatible models:
https://www.ibm.com/docs/en/watson-studio-local/1.2.3?topic=data-machine-learning-models

In [None]:
import os
import requests
import json

In [None]:
# for dirname, _, filenames in os.walk(f"/user-home/{os.environ['DSX_USER_ID']}/DSX_Projects/{os.environ['DSX_PROJECT_NAME']}"):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [None]:
with open(f"/user-home/{os.environ['DSX_USER_ID']}/DSX_Projects/{os.environ['DSX_PROJECT_NAME']}/models/{model_name}/metadata.json") as infile:
    metadata_dict = json.load(infile)

In [None]:
print(f"Model Type: {metadata_dict['algorithm']}")
print("Feature(s):")
for feature in metadata_dict['features']:
    print('    '+feature['name'])

print(f"Latest Model Version: {metadata_dict['latestModelVersion']}")
print("Label(s):")
for label in metadata_dict['labelColumns']:
    print('    '+label['name'])

In [None]:
header_online = {'Content-Type': 'application/json', 'Authorization': os.environ['DSX_TOKEN']}

New data is provided in the following cell.

In [None]:
payload = [{"Age":-0.4464399261,"Fare":-0.3573083058,"IsAlone":0,"Pclass":3,"Sex":1}]
print(payload)

The model evaluates the new data and returns an estimated score.

In [None]:
scoring_response = requests.post(saved_model_output['scoring_endpoint'], json=payload, headers=header_online)

In [None]:
json.loads(scoring_response.content)

## Make batch online scoring prediction via API

In Watson Studio, an online batch scoring endpoint can be created.  
By running batch score at least once to generate a script, API details can be generated by going to {Your project} > Scripts > {your model} then click the 3 vertical dots on the right then Test API…

To test a script, a bearer token (accessToken) is needed to authenticate the user. The token lasts for 13 hours and can be retrieved by running:

In [None]:
url_batch_score_auth = 'https://52.116.135.95/v1/preauth/validateAuth'
url_batch_score = 'https://52.116.135.95/dsx-py3-script/ibmdsxuser-1003/1648180381668/batch_score'

In [None]:
# NOTE: {{your username}} and {{your password}} are template placeholders, not
# valid Python. Replace them with your credentials (as strings) before running.
# verify=False turns off TLS certificate verification for this request.
score_auth = requests.get(url_batch_score_auth, auth=({{your username}}, {{your password}}), verify=False)

In [None]:
json.loads(score_auth.content)

In [None]:
accessToken = json.loads(score_auth.content)['accessToken']

In [None]:
header_batch_score = {'Content-Type': 'application/json', 'Authorization': f'Bearer {accessToken}'}

In [None]:
args = {'execution_type': 'DSX', 'target': '/datasets/test-results.csv', 'source': '/datasets/test_df_titanic_unlabeled.csv', 'output_type': 'Localfile', 'output_datasource_type': '', 'sysparm': '', 'remoteHost': '', 'remoteHostImage': '', 'livyVersion': 'livyspark2'}

In [None]:
batch_score_payload = { "relativeScriptPath": "scripts/batch_score_titanic_sample.py", "args": args }

# Batch score script template

In [None]:
batch_scoring_response = requests.post(url_batch_score, json=batch_score_payload, headers=header_batch_score, verify=False)

In [None]:
json.loads(batch_scoring_response.content)

## Make an online scoring prediction via API

Upon deploying in Watson Machine Learning, an online scoring endpoint is automatically created.

In [None]:
url_score = 'https://52.116.135.95/dmodel/v1/python-lab/pyscript/titanic-classifier-rf/score'

In [None]:
# NOTE: {{your token}} is a template placeholder, not valid Python. Replace it
# with your authorization token (as a string) before running this cell.
header_online_api = {'Content-Type': 'application/json', 'Cache-Control': 'no-cache', 'Authorization': {{your token}}}

In [None]:
payload_data = {"args":{"input_json":[{"Age":99,"Fare":-0.3573083058,"IsAlone":0,"Pclass":3,"Sex":1}]}}
print(payload_data)

In [None]:
scoring_response_api = requests.post(url_score, json=payload_data, headers=header_online_api, verify=False)

In [None]:
json.loads(scoring_response_api.content)

# Creating the Submission File (Optional only if submitting to Kaggle for test/ranking)

In [None]:
# display training dataset again
train_df = pd.read_csv(os.environ['DSX_PROJECT_DIR']+'/datasets/train_titanic.csv')
train_df.head()

In [None]:
# display test dataset
test_df = pd.read_csv(os.environ['DSX_PROJECT_DIR']+'/datasets/test_titanic.csv')
test_df.head()

In [None]:
# preprocess test dataset for the model to be used
# The steps mirror the training pipeline so the features reach the model in
# the same form (imputed, encoded, scaled, alphabetically ordered) it was fitted on.
test_df['Age'] = test_df[['Age', 'Pclass']].apply(transform_columns, axis = 1)
test_df = test_df.drop(columns = ['Cabin'])
# Fill missing fares with the mean of the raw training fares. df['Fare'] has
# already been standardised by this point, so its mean is no longer a fare,
# and only the 'Fare' column should receive this value.
test_df['Fare'] = test_df['Fare'].fillna(train_df['Fare'].mean())
test_df['IsAlone'] = test_df['SibSp'] + test_df['Parch']
convert_IsAlone(test_df)
test_df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Embarked'], axis = 1, inplace = True)
label_encoder_converter(test_df)
# Reuse the scaler fitted on the training data: the model was trained on
# standardised Age/Fare values, so the test values must be scaled the same way.
test_df[numeric_features] = scaler.transform(test_df[numeric_features])
# Match the alphabetical column order of the training features.
X = test_df.sort_index(axis = 1)
X.head()

In [None]:
# Predict survival for the test passengers and pair every prediction with
# the PassengerId read from the original test file.
y_pred = rf.predict(X)
final_pred = list(y_pred)
passenger_ids = pd.read_csv(os.environ['DSX_PROJECT_DIR']+'/datasets/test_titanic.csv')['PassengerId']
final_sub = passenger_ids.to_frame().assign(Survived = final_pred)
# Uncomment to write the submission file:
#final_sub.to_csv('submission', index = False)
final_sub.head()