<a href="https://colab.research.google.com/github/tisonpatrik/deep_learning_training/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
# Directory holding the three Titanic CSV files; change this one line to run elsewhere.
# NOTE(review): the path is under /content/drive but no drive-mount call is visible in
# this notebook — confirm the Drive is mounted before this cell runs.
DATA_DIR = '/content/drive/MyDrive/python/ML/Titanic'

train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')
gender_submission = pd.read_csv(f'{DATA_DIR}/gender_submission.csv')

In [None]:
# Stack the training rows on top of the test rows and key everything by PassengerId so
# that values from gender_submission can be aligned onto the matching rows.
raw_data = pd.concat([train_data, test_data]).set_index('PassengerId')

# Index a temporary copy instead of mutating gender_submission in place: the in-place
# version raises KeyError when this cell is re-run, because 'PassengerId' has already
# been moved from a column into the index.
# NOTE(review): in the displayed tail the filled-in Survived values follow Sex exactly
# (male -> 0.0, female -> 1.0), so they look like a placeholder prediction rather than
# ground truth — confirm before treating these rows as labelled data.
raw_data.update(gender_submission.set_index('PassengerId'))

# Restore PassengerId as an ordinary column with a fresh 0..n-1 row index.
raw_data = raw_data.reset_index()
raw_data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1304,1305,0.0,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,1.0,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,0.0,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,0.0,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,0.0,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [None]:
# Missing-value count per column (isna is the same check as isnull).
print(raw_data.isna().sum())

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64


In [None]:
# Work on an independent copy so that edits to df can never write through to raw_data.
df = raw_data.copy()

In [None]:
# Keep only rows where both Age and Fare are present (a row is dropped if either is NaN).
# The explicit copy makes df an independent frame, so the column assignments in later
# cells do not run into pandas' "setting a value on a copy of a slice" warning.
df = df.dropna(subset=['Age', 'Fare']).copy()

In [None]:
# Encode Sex as 0/1; any value other than 'male'/'female' passes through unchanged.
df['Sex'] = df['Sex'].replace({'male': 0, 'female': 1})

# Cabin, Ticket and Embarked are not used as features. Embarked is dropped without being
# integer-encoded first: encoding a column that is removed in the same cell has no effect
# on the resulting frame.
# errors='ignore' keeps the drop from raising if the cell is re-run after the columns are gone.
df = df.drop(columns=['Cabin', 'Ticket', 'Embarked'], errors='ignore')

In [None]:
# Histogram of passenger age; nbins sets the requested number of bins and can be tuned.
fig = px.histogram(
    data_frame=df,
    x='Age',
    nbins=10,
    title='Distribution of Age',
)
fig.show()

In [None]:
# Count passengers per ticket class. The two columns are named explicitly because the
# default names produced by value_counts().reset_index() differ between pandas versions
# ('index'/'Pclass' in 1.x, 'Pclass'/'count' in 2.x), which breaks a hard-coded x='index'.
pclass_counts = (
    df['Pclass']
    .value_counts()
    .rename_axis('Ticket Class')
    .reset_index(name='Count')
)

fig = px.bar(pclass_counts, x='Ticket Class', y='Count', color='Ticket Class',
             title='Distribution of Ticket Class',
             color_discrete_sequence=px.colors.qualitative.Pastel)
fig.show()

In [None]:
# Histogram of the raw fare values, with a large requested bin count.
fig = px.histogram(
    data_frame=df,
    x="Fare",
    nbins=500,
    title='Distribution of Fare',
)
fig.show()

In [None]:
# Show the last five rows of the cleaned frame.
df.tail(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare
1300,1301,1.0,3,"Peacock, Miss. Treasteall",1,3.0,1,1,13.775
1302,1303,1.0,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",1,37.0,1,0,90.0
1303,1304,1.0,3,"Henriksson, Miss. Jenny Lovisa",1,28.0,0,0,7.775
1305,1306,1.0,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,108.9
1306,1307,0.0,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,7.25


In [None]:
# Number of rows left after dropping incomplete records.
df.shape[0]

1045

In [None]:
# One-hot encode the ticket class into Pclass_1 / Pclass_2 / Pclass_3 (all levels kept).
# dtype=int pins the indicators to 0/1 integers: newer pandas versions default to bool
# columns, and mixing bool with float columns yields an object array when the frame is
# converted for model training.
df = pd.get_dummies(df, columns=['Pclass'], prefix='Pclass', drop_first=False, dtype=int)
df.tail()

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3
1300,1301,1.0,"Peacock, Miss. Treasteall",1,3.0,1,1,13.775,0,0,1
1302,1303,1.0,"Minahan, Mrs. William Edward (Lillian E Thorpe)",1,37.0,1,0,90.0,1,0,0
1303,1304,1.0,"Henriksson, Miss. Jenny Lovisa",1,28.0,0,0,7.775,0,0,1
1305,1306,1.0,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,108.9,1,0,0
1306,1307,0.0,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,7.25,0,0,1


In [None]:
# Histogram of the SibSp column.
fig = px.histogram(
    data_frame=df,
    x="SibSp",
    nbins=10,
    title='Distribution of Siblings',
)
fig.show()

In [None]:
# Histogram of the Parch column.
fig = px.histogram(
    data_frame=df,
    x="Parch",
    nbins=10,
    title='Distribution of Parents',
)
fig.show()

In [None]:
# Standardise Age directly, and Fare after a log transform.
# NOTE(review): both scalers are fitted on every row of df, i.e. before the
# train/validation/test split further down, so held-out rows contribute to the
# fitted mean and variance.
# NOTE(review): this cell overwrites its own inputs; running it a second time applies
# log1p and the scaling again to already-transformed values.
scaler = StandardScaler()
scaler_fare = StandardScaler()

age_standardised = scaler.fit_transform(df[['Age']])

# log1p computes log(1 + x), so a fare of zero is handled.
fare_logged = np.log1p(df[['Fare']])
fare_standardised = scaler_fare.fit_transform(fare_logged)

df['Age'] = age_standardised
df['Fare'] = fare_standardised

In [None]:
# Age again, now on the standardised scale.
fig = px.histogram(
    data_frame=df,
    x="Age",
    nbins=10,
    title='Distribution of Age',
)
fig.show()

In [None]:
# Fare after the log transform and standardisation. The title names the plotted column
# (it previously read "Distribution of Age", copied from the Age plot).
fig = px.histogram(df, x="Fare", title='Distribution of Fare', nbins=10)
fig.show()

In [None]:
# Move PassengerId from a column into the row index, so it is not picked up as a feature.
df = df.set_index('PassengerId')

In [None]:
# Show the last five rows after scaling and re-indexing.
df.tail(5)

Unnamed: 0_level_0,Survived,Name,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1301,1.0,"Peacock, Miss. Treasteall",1,-1.867005,1,1,-0.393617,0,0,1
1303,1.0,"Minahan, Mrs. William Edward (Lillian E Thorpe)",1,0.497011,1,0,1.48626,1,0,0
1304,1.0,"Henriksson, Miss. Jenny Lovisa",1,-0.128758,0,0,-0.932403,0,0,1
1306,1.0,"Oliva y Ocana, Dona. Fermina",1,0.636071,0,0,1.681403,1,0,0
1307,0.0,"Saether, Mr. Simon Sivertsen",0,0.601306,0,0,-0.996199,0,0,1


In [None]:
# Target: the Survived column. Features: every remaining column except the free-text Name.
y = df['Survived']
X = df.drop(columns=['Name', 'Survived'])

In [None]:
def plotly_violin_plot(dataframe):
    """Display one violin per column of ``dataframe``.

    The frame is melted to long form (a 'Column' name and its 'Normalized' value per row),
    then drawn with an inner box plot and every individual point shown, one colour per column.
    Nothing is returned; the figure is shown directly.
    """
    long_form = dataframe.melt(var_name='Column', value_name='Normalized')

    violin_options = dict(
        x='Column',
        y='Normalized',
        color='Column',
        box=True,
        points="all",
        hover_data=long_form.columns,
    )
    px.violin(long_form, **violin_options).show()

In [None]:
# Compare the value ranges of all feature columns side by side.
plotly_violin_plot(dataframe=X)

In [None]:
# Two-stage split: 80 % of the rows for training, then the held-out 20 % is halved
# into validation and test sets. The fixed random_state makes both splits repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# Sanity check: training features and labels must have the same number of rows.
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")

Shape of X_train: (836, 8)
Shape of y_train: (836,)


In [None]:
def plot_training_progress(training_progress, validation_progress):
    """Plot training and validation loss against the epoch number.

    Args:
        training_progress: sequence of per-epoch training loss values.
        validation_progress: sequence of per-epoch validation loss values; must have the
            same length as ``training_progress`` (pandas raises ValueError otherwise).

    Returns:
        None. The figure is shown directly.
    """
    # Named ``progress`` rather than ``df`` so it does not shadow the notebook-level frame.
    progress = pd.DataFrame({
        'Epoch': list(range(1, len(training_progress) + 1)),
        'Training Loss': training_progress,
        'Validation Loss': validation_progress
    })

    # Two y columns make this a wide-form plot, whose value axis is labelled through the
    # 'value' key (as plot_metrics does); the former 'y' key matched nothing.
    fig = px.line(progress, x='Epoch', y=['Training Loss', 'Validation Loss'],
                  title='Training and Validation Progress', labels={'value': 'Loss'})
    fig.show()

In [None]:
# Logistic regression expressed as a network: one sigmoid unit fed by every feature column.
logistic_regression = tf.keras.Sequential()
logistic_regression.add(
    tf.keras.layers.Dense(units=1, activation='sigmoid', input_shape=(X_train.shape[1],))
)

In [None]:
# Upper bound on training epochs; early stopping normally ends training sooner.
MAX_EPOCHS = 100

def compile_and_fit(model, X_train, y_train, X_val, y_val, patience=2, optimizer=None):
    """Compile ``model`` for binary classification and train it with early stopping.

    Args:
        model: Keras model to compile and fit (it is modified in place).
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels, used for the early-stopping monitor.
        patience: epochs without improvement in ``val_loss`` before training stops.
        optimizer: optimizer to compile with. Defaults to a fresh ``tf.keras.optimizers.SGD()``.

    Returns:
        The History object returned by ``model.fit``.
    """
    # Build the default inside the call so every invocation gets its own optimizer instance.
    if optimizer is None:
        optimizer = tf.keras.optimizers.SGD()

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=patience,
                                                      mode='min')

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                  optimizer=optimizer,
                  metrics=[tf.keras.metrics.BinaryAccuracy()])

    history = model.fit(X_train, y_train, epochs=MAX_EPOCHS,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stopping])
    return history

In [None]:
# Train the logistic-regression baseline and keep its per-epoch history for plotting.
history = compile_and_fit(
    model=logistic_regression,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Score the baseline on the validation and test sets. plot_metrics reads element 0 of each
# result as the loss and element 1 as the accuracy. The former `{}` placeholders were
# overwritten immediately and suggested a dict where an indexable result is stored.
val_performance = logistic_regression.evaluate(X_val, y_val)
performance = logistic_regression.evaluate(X_test, y_test, verbose=0)



In [None]:
def plot_metrics(history, val_performance, performance):
    """Show the training curves followed by a bar chart of the final scores.

    Args:
        history: object whose ``.history`` mapping is turned into a DataFrame; it must
            provide 'loss', 'val_loss', 'binary_accuracy' and 'val_binary_accuracy'.
        val_performance: indexable validation result; element 0 is plotted as the loss
            and element 1 as the accuracy.
        performance: indexable test result with the same layout.

    Returns:
        None. Three figures are shown, in order: loss curves, accuracy curves, final scores.
    """
    per_epoch = pd.DataFrame(history.history)
    per_epoch['epoch'] = range(1, len(per_epoch) + 1)

    # (columns to draw, figure title, value-axis label) for each per-epoch line chart.
    curve_specs = (
        (['loss', 'val_loss'],
         'Training and Validation Loss over Epochs', 'Loss'),
        (['binary_accuracy', 'val_binary_accuracy'],
         'Training and Validation Accuracy over Epochs', 'Accuracy'),
    )
    for columns, title, axis_label in curve_specs:
        curve = px.line(per_epoch, x='epoch', y=columns, title=title,
                        labels={'value': axis_label, 'variable': 'Dataset'})
        curve.show()

    # Final validation and test scores, one row per (metric, dataset) pair.
    val_loss, val_accuracy = val_performance[0], val_performance[1]
    test_loss, test_accuracy = performance[0], performance[1]
    final_scores = pd.DataFrame({
        'Metric': ['Loss', 'Loss', 'Accuracy', 'Accuracy'],
        'Type': ['Validation', 'Test', 'Validation', 'Test'],
        'Value': [val_loss, test_loss, val_accuracy, test_accuracy]
    })

    bars = px.bar(final_scores, x='Metric', y='Value', color='Type', barmode="group",
                  title='Performance on Validation and Test Sets',
                  category_orders={"Metric": ["Loss", "Accuracy"], "Type": ["Validation", "Test"]},
                  color_discrete_map={"Validation": "blue", "Test": "orange"})
    bars.show()

In [None]:
# Training curves and final validation/test scores for the logistic-regression baseline.
plot_metrics(history=history, val_performance=val_performance, performance=performance)

In [None]:
# (rows, feature columns) of the training matrix.
print(f"{X_train.shape}")

(836, 8)


In [None]:
# Layer widths: input features, three hidden layers, and a single output unit.
# The input width is read from the training matrix instead of being hard-coded.
layers_dims = [X_train.shape[1], 20, 7, 5, 1]

model = tf.keras.Sequential()

# First hidden layer; it also declares the input shape.
model.add(tf.keras.layers.Dense(layers_dims[1], activation='relu', input_shape=(layers_dims[0],),
                                kernel_regularizer=tf.keras.regularizers.l2(0.01)))

# Remaining hidden layers: everything between the first hidden layer and the output.
for hidden_units in layers_dims[2:-1]:
    model.add(tf.keras.layers.Dense(hidden_units, activation='relu',
                                    kernel_regularizer=tf.keras.regularizers.l2(0.01)))

# Output layer. The original loop stopped before the last entry of layers_dims, leaving a
# 5-unit ReLU layer as the model output; binary cross-entropy needs one sigmoid probability.
model.add(tf.keras.layers.Dense(layers_dims[-1], activation='sigmoid'))

model.summary()

Model: "sequential_33"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_82 (Dense)            (None, 20)                180       
                                                                 
 dense_83 (Dense)            (None, 7)                 147       
                                                                 
 dense_84 (Dense)            (None, 5)                 40        
                                                                 
Total params: 367 (1.43 KB)
Trainable params: 367 (1.43 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Upper bound on training epochs; early stopping normally ends training sooner.
MAX_EPOCHS = 100

def compile_and_fit(model, X_train, y_train, X_val, y_val, patience=2, optimizer=None):
    """Compile ``model`` for binary classification and train it with early stopping.

    Args:
        model: Keras model to compile and fit (it is modified in place).
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels, used for the early-stopping monitor.
        patience: epochs without improvement in ``val_loss`` before training stops.
        optimizer: optimizer to compile with. Defaults to a fresh ``tf.keras.optimizers.Adam()``.

    Returns:
        The History object returned by ``model.fit``.
    """
    # Build the default inside the call so every invocation gets its own optimizer instance.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam()

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='min')

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                  optimizer=optimizer,
                  metrics=[tf.keras.metrics.BinaryAccuracy()])

    history = model.fit(X_train, y_train, epochs=MAX_EPOCHS,
                        validation_data=(X_val, y_val),
                        callbacks=[early_stopping])
    return history

In [None]:
# Train the multi-layer network and keep its per-epoch history for plotting.
history = compile_and_fit(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

In [None]:
# Score the network that was just trained. This cell previously evaluated
# `logistic_regression`, so the plot that follows paired the new model's training history
# with the baseline's final scores.
# The `{}` placeholders were dropped: they were overwritten immediately.
val_performance = model.evaluate(X_val, y_val)
performance = model.evaluate(X_test, y_test, verbose=0)



In [None]:
# Training curves and final validation/test scores for the most recent training run.
plot_metrics(history=history, val_performance=val_performance, performance=performance)