<a href="https://www.kaggle.com/code/zirklelc/micrograd?scriptVersionId=156394207" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

### Titanic with Micrograd

# Titanic with Micrograd
This notebook uses Andrej Karpathy's [Micrograd](https://github.com/karpathy/micrograd) library to solve the [Titanic challenge](https://www.kaggle.com/competitions/titanic) on Kaggle. I recommend watching his phenomenal YouTube video on [building micrograd](https://www.youtube.com/watch?v=VMj-3S1tku0&t=2s&ab_channel=AndrejKarpathy) for an introduction to neural networks and backpropagation. The notebook is available on [Kaggle](https://www.kaggle.com/code/zirklelc/micrograd) in different versions and with different scores. The current best score of [0.76555 (v9)](https://www.kaggle.com/code/zirklelc/micrograd?scriptVersionId=156800490) is very close to the scores of comparable PyTorch implementations.

## References
Here are some references that helped me with data preparation and implementation:
- https://danielmuellerkomorowska.com/2021/02/03/a-deep-feedforward-network-in-pytorch-for-the-titanic-challenge/
- https://www.kaggle.com/code/jcardenzana/titanic-pytorch
- https://www.kaggle.com/code/kiranscaria/titanic-pytorch
- https://github.com/kurtispykes/Machine-Learning

In [None]:
!pip install micrograd

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from micrograd.engine import Value
from micrograd.nn import MLP

%matplotlib inline

In [None]:
# Load the train and test datasets from the Kaggle input directory.
# `PassengerId` is used as the row index of both frames, so it is not a feature column.
train_df = pd.read_csv("../input/titanic/train.csv", index_col="PassengerId")
test_df = pd.read_csv("../input/titanic/test.csv", index_col="PassengerId")

# Preview the first five rows of the training data
train_df.head(5)

In [None]:
# Preprocess datasets
datasets = [train_df, test_df]

# Calculate the median age and fare from the training dataset only.
# The same values are used for the test dataset so that no test statistics
# leak into the preprocessing.
median_age = train_df["Age"].median()
median_fare = train_df["Fare"].median()

# Iterate over both datasets
for dataset in datasets:
    # Fill NaNs in `Age` and `Fare` with the training medians.
    # The result is assigned back to the column instead of calling
    # `fillna(..., inplace=True)` on the column selection: that chained in-place
    # form is deprecated in pandas 2.x and does not modify the DataFrame when
    # Copy-on-Write is enabled.
    dataset["Age"] = dataset["Age"].fillna(median_age)
    dataset["Fare"] = dataset["Fare"].fillna(median_fare)

    # Convert `Sex` into integer category codes
    dataset["Sex"] = pd.Categorical(dataset["Sex"])
    dataset["Sex"] = dataset["Sex"].cat.codes

    # `Embarked` is deliberately left untouched here: the training set has missing
    # values but the test set does not, so converting it into a categorical feature
    # at this point would give the training dataset more columns than the test dataset.
    # It is handled by the `get_dummies` call later.

    # Drop columns that are not used for prediction.
    # `inplace=True` on the DataFrame itself is required here: rebinding the loop
    # variable would not change `train_df` / `test_df`.
    dataset.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)

train_df.head(5)

In [None]:
# Check the training dataset: `info()` prints each column's dtype and non-null count,
# which shows whether the preprocessing above left any missing values behind
train_df.info()

In [None]:
# Check the test dataset the same way: column dtypes and non-null counts
test_df.info()

In [None]:
# Columns that are expanded into one-hot indicator columns
categorical_cols = ["Pclass", "Sex", "Embarked", "SibSp"]

# Both datasets go through `pd.get_dummies` with exactly the same arguments.
# `dummy_na=True` adds an extra indicator column per feature for missing values.
# https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
train_dummies_df, test_dummies_df = (
    pd.get_dummies(
        frame,
        columns=categorical_cols,
        prefix=categorical_cols,
        dummy_na=True,
        # dtype=int
    )
    for frame in (train_df, test_df)
)

In [None]:
# Preview the one-hot encoded training dataset
train_dummies_df.head(5)

In [None]:
# Preview the one-hot encoded test dataset
test_dummies_df.head(5)

In [None]:
# Verify that the train and test datasets expose identical feature columns
# (same labels in the same order), not merely the same number of columns.
# `Survived` is the first training column and does not exist in the test data,
# so it is skipped with `[1:]`.
assert train_dummies_df.columns[1:].equals(
    test_dummies_df.columns
), "train_dummies_df and test_dummies_df do not have the same columns"

print(f"train_dummies_df.shape: {train_dummies_df.shape}")
print(f"test_dummies_df.shape: {test_dummies_df.shape}")

In [None]:
# Share of rows in the training data labelled as survived, in percent
total_samples = len(train_dummies_df)  # number of training rows
num_survived = train_dummies_df["Survived"].eq(1).sum()  # rows with Survived == 1
rate_survival = num_survived / total_samples * 100

print(f"Survival rate: {rate_survival:.2f}%")

In [None]:
# Standardize datasets
from sklearn.preprocessing import StandardScaler

# The scaler learns mean and standard deviation from the training features only
# and is then applied unchanged to both the training and the test features.
# `iloc[:, 1:]` skips the first column (`Survived`) so the label is never scaled.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
scaler = StandardScaler().fit(train_dummies_df.iloc[:, 1:])
train_scaled = scaler.transform(train_dummies_df.iloc[:, 1:])
test_scaled = scaler.transform(test_dummies_df)

# `transform` returns plain arrays; wrap them in DataFrames again so the
# original row index and column labels are kept.
train_scaled_df = pd.DataFrame(
    data=train_scaled,
    columns=train_dummies_df.columns[1:],
    index=train_dummies_df.index,
)

test_scaled_df = pd.DataFrame(
    data=test_scaled,
    columns=test_dummies_df.columns,
    index=test_dummies_df.index,
)

In [None]:
# Summary statistics of the training dataset before scaling
train_dummies_df.describe()

In [None]:
# Summary statistics of the training features after scaling, for comparison
train_scaled_df.describe()

In [None]:
# Check that the scaled train and test datasets have the same columns
# Note: `Survived` is not part of `train_scaled_df`; it was excluded before scaling
assert train_scaled_df.columns.equals(
    test_scaled_df.columns
), "train_scaled_df and test_scaled_df do not have the same columns"

In [None]:
from sklearn.model_selection import train_test_split

# Take target labels from the unscaled training dataset and input features from the scaled training dataset
train_labels = train_dummies_df["Survived"].to_numpy()
train_features = train_scaled_df.to_numpy()

# Split the training dataset into training and validation datasets.
# `test_size=0.1` holds out 10% of the rows for validation.
# NOTE(review): no `random_state` is passed, so the split (and therefore the
# reported validation metrics) differs between runs — consider fixing a seed.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
X_train, X_validate, y_train, y_validate = train_test_split(
    train_features, train_labels, test_size=0.1
)

# Report the resulting array shapes
print(f"training dataset: {X_train.shape}")
print(f"validation dataset: {X_validate.shape}")

In [None]:
def init_model(n_input, n_hidden=None, n_output=1):
    """Create a micrograd `MLP` with the given layer sizes.

    Args:
        n_input: Number of input features.
        n_hidden: Sizes of the hidden layers, in order (any iterable of ints).
            `None` or an empty sequence builds a network without hidden layers.
        n_output: Number of output neurons.

    Returns:
        The result of `MLP(n_input, [*n_hidden, n_output])`.
    """
    # `None` replaces the former mutable `[]` default, and copying into a new
    # list means tuples are accepted and the caller's sequence is never shared.
    hidden = [] if n_hidden is None else list(n_hidden)
    nodes = hidden + [n_output]

    model = MLP(n_input, nodes)
    return model

In [None]:
import math

# Micrograd doesn't have a sigmoid function
def sigmoid(value):
    """Apply the logistic sigmoid to a micrograd `Value`.

    Args:
        value: The `Value` holding the pre-activation (logit) in `value.data`.

    Returns:
        A new `Value` holding `1 / (1 + exp(-x))`, with `value` as its child and
        a `_backward` closure that accumulates the sigmoid gradient into
        `value.grad`.
    """
    x = value.data

    # Numerically stable evaluation: always exponentiate a non-positive number.
    # The naive `exp(x) / (exp(x) + 1)` raises OverflowError for large positive x.
    if x >= 0:
        t = 1.0 / (1.0 + math.exp(-x))
    else:
        e = math.exp(x)
        t = e / (1.0 + e)
    out = Value(t, (value,), "Sigmoid")

    def _backward():
        # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)), chained with out.grad
        value.grad += t * (1.0 - t) * out.grad

    out._backward = _backward

    return out

In [None]:
def forward(model, features):
    """Run `model` on every row of `features` and squash each output with `sigmoid`.

    Args:
        model: Callable applied to one row of micrograd `Value` inputs at a time.
        features: Array with a 2-element `shape` (rows are samples); anything
            else fails the assertion.

    Returns:
        `np.asarray` of the per-row sigmoid outputs.
    """
    # only 2d inputs are accepted
    n_dims = len(features.shape)
    assert n_dims == 2

    # wrap every feature of every sample in a micrograd Value
    wrapped_rows = [[Value(x) for x in sample] for sample in features]

    # raw model output for each sample
    raw_outputs = [model(sample) for sample in wrapped_rows]

    # squash the raw outputs through the sigmoid
    activated = [sigmoid(raw) for raw in raw_outputs]

    return np.asarray(activated)

In [None]:
def loss(predictions, labels):
    """Return the mean squared error between `predictions` and `labels`.

    Args:
        predictions: 1d array of predicted values (numbers or micrograd Values).
        labels: 1d array of target values, same length as `predictions`.

    Returns:
        The sum of `(prediction - label) ** 2` over all pairs, multiplied by
        `1 / number_of_pairs`.
    """
    # both inputs have to be 1d and equally long
    assert len(predictions.shape) == 1, "predictions must be 1d numpy array"
    assert len(labels.shape) == 1, "labels must be 1d numpy array"
    same_length = len(predictions) == len(labels)
    assert same_length, "predictions and labels must have the same length"

    # Mean squared error is the loss in use. Earlier experiments with an svm
    # "max-margin" loss and an L2 regularization term are no longer part of it.
    squared_errors = [
        (predicted - target) ** 2 for target, predicted in zip(labels, predictions)
    ]
    return sum(squared_errors) * (1.0 / len(squared_errors))


In [None]:
def accuracy(predictions, labels):
    """Return the fraction of predictions that match `labels`.

    Args:
        predictions: 1d array of objects exposing the predicted value as `.data`
            (micrograd Values).
        labels: 1d array of 0/1 targets, same length as `predictions`.

    Returns:
        Number of matching entries divided by `len(labels)`; a prediction counts
        as class 1 only when its value is strictly greater than 0.5.
    """
    # both inputs have to be 1d and equally long
    assert len(predictions.shape) == 1, "predictions must be 1d numpy array"
    assert len(labels.shape) == 1, "labels must be 1d numpy array"
    same_length = len(predictions) == len(labels)
    assert same_length, "predictions and labels must have the same length"

    # pull the raw numbers out of the micrograd Value objects
    scores = np.array([p.data for p in predictions])

    # threshold the scores: above 0.5 -> class 1, otherwise class 0
    predicted_classes = (scores > 0.5).astype(int)

    # count the matches and relate them to the number of samples
    n_correct = (predicted_classes == labels).sum()
    return n_correct / len(labels)

In [None]:
def optimize(model, epoch, loss, *, start_lr=0.01, end_lr=0.001):
    """Run one gradient-descent step with a linearly decaying learning rate.

    Args:
        model: Object providing `zero_grad()` and `parameters()`; every parameter
            must expose `.data` and `.grad`.
        epoch: Tuple `(current_epoch, num_epochs)`, with `current_epoch` counted
            from 0.
        loss: Object providing `backward()`, called to populate the gradients.
        start_lr: Learning rate used in the first epoch.
        end_lr: Learning rate reached in the last epoch; also the lower bound.
            The schedule assumes `start_lr >= end_lr`.
    """
    # unpack epochs
    current_epoch, num_epochs = epoch

    # reset old gradients before back-propagating the new loss
    model.zero_grad()
    loss.backward()

    if num_epochs > 1:
        # linear decay from start_lr (epoch 0) to end_lr (last epoch)
        learning_rate = max(
            start_lr - (start_lr - end_lr) * current_epoch / (num_epochs - 1),
            end_lr,
        )
    else:
        # a single epoch has nothing to decay over; this also avoids dividing by
        # `num_epochs - 1 == 0`
        learning_rate = start_lr

    # plain gradient-descent update
    for p in model.parameters():
        p.data -= learning_rate * p.grad

In [None]:
from sklearn.utils import shuffle


def train_model(model, xy_train, xy_validate, num_epochs=100, batch_size=32):
    """Train `model` with mini-batch gradient descent and record per-epoch metrics.

    Args:
        model: Network handed to `forward` and `optimize`.
        xy_train: Tuple `(features, labels)` used for training.
        xy_validate: Tuple `(features, labels)` used for validation.
        num_epochs: Number of passes over the training data.
        batch_size: Samples per optimisation step. `None` uses the whole training
            set as one batch; values larger than the training set are clamped to
            its size. Samples that do not fill a complete batch are skipped for
            that epoch (the data is reshuffled every epoch).

    Returns:
        Tuple `(train_losses, validate_losses, validate_accuracy)`, each a list
        with one entry per epoch.

    Raises:
        ValueError: If the training set is empty or `batch_size` is not positive.
    """
    # Unpack training and validation data
    x_train, y_train = xy_train
    x_validate, y_validate = xy_validate

    num_samples = x_train.shape[0]
    if num_samples == 0:
        raise ValueError("training set is empty")

    # Calculate number of batches
    if batch_size is None:
        batch_size = num_samples
    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer or None, got {batch_size}"
        )
    # A batch larger than the dataset would give zero batches per epoch, i.e. the
    # model would silently never be trained.
    batch_size = min(batch_size, num_samples)
    num_batches = num_samples // batch_size

    # Losses per epoch
    train_losses = [0] * num_epochs
    validate_losses = [0] * num_epochs
    validate_accuracy = [0] * num_epochs

    print(f"Training on {x_train.shape[0]} samples")
    print(f"Epochs: {num_epochs}")
    print(f"Batches: {num_batches} with size {batch_size}")

    for epoch in range(num_epochs):
        # Shuffle training data at the beginning of each epoch
        x_train, y_train = shuffle(x_train, y_train)

        for batch in range(num_batches):
            # Calculate next batch indices
            start = batch * batch_size
            end = start + batch_size
            x_batch, y_batch = x_train[start:end], y_train[start:end]

            # train on batch
            train_output = forward(model, x_batch)
            train_loss = loss(train_output, y_batch)

            # optimize after each batch
            optimize(model, (epoch, num_epochs), train_loss)

        # forward full training set
        train_output = forward(model, x_train)
        train_loss = loss(train_output, y_train)
        train_losses[epoch] = train_loss.data

        # forward full validation set
        validate_output = forward(model, x_validate)
        validate_loss = loss(validate_output, y_validate)
        validate_losses[epoch] = validate_loss.data

        # calculate accuracy
        validate_accuracy[epoch] = accuracy(validate_output, y_validate)

        print(
            f"Epoch {epoch}, train loss {train_loss.data:.3f}, validate loss {validate_loss.data:.3f}, accuracy {(validate_accuracy[epoch]*100):.3f}"
        )

    print("Training completed.")
    # With zero epochs there are no metrics to summarise
    if num_epochs > 0:
        print(f"Training loss: {train_losses[-1]:.3f}")
        print(f"Validation loss: {validate_losses[-1]:.3f}")
        print(f"Validation accuracy: {(validate_accuracy[-1]*100):.3f}%")

    return train_losses, validate_losses, validate_accuracy

In [None]:
# Define number of input features, hidden layers, and output features.
# The input size is taken from the number of feature columns in `X_train`;
# `num_hidden` lists one entry per hidden layer (here a single layer of 100 neurons).
num_inputs = X_train.shape[1]
num_hidden = [100]
num_outputs = 1

# Initialize the model
model = init_model(num_inputs, num_hidden, num_outputs)

# Show the model and its number of trainable parameters
print(model)
print(f"Parameters: {len(model.parameters())}")

# Define the training parameters
batch_size = 50 # None = full batch
num_epochs = 100

# Train the model; the returned lists hold one value per epoch
train_losses, validate_losses, validate_accuracy = train_model(
    model,
    (X_train, y_train),
    (X_validate, y_validate),
    num_epochs=num_epochs,
    batch_size=batch_size,
)

# Plot the results: validation accuracy in the upper panel
# NOTE(review): no `plt.legend()` is called for this panel, so the 'Accuracy'
# label is not displayed.
plt.subplot(211)
plt.ylabel('Accuracy')
plt.plot(validate_accuracy, label='Accuracy')

# Training and validation loss in the lower panel, sharing the epoch axis label
plt.subplot(212)
plt.ylabel('Loss')
plt.plot(train_losses, label='Training Loss')
plt.plot(validate_losses, label='Validation Loss')
plt.legend()
plt.xlabel("Epoch")

In [None]:
# Scaled test features as a plain numpy array
test_features = test_scaled_df.to_numpy()

# Run the trained model on the full test set and threshold each output
# at 0.5 to obtain a 0/1 survival prediction
test_output = forward(model, test_features)
test_output_binary = [int(prediction.data > 0.5) for prediction in test_output]

# Build the submission: one `Survived` column, indexed like the test dataset
submission_df = pd.DataFrame({"Survived": test_output_binary}, index=test_df.index)
submission_df.to_csv("submission.csv")

# Preview the first ten predictions
submission_df.head(10)