In [1]:
import pandas as pd
import torch
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Prefer Apple's Metal (MPS) backend when PyTorch reports it as available;
# otherwise run everything on the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [3]:
# Load the input table for this notebook; later cells read and modify `df`.
df = pd.read_csv(filepath_or_buffer="../excel_files/dataset_01.csv")

In [4]:
# Learn the Fare min/max first, then overwrite the column with its rescaled
# values. The fitted `scaler` is kept as a notebook-level name.
scaler = MinMaxScaler()
scaler.fit(df[['Fare']])

df['Fare'] = scaler.transform(df[['Fare']])

In [5]:
# Split the frame by whether Age is known: rows without an age are the ones
# to be filled in later, rows with an age supply the training examples.
age_missing = df.loc[df['Age'].isna()]
age_not_missing = df.loc[df['Age'].notna()]

# Feature matrix and regression target for the age model.
X = age_not_missing.loc[:, ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Alone']]
y = age_not_missing.loc[:, 'Age']

In [6]:
# Convert features and target to float32 tensors and move them to `device`.
X_tensor = torch.tensor(X.to_numpy(), dtype=torch.float32).to(device)
y_tensor = torch.tensor(y.to_numpy(), dtype=torch.float32).to(device)

# Sanity check: shapes, rank of the target, and number of rows in each tensor.
(
    X_tensor.shape,
    y_tensor.shape,
    len(y_tensor.shape),
    len(X_tensor),
    len(y_tensor),
)

(torch.Size([1046, 6]), torch.Size([1046]), 1, 1046, 1046)

In [7]:
# Hold out 10% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor,
    y_tensor,
    test_size=0.1,
    random_state=42,
)

# Sanity check: split sizes, device placement and tensor shapes.
(
    len(X_train),
    len(y_train),
    len(X_test),
    len(y_test),
    X_train.device,
    X_train.shape,
    y_train.shape,
)

(941,
 941,
 105,
 105,
 device(type='mps', index=0),
 torch.Size([941, 6]),
 torch.Size([941]))

In [8]:
class AgePredictionModel(nn.Module):
    """Fully connected regressor: features -> 128 -> 64 -> 32 -> outputs, with ReLU between layers.

    Args:
        in_features: Number of input features per sample. When omitted, it is
            taken from the notebook-level ``X_tensor`` (``X_tensor.shape[1]``),
            which keeps the original zero-argument construction working.
        out_features: Number of values predicted per sample. Defaults to 1
            (a single regression target).
    """

    def __init__(self, in_features=None, out_features=1):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: infer the width from the training
            # tensor defined earlier in the notebook.
            in_features = X_tensor.shape[1]
        self.layer_stack = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=32),
            nn.ReLU(),
            # The output width is an explicit count. It was previously derived
            # from the *rank* of the target tensor, which only happened to be 1.
            nn.Linear(in_features=32, out_features=out_features),
        )

    def forward(self, x):
        """Run ``x`` of shape (batch, in_features) through the stack; returns (batch, out_features)."""
        return self.layer_stack(x)

In [9]:
# Seed before construction so the initial weights are reproducible, then
# build the network and place it on `device` in one step.
torch.manual_seed(42)
age_prediction_model = AgePredictionModel().to(device)
age_prediction_model

AgePredictionModel(
  (layer_stack): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [10]:
# Mean absolute error as the training objective, optimised with Adam.
loss_fn = nn.L1Loss()
optimizer = torch.optim.Adam(
    params=age_prediction_model.parameters(),
    lr=0.01,
)

In [11]:
epochs = 10000
log_every = 1000  # evaluate on the held-out split and report every `log_every` epochs

for epoch in range(epochs):
    # --- one optimisation step on the whole training tensor ---
    age_prediction_model.train()
    # Drop only the trailing output dimension: (N, 1) -> (N,). A bare squeeze()
    # would also collapse N == 1 to a 0-d tensor, which no longer matches the
    # 1-D target and would be broadcast inside the loss.
    y_pred = age_prediction_model(X_train).squeeze(-1)
    loss = loss_fn(y_pred, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # The test-set forward pass is only needed for reporting, so skip it on
    # the epochs that print nothing.
    if (epoch + 1) % log_every != 0:
        continue

    # --- evaluation on the held-out split (no gradients tracked) ---
    age_prediction_model.eval()
    with torch.inference_mode():
        test_pred = age_prediction_model(X_test).squeeze(-1)
        test_loss = loss_fn(test_pred, y_test)

    print(f'Epoch [{epoch + 1}/{epochs}] | Train Loss: {loss.item():.4f} | Test Loss: {test_loss.item():.4f}')

Epoch [1000/10000] | Train Loss: 8.4007 | Test Loss: 8.9438
Epoch [2000/10000] | Train Loss: 8.1544 | Test Loss: 8.9353
Epoch [3000/10000] | Train Loss: 8.0393 | Test Loss: 8.9240
Epoch [4000/10000] | Train Loss: 7.9927 | Test Loss: 8.9474
Epoch [5000/10000] | Train Loss: 7.9678 | Test Loss: 8.8721
Epoch [6000/10000] | Train Loss: 7.7974 | Test Loss: 8.9666
Epoch [7000/10000] | Train Loss: 7.6231 | Test Loss: 8.9931
Epoch [8000/10000] | Train Loss: 7.4716 | Test Loss: 8.9239
Epoch [9000/10000] | Train Loss: 7.7711 | Test Loss: 8.9128
Epoch [10000/10000] | Train Loss: 7.4353 | Test Loss: 8.8581


In [12]:
# Build the feature tensor for the rows whose Age is missing, using the same
# six columns that were selected for training.
X_missing = age_missing.loc[:, ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Alone']]
X_missing_tensor = torch.tensor(X_missing.to_numpy(), dtype=torch.float32).to(device)

# Predict without tracking gradients, then bring the result back to the CPU
# as a flat NumPy array (one value per row).
age_prediction_model.eval()
with torch.inference_mode():
    predicted_age_tensor = age_prediction_model(X_missing_tensor)
    predicted_age = predicted_age_tensor.cpu().numpy().flatten()
predicted_age.shape

(263,)

In [13]:
# Write the predictions into the rows whose Age is NaN (positional, in row order).
df.loc[df['Age'].isna(), 'Age'] = predicted_age

In [14]:
# Per-column count of missing values.
df.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
Alone          0
dtype: int64

In [15]:
# Persist the frame with imputed ages; the index is not written.
df.to_csv(path_or_buf='../excel_files/dataset_02.csv', index=False)

In [16]:
# Reload the file written by the previous step, replacing the in-memory `df`.
df = pd.read_csv(filepath_or_buffer='../excel_files/dataset_02.csv')

In [17]:
# Rescale Age with its own MinMaxScaler. Refitting the shared `scaler` here
# would discard the Fare min/max it learned earlier, making Fare impossible to
# inverse-transform; a dedicated instance keeps both fits available.
age_scaler = MinMaxScaler()
df['Age'] = age_scaler.fit_transform(df[['Age']])

In [18]:
# Persist the frame after Age has been rescaled; the index is not written.
df.to_csv(path_or_buf='../excel_files/dataset_03.csv', index=False)