In [69]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import math

In [70]:
class TitanicDataset(Dataset):
  def __init__(self, filepath):
    data = TitanicDataset.clean_data(filepath)

    x = data.drop(columns=['Survived'])
    y = data['Survived']

    scaler = StandardScaler()
    X = scaler.fit_transform(x)

    self.x = torch.tensor(X, dtype=torch.float32)
    self.y = torch.tensor(y.values, dtype=torch.float32)

    self.y = self.y.view(self.y.shape[0], 1)

  def __getitem__(self, index):
    return self.x[index], self.y[index]

  def __len__(self):
    return self.x.shape[0]

  @staticmethod
  def clean_data(filepath):
    data = pd.read_csv(filepath)
    data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

    data['Age'].fillna(data['Age'].mean(), inplace=True)
    # data['Cabin'].fillna('Unknown', inplace=True)
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

    label_columns = ['Embarked', 'Sex']

    for col in label_columns:
      le = LabelEncoder()
      data[col] = le.fit_transform(data[col])

    return data

class LogisticRegression(nn.Module):
  def __init__(self, n_input_features):
    super(LogisticRegression, self).__init__()
    self.linear = nn.Linear(in_features=n_input_features, out_features=1)

  def forward(self, x):
    y_predicted = torch.sigmoid(self.linear(x))
    return y_predicted


In [71]:
# train_dataset = TitanicDataset('/content/data/train.csv')
# # for i in range(len(train_dataset)):
# #   print(train_dataset[i])
# print(train_dataset[0][0].shape[0])
# print(train_dataset[0][1].shape)



In [72]:
# Load Data
train_dataset = TitanicDataset('/content/data/train.csv')
dataloader = DataLoader(dataset=train_dataset, batch_size=25, shuffle=True)

# Model
model = LogisticRegression(train_dataset[0][0].shape[0])

eta = 0.01
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=eta)

n_epoch = 1000

# Train
for epoch in range(n_epoch):
  for i, (input, label) in enumerate(dataloader):
    y_hat = model(input)
    loss = criterion(y_hat, label)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  if epoch % 100 == 0:
    print(f'Epoch: {epoch+1}/{n_epoch}')


Epoch: 1/1000
Epoch: 101/1000
Epoch: 201/1000
Epoch: 301/1000
Epoch: 401/1000
Epoch: 501/1000
Epoch: 601/1000
Epoch: 701/1000
Epoch: 801/1000
Epoch: 901/1000


In [73]:
# Test Data
test_data = train_dataset.clean_data('/content/data/test.csv')

scaler = StandardScaler()
test_data = scaler.fit_transform(test_data)

X_test = torch.tensor(test_data, dtype=torch.float32)
pred = None
with torch.no_grad():
  y_prediction = model(X_test)
  # print(y_prediction)
  pred = y_prediction.round()






In [74]:
# Store as csv
test = pd.read_csv('/content/data/test.csv')
passenger_ids = test['PassengerId']

passenger_ids_np = passenger_ids.values.astype(int).reshape(-1, 1)
prediction_np = pred.numpy().astype(int).reshape(-1, 1)

combined_data = np.concatenate((passenger_ids_np, prediction_np), axis=1)

df = pd.DataFrame(combined_data, columns=['PassengerId', 'Survived'])
pred_path = '/content/data/prediction.csv'

df.to_csv(pred_path, index=False)

  prediction_np = pred.numpy().astype(int).reshape(-1, 1)


In [75]:
pred.shape

torch.Size([418, 1])