# This is the part that I practice using a Pytorch Linear model to predict

## Import clean data sets

In [1]:
import pandas as pd
import numpy as np

X_train = pd.read_csv('X_train.csv').iloc[:, 1:].values
y_train = pd.read_csv('y_train.csv', header=None).iloc[:, 1:].values
X_test = pd.read_csv('X_test.csv').iloc[:, 1:].values
assert len(X_train) == len(y_train)

In [2]:
import torch


def to_tensor_X_y(X, y=None):
    '''Returns a data set of numpy array to X and y torch tensors '''
    X = torch.from_numpy(X).type(torch.float)
    if y is not None:
        y = torch.from_numpy(y[:, -1]).type(torch.float)
        y = torch.reshape(y, (-1, 1))
    return X, y


X_train_tensor, y_train_tensor = to_tensor_X_y(X_train, y_train)
X_test_tensor = to_tensor_X_y(X_test)
assert (y_train_tensor is not None)

In [3]:
from torch.utils.data import TensorDataset, random_split

# Split the training data to training and validation sets
data_train = TensorDataset(X_train_tensor, y_train_tensor)
# data_test = TensorDataset(X_test_tensor)
train_ratio = 0.8
train_size = int(len(data_train) * train_ratio)
val_size = len(data_train) - train_size
train_ds, val_ds = random_split(data_train, [train_size, val_size])

In [4]:
np.unique(y_train)

array([0, 1], dtype=int64)

In [5]:
from torch.utils.data.dataloader import DataLoader

batch_size = 32

# Training sampler and data loader
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size)
# test_loader = DataLoader(data_test, batch_size)

In [6]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

input_size = X_train.shape[1]
output_size = len(np.unique(y_train))


class My_Model(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, input_size // 2)
        self.linear2 = nn.Linear(input_size // 2, input_size // 4)
        self.output = nn.Linear(input_size // 4, output_size)

    def forward(self, X):
        X = F.relu(self.linear1(X))
        X = F.relu(self.linear2(X))
        X = self.output(X)
        return torch.sigmoid(X)

    def training_step(self, batch):
        inputs, targets = batch
        # Generate prediction
        y_pred = self(inputs)
        # Calculate loss
        loss = nn.BCELoss(y_pred, targets)
        return loss

    def validation_step(self, batch):
        inputs, targets = batch
        # Generate prediction
        y_pred = self(inputs)
        # Calculate loss
        loss = nn.BCELoss(y_pred, targets)
        return {'val_loss': loss.detach()}  # Detach = no_grad

    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()  # Combine losses
        return {'val_loss': epoch_loss.item()}

    def epoch_end(self, epoch, result, num_epochs):
        # Print result every 10% of num_epochs
        if (epoch + 1) % (num_epochs // 10) == 0:
            print(f"Epoch [{epoch + 1}], val_loss: {result['val_loss']:,.4f}")

In [7]:
import math
from tqdm import tqdm


def evaluate(model, val_loader):
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)


def fit(epochs, lr, model, train_loader, val_loader, opt_func=optim.Adam):
    history = []
    optimizer = opt_func(model.parameters(), lr)
    last_loss = -math.inf
    patience = epochs // 20
    for epoch in tqdm(range(epochs)):
        trigger_times = 0
        # Training Phase
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase
        result = evaluate(model, val_loader)

        # Early stopping
        if result['val_loss'] > last_loss:
            trigger_times += 1
            if trigger_times >= patience:
                history.append(result)
                return history
        last_loss = result['val_loss']

        model.epoch_end(epoch, result, epochs)
        history.append(result)
    return history

In [8]:
model = My_Model(input_size, output_size)
epochs = int(1e2)
lr = 1e-5
history = fit(epochs, lr, model, train_loader, val_loader)

  0%|          | 0/100 [00:00<?, ?it/s]


RuntimeError: Boolean value of Tensor with more than one value is ambiguous

In [None]:
# Plotting
losses = [x['val_loss'] for x in history]
plt.plot(losses, '-x')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('Loss vs. No. of epochs')

In [None]:
import pandas as pd


# Make predictions using the trained model
def to_csv(dataset, model, file_name):
    preds, actual = [], []
    for X, y in dataset:
        X = X.unsqueeze(0)
        predictions = model(X)
        prediction = predictions[0].detach()
        preds.append([i for i in prediction.flatten().numpy()])
        actual.append([i for i in y.numpy().flatten()])
    preds = [item for sublist in preds for item in sublist]
    actual = [item for sublist in actual for item in sublist]
    predicted_df = pd.DataFrame(data={'predicted_value': preds, 'actual_value': actual})
    predicted_df.to_csv(file_name)

In [None]:
to_csv(test_loader, model, 'Predicted value from Pytorch Linear model.csv')