## Setup

In [40]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import matplotlib.pyplot as plt
import seaborn as sns
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset

## Data Loading

In [38]:
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
mnist_testset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

## Preprocessing

In [39]:
# Transform the training data
X_train = mnist_trainset.data.float() / 255.0  # Normalize and convert to float
X_train = X_train.view(X_train.shape[0], -1)   # Flatten the images
y_train = mnist_trainset.targets               # Labels

# Transform the test data
X_val = mnist_testset.data.float() / 255.0    # Normalize and convert to float
X_val = X_val.view(X_val.shape[0], -1)        # Flatten the images
y_val = mnist_testset.targets                 # Labels

## Stochastic Gradient Descent from Scratch

In [45]:
class BaseClassificationModel(nn.Module):
    def __init__(self):
        super().__init__()

    def fit(self, X_train, y_train, X_val, y_val, num_epochs, learning_rate, batch_size=64):
        # Creating DataLoader for batching
        train_dataset = TensorDataset(X_train, y_train)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        train_losses, val_losses = [], []
        for epoch in range(num_epochs):
            for X_batch, y_batch in train_loader:
                # Forward pass
                y_pred_train = self.forward(X_batch)
                loss_train = self.loss(y_pred_train, y_batch)

                # Zero gradients
                if self.w.grad is not None:
                    self.w.grad.zero_()
                if self.b.grad is not None:
                    self.b.grad.zero_()

                # Backward pass
                loss_train.backward()

                # Update parameters
                with torch.no_grad():
                    self.w -= learning_rate * self.w.grad
                    self.b -= learning_rate * self.b.grad

            # Validation
            with torch.no_grad():
                y_pred_val = self.forward(X_val)
                loss_val = self.loss(y_pred_val, y_val)

            # Store losses
            train_losses.append(loss_train.item())
            val_losses.append(loss_val.item())

            # Print loss every few epochs
            if epoch % 10 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {loss_train.item()}, Validation Loss: {loss_val.item()}")

        # Plot the training and validation losses
        plt.plot(train_losses, label='Training Loss')
        plt.plot(val_losses, label='Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training and Validation Loss')
        plt.legend()
        plt.show()

## Logistic Regression from Scratch

### Single Batch

In [167]:
num_classes = y_train.unique().shape[0]

batch_size = 64
X_batch = X_train[:batch_size]
y_batch = y_train[:batch_size]

W = torch.randn(size=(X_batch.shape[1], num_classes), requires_grad=True)
b = torch.zeros(size=(1,1), requires_grad=True)

O = X_batch @ W + b

def softmax(X):
    X_exp = torch.exp(X)
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition

y_pred = softmax(O)

#def loss(y_pred, y_train):



In [159]:
y_pred.shape

torch.Size([64, 10])

In [160]:
nn.functional.one_hot(y_batch).shape

torch.Size([64, 10])

In [149]:
y_pred * torch.log(y_batch)

RuntimeError: The size of tensor a (10) must match the size of tensor b (64) at non-singleton dimension 1

In [146]:
torch.round(y_pred * 1000) / 1000

tensor([[0.0000, 0.0000, 0.8930, 0.0010, 0.0000, 0.0000, 0.0000, 0.0000, 0.1060,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8230, 0.0000, 0.1750, 0.0020,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.3970, 0.0000, 0.0000, 0.1020, 0.5000, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0050, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.9940,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0010, 0.0000, 0.0000, 0.0000, 0.9140, 0.0850,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0390, 0.0000, 0.0000, 0.0000, 0.9590, 0.0020,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.6270, 0.0000, 0.0000, 0.0000, 0.3730, 0.0000,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.0170, 0.0000, 0.0000, 0.0000, 0.0020, 0.9810,
         0.0000],
        [0.0000, 0.0000, 0.0000, 0.9750, 0.0000, 0.0000, 0.0000, 0.0170, 0.0070,
         0.0000],
        [0.0000, 0.0000, 0.0020, 0.0000, 0.0020, 0.2530, 0.0030, 0.1860, 0.5550,
         0.0000],
        [0

In [124]:
O.shape

torch.Size([64, 10])

In [117]:
torch.exp(O).sum(axis=1, keepdims=True).shape

torch.Size([64, 1])

In [114]:
X = O 
X_exp = torch.exp(X)
X_exp.sum(1, keepdims=True).shape

torch.Size([64, 1])

In [110]:
torch.exp(O).sum(axis=0)

tensor([1.6460e+03, 6.1084e+04, 5.0361e+04, 1.9378e+04, 6.9996e+10, 2.7343e+06,
        1.8150e+07, 3.1560e+05, 1.5547e+07, 1.3126e+10],
       grad_fn=<SumBackward1>)

In [109]:
torch.exp(O) / torch.exp(O).sum(axis=0)

tensor([[4.9459e-07, 2.9707e-05, 2.7486e-14, 2.0638e-09, 5.4338e-05, 1.8297e-03,
         6.9229e-16, 4.5132e-11, 3.5651e-11, 1.9131e-15],
        [3.4086e-04, 3.4287e-07, 3.1337e-14, 4.3773e-05, 4.3026e-10, 2.1644e-05,
         1.4578e-07, 3.4217e-10, 1.2311e-05, 8.6288e-07],
        [1.4064e-04, 1.1862e-04, 1.6303e-03, 1.5537e-10, 5.9690e-04, 3.4065e-11,
         8.7321e-04, 5.2101e-06, 2.0020e-08, 2.0865e-05],
        [5.7586e-09, 1.0304e-06, 3.7472e-01, 1.3343e-02, 1.2079e-06, 7.7167e-07,
         3.8771e-08, 1.2865e-01, 1.1037e-07, 9.4395e-08],
        [5.1089e-08, 4.3240e-06, 1.9610e-09, 6.6191e-13, 6.6747e-09, 4.3662e-11,
         2.0770e-15, 2.6961e-07, 2.1812e-04, 2.0547e-07],
        [5.7922e-08, 1.3082e-07, 6.9564e-09, 1.3455e-12, 4.7819e-12, 1.0093e-04,
         1.5863e-11, 9.5883e-08, 2.6991e-08, 2.5376e-06],
        [6.3449e-07, 4.2344e-02, 9.3502e-04, 4.1483e-04, 5.1299e-06, 1.1537e-07,
         1.2720e-12, 7.8954e-07, 4.1178e-06, 1.0019e-14],
        [3.1386e-09, 1.3324

In [101]:
O.shape

torch.Size([64, 10])

In [102]:
torch.exp(O).sum(axis=1)

tensor([7.9480e+10, 1.5522e+10, 6.3992e+05, 1.2818e+05, 1.0198e+08, 3.4120e+05,
        1.0586e+08, 3.8687e+11, 3.4325e+04, 2.4640e+04, 4.1829e+10, 1.2867e+04,
        3.5797e+06, 5.2261e+09, 1.5880e+04, 9.4111e+06, 7.0931e+07, 7.1171e+07,
        3.9531e+06, 1.9748e+05, 3.6325e+10, 9.0758e+13, 1.2530e+04, 2.2933e+05,
        9.4702e+05, 1.3110e+09, 7.6559e+04, 2.7408e+12, 3.6465e+15, 4.0158e+05,
        4.4650e+06, 9.1181e+07, 4.1548e+06, 1.3397e+05, 1.2018e+08, 1.9002e+03,
        8.4377e+09, 1.7782e+10, 4.0375e+13, 5.8197e+08, 1.8379e+04, 7.0598e+08,
        7.6529e+03, 6.5472e+05, 2.4880e+05, 1.4689e+07, 5.6751e+05, 1.4192e+03,
        8.9766e+02, 5.2856e+11, 7.1630e+03, 2.6258e+14, 6.1695e+11, 1.4877e+05,
        8.0811e+10, 4.5880e+10, 6.9909e+15, 1.7057e+04, 2.1248e+07, 6.5082e+05,
        2.3695e+12, 2.1055e+03, 3.0712e+09, 1.4262e+16],
       grad_fn=<SumBackward1>)

In [95]:
torch.exp(O).sum(axis=0)

tensor([5.4329e+12, 7.2418e+10, 1.1214e+05, 1.7907e+01, 1.0330e+12, 6.4195e+08,
        5.3815e+06, 4.8606e+06, 6.7338e+02, 2.2498e+05],
       grad_fn=<SumBackward1>)

In [93]:
torch.exp(o).sum(axis=0)

tensor([8613.8633], grad_fn=<SumBackward1>)

In [86]:
O.apply_(softmax)

RuntimeError: Can't call apply_() on Variable that requires grad. Use var.detach().apply_() instead.

In [75]:

X = X_train[:64]
y = y_train[:64]

w = torch.randn(X.shape[1], 1, requires_grad=True)
b = torch.zeros(1, 1, requires_grad=True)

o = X @ w + b

def softmax(o):
    return torch.exp(o) / torch.exp(o).sum()

y_pred = softmax(o)


In [74]:
def softmax(o):
    return torch.exp(o) / torch.exp(o).sum()

In [70]:
o

tensor([[ -1.7284],
        [  5.3626],
        [  9.6239],
        [ -1.8063],
        [  0.4426],
        [  3.1868],
        [ -2.0599],
        [ 16.1163],
        [ -0.7369],
        [  6.7454],
        [  5.7246],
        [  3.8791],
        [ -2.4204],
        [  0.6376],
        [ -2.7021],
        [  8.7005],
        [ -2.1716],
        [ -5.8198],
        [  3.8899],
        [ -3.4341],
        [  9.1908],
        [  1.2324],
        [ -2.6439],
        [ -4.0557],
        [ -0.9578],
        [  2.4787],
        [ -0.9881],
        [  5.9406],
        [  4.6929],
        [  4.0552],
        [ -8.2735],
        [ -0.4614],
        [ 13.2897],
        [  2.2910],
        [ 10.2118],
        [  1.0290],
        [  2.2115],
        [  8.1164],
        [ -8.0071],
        [ 10.7568],
        [ -3.2584],
        [  2.6659],
        [ -8.5159],
        [ -1.4384],
        [ -7.9096],
        [ -1.7909],
        [ -4.1755],
        [ -0.4762],
        [  1.4304],
        [ 11.3068],


In [None]:
class LogisticRegression(BaseClassificationModel):
    def __init__(self, in_features):
        super().__init__()
        self.w = torch.randn(in_features, 1, requires_grad=True)
        self.b = torch.zeros(1, 1, requires_grad=True)

    def forward(self, X):
        y_pred = X @ self.w + self.b
        return y_pred
    
    def loss(self, y_pred, y_true):
        loss = torch.mean((y_true - y_pred) ** 2)
        return loss

In [55]:
X @ w + b

tensor([[ 5.2412],
        [-4.8160],
        [11.7902],
        ...,
        [-0.7309],
        [-4.7391],
        [-7.6389]], grad_fn=<AddBackward0>)