##### TODO
* Create, load, test Dataset and Dataloaders
* Create an MLP
* Train MLP on Different datasets
    * Dry Beans Dataset
* Use Wandb (or just do this with PyTorch Lightning instead)
* Refactor code to training script, model classes, util, etc.
* Deploy to Streamlit  

In [3]:
%load_ext watermark
%watermark -v -p matplotlib,numpy,pandas,torch

Python implementation: CPython
Python version       : 3.9.5
IPython version      : 8.16.1

matplotlib: not installed
numpy     : 1.26.0
pandas    : 2.1.1
torch     : 2.1.0



### Setup

In [4]:
import torch as t
import torch.nn.functional as F 
from torch.autograd import grad
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import numpy as np
import pandas as pd

from ucimlrepo import fetch_ucirepo 

from dataclasses import dataclass
from pathlib import Path
from collections import Counter
from util.util import compute_total_loss, compute_accuracy

from rich import print

In [5]:
device = t.device("cuda" if t.cuda.is_available() else "cpu")
print(device)

In [6]:
t.manual_seed(123)

<torch._C.Generator at 0x7f32691cf670>

### Import Data and Light EDA

In [7]:
# fetch dataset 
dry_bean_dataset = fetch_ucirepo(id=602) 
  
# data (as pandas dataframes) 
X = dry_bean_dataset.data.features 
y = dry_bean_dataset.data.targets 

In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13611 entries, 0 to 13610
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             13611 non-null  int64  
 1   Perimeter        13611 non-null  float64
 2   MajorAxisLength  13611 non-null  float64
 3   MinorAxisLength  13611 non-null  float64
 4   AspectRatio      13611 non-null  float64
 5   Eccentricity     13611 non-null  float64
 6   ConvexArea       13611 non-null  int64  
 7   EquivDiameter    13611 non-null  float64
 8   Extent           13611 non-null  float64
 9   Solidity         13611 non-null  float64
 10  Roundness        13611 non-null  float64
 11  Compactness      13611 non-null  float64
 12  ShapeFactor1     13611 non-null  float64
 13  ShapeFactor2     13611 non-null  float64
 14  ShapeFactor3     13611 non-null  float64
 15  ShapeFactor4     13611 non-null  float64
dtypes: float64(14), int64(2)
memory usage: 1.7 MB


In [9]:
X.head(10)

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199
4,30140,620.134,201.847882,190.279279,1.060798,0.33368,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.9419,0.999166
5,30279,634.927,212.560556,181.510182,1.171067,0.520401,30600,196.347702,0.775688,0.98951,0.943852,0.923726,0.00702,0.003153,0.85327,0.999236
6,30477,670.033,211.050155,184.03905,1.146768,0.489478,30970,196.988633,0.762402,0.984081,0.85308,0.933374,0.006925,0.003242,0.871186,0.999049
7,30519,629.727,212.996755,182.737204,1.165591,0.51376,30847,197.12432,0.770682,0.989367,0.967109,0.92548,0.006979,0.003158,0.856514,0.998345
8,30685,635.681,213.534145,183.157146,1.165852,0.514081,31044,197.659696,0.771561,0.988436,0.95424,0.925658,0.006959,0.003152,0.856844,0.998953
9,30834,631.934,217.227813,180.897469,1.200834,0.553642,31120,198.139012,0.783683,0.99081,0.970278,0.912125,0.007045,0.003008,0.831973,0.999061


In [10]:
X.columns

Index(['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
       'AspectRatio', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent',
       'Solidity', 'Roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2',
       'ShapeFactor3', 'ShapeFactor4'],
      dtype='object')

In [11]:
X.tail(5)

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4
13606,42097,759.696,288.721612,185.944705,1.552728,0.765002,42508,231.515799,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.642988,0.998385
13607,42101,757.499,281.576392,190.713136,1.476439,0.735702,42494,231.526798,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.676099,0.998219
13608,42139,759.321,281.539928,191.187979,1.472582,0.734065,42569,231.631261,0.729932,0.989899,0.918424,0.82273,0.006681,0.001888,0.676884,0.996767
13609,42147,763.779,283.382636,190.275731,1.489326,0.741055,42667,231.653247,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222
13610,42159,772.237,295.142741,182.204716,1.619841,0.786693,42600,231.686223,0.788962,0.989648,0.88838,0.784997,0.007001,0.00164,0.616221,0.99818


In [12]:
y.sample(15)

Unnamed: 0,Class
7250,HOROZ
5321,CALI
6795,HOROZ
11870,DERMASON
2628,BARBUNYA
9728,SIRA
13459,DERMASON
6296,HOROZ
8341,SIRA
8634,SIRA


Convert a categorical series into a numerical one 

In [13]:
codes, uniques = pd.factorize(y["Class"])

codes

array([0, 0, 0, ..., 6, 6, 6])

In [14]:
codes.dtype

dtype('int64')

Check number of categories

In [15]:
y.nunique()

Class    7
dtype: int64

Check if there are missing values; There should be none based on UCI's page.

In [16]:
# Count missing values per column rather than printing one boolean per row;
# every count is expected to be 0 for this dataset.
print(X.isna().sum())

Trying different scaling options using sklearn and torch

In [17]:
scaler = MinMaxScaler()

scaler.fit_transform(X.values)

array([[0.03405267, 0.05857388, 0.04426214, ..., 0.83304873, 0.75099637,
        0.98061988],
       [0.03550018, 0.07755673, 0.03047881, ..., 0.96731556, 0.88498696,
        0.97497943],
       [0.03825855, 0.06803484, 0.05263303, ..., 0.80094221, 0.7361995 ,
        0.98719586],
       ...,
       [0.09273856, 0.16060492, 0.17638393, ..., 0.42701907, 0.4722397 ,
        0.9430251 ],
       [0.09277272, 0.16365701, 0.17970258, ..., 0.41532984, 0.45691893,
        0.91334232],
       [0.09282396, 0.16944765, 0.20088207, ..., 0.34689176, 0.36476185,
        0.9701623 ]])

In [18]:
# NOTE: t.clamp only clips every entry into [0, 1]; it does NOT rescale each column by
# its own min and max, so it is not a substitute for MinMaxScaler. Columns whose raw
# values are larger than 1 (the leading columns in the output below) saturate at 1.0.
t.clamp(t.tensor(X.values, device=device), min=0, max=1)

tensor([[1.0000, 1.0000, 1.0000,  ..., 0.0031, 0.8342, 0.9987],
        [1.0000, 1.0000, 1.0000,  ..., 0.0036, 0.9099, 0.9984],
        [1.0000, 1.0000, 1.0000,  ..., 0.0030, 0.8259, 0.9991],
        ...,
        [1.0000, 1.0000, 1.0000,  ..., 0.0019, 0.6769, 0.9968],
        [1.0000, 1.0000, 1.0000,  ..., 0.0019, 0.6682, 0.9952],
        [1.0000, 1.0000, 1.0000,  ..., 0.0016, 0.6162, 0.9982]],
       device='cuda:0', dtype=torch.float64)

### Config

In [19]:
@dataclass
class Config:
    """Run settings shared by the dataset, the MLP and the training loop."""

    # Device the dataset tensors are created on; defaults to the notebook-level `device`.
    device: t.device = device
    # Fractions handed to random_split; the same pair is used for both the
    # train/test split and the subsequent train/val split.
    train_pct: float = 0.8
    test_pct: float = 0.2
    # Batch size used by all three DataLoaders.
    batch_size: int = 32
    # Output width of the MLP (the dataset has 7 bean classes).
    n_classes: int = 7
    # Input width of the MLP (the dataset has 16 feature columns).
    n_features: int = 16
    # Widths of the first and second hidden layers.
    layer1_dim: int = 60
    layer2_dim: int = 25
    # Learning rate passed to the Adam optimizer.
    lr: float = 0.005
    # Number of passes over the training loader.
    n_epochs: int = 60

In [20]:
cfg = Config()

print(cfg.device)

### Dataset and DataLoaders

##### Dataset

In [21]:
# TODO: add a reusable transform for normalization (MLPs want normalized inputs); for now the features are min-max scaled inside __init__.

class DryBeansDataset(Dataset):
    """UCI Dry Bean dataset (id=602) as min-max-scaled features and integer labels.

    Args:
        cfg: Run configuration; only ``cfg.device`` is read here, to decide where
            the feature and label tensors are created.
        transform: Optional callable applied to a single feature vector every time
            a sample is fetched. It never modifies the stored features.

    Attributes set in ``__init__``:
        features: float32 tensor of scaled features, one row per sample.
        labels: integer class codes produced by ``pd.factorize``.
        uniques: class names, indexed by the integer code in ``labels``.
        scaler: the fitted ``MinMaxScaler``.
    """

    def __init__(self,
                cfg: "Config",
                transform = None):

        # fetch dataset
        self.dry_bean_dataset = fetch_ucirepo(id=602)

        self.cfg = cfg
        self.transform = transform

        # NOTE(review): the scaler is fitted on every row, i.e. before any
        # train/val/test split is made, so min/max statistics of held-out rows
        # leak into the training features. Fit on the training rows only if a
        # strictly clean evaluation is required.
        self.scaler = MinMaxScaler()
        scaled_features = self.scaler.fit_transform(self.dry_bean_dataset.data.features)

        self.features = t.tensor(scaled_features, device=self.cfg.device, dtype=t.float32)

        # Categorical to Numerical
        codes, uniques = pd.factorize(self.dry_bean_dataset.data.targets['Class'])

        self.uniques = uniques
        self.labels = t.tensor(codes, device=self.cfg.device)

    def __getitem__(self, index: int):
        """Return ``(features, label)`` for one sample, applying ``transform`` if set."""
        sample = self.features[index]
        if self.transform is not None:
            # Transform a copy: writing the result back into self.features (or letting
            # an in-place transform touch the stored row) would re-apply the transform
            # cumulatively on every access / epoch.
            sample = self.transform(sample.clone())
        return sample, self.labels[index]

    def __len__(self):
        """Number of samples in the dataset."""
        return self.labels.shape[0]

In [22]:
# Instantiating the dataset fetches the data and min-max scales all features.
dataset = DryBeansDataset(cfg)

# Two-stage split that reuses the same fractions:
#   1) full data      -> train (train_pct) / test (test_pct)
#   2) train from (1) -> train (train_pct) / val  (test_pct)
# With the default Config (0.8 / 0.2) this is roughly 64% train, 16% val, 20% test.
# No generator is passed, so the split depends on the global torch RNG state
# (seeded earlier with t.manual_seed(123)).
train_dataset, test_dataset = random_split(dataset, [cfg.train_pct, cfg.test_pct])
train_dataset, val_dataset = random_split(train_dataset, [cfg.train_pct, cfg.test_pct])

In [23]:
print(f"Length of Train set: {len(train_dataset)}")
print(f"The length of Val set: {len(val_dataset)}")
print(f"The length of Test set: {len(test_dataset)}")


In [24]:
print(dataset.features.shape)
print(dataset.labels.shape)
print(len(dataset))

print(dataset.features.dtype)
print(dataset.labels.dtype)

##### DataLoaders

In [25]:
# Only the training loader shuffles; val/test keep a fixed order.
# NOTE(review): the dataset tensors are created on cfg.device, so num_workers=0
# (loading in the main process) is presumably required when that device is CUDA —
# confirm before raising it.
train_loader = DataLoader(train_dataset,
                          batch_size=cfg.batch_size,
                          shuffle=True,
                          num_workers=0)
val_loader = DataLoader(val_dataset,
                        batch_size=cfg.batch_size,
                        shuffle=False,
                        num_workers=0)
test_loader = DataLoader(test_dataset,
                        batch_size=cfg.batch_size,
                        shuffle=False,
                        num_workers=0)

##### Testing DataLoaders

In [66]:
# Smoke test: pull the first few batches from the training loader and show their shapes.
num_epochs = 1
for epoch in range(num_epochs):

    # The loop variables are deliberately not called `x` / `y`: `y` is the targets
    # DataFrame loaded at the top of the notebook, and overwriting it would break
    # re-running the earlier EDA cells.
    for batch_idx, (batch_features, batch_labels) in enumerate(train_loader):
        if batch_idx >= 3:
            break
        print(" Batch index:", batch_idx, end="")
        print(" | Batch size:", batch_labels.shape[0], end="")
        print(" | x shape:", batch_features.shape, end="")
        print(" | y shape:", batch_labels.shape)

# Labels of the last batch that was fetched (the one on which the loop stopped).
print("Labels from current batch:", batch_labels)

##### Checking Data Distributions

In [56]:
def count_classes(loader: DataLoader):
    """
        Tally how often each label occurs across all batches of a DataLoader.

        Returns a 2-tuple: the (label, count) pairs sorted by label, and the
        underlying Counter itself.
    """

    # The first element of every batch (the features) is ignored.
    tally = Counter(
        label
        for _, batch_labels in loader
        for label in batch_labels.tolist()
    )

    ordered_pairs = sorted(tally.items())
    return (ordered_pairs, tally)


In [57]:
print("Training label Distribution: ")
print(count_classes(train_loader)[0])


print("Val label Distribution: ")
print(count_classes(val_loader)[0])

print("Test label Distribution: ")
print(count_classes(test_loader)[0])

In [58]:
test_counter = count_classes(test_loader)[1]

#### Zero-rule Baseline

In [60]:
# Zero-rule baseline: always predict the most frequent class of the *training* split
# and score that constant prediction on the test split. Picking the majority class
# from the test set itself would peek at the evaluation data.
# Labels are counted straight from train_dataset (not the shuffling train_loader) so
# that this cell does not draw from the global torch RNG.
train_counter = Counter(int(label) for _, label in train_dataset)
majority_class = train_counter.most_common(1)[0]
majority_label = majority_class[0]
# Class names come from the dataset object that produced the integer labels.
print(f"Majority Class is {dataset.uniques[majority_label]} with label {majority_label}")

majority_acc = test_counter[majority_label] / sum(test_counter.values())
print(f"Zero-rule Baseline Accuracy: {majority_acc}")

In [None]:
type(test_loader)

torch.utils.data.dataloader.DataLoader

### Model

In [61]:
# Done: the MLP takes its layer sizes from cfg (n_features, layer1_dim, layer2_dim, n_classes) instead of separate num_features / num_classes arguments.
# Note that performance does not change that much beyond two hidden layers at least for MLPs

class MLP(t.nn.Module):
    """Fully connected classifier with two ReLU hidden layers, sized from a Config.

    The forward pass returns raw logits (no softmax), one row per input sample.
    """

    def __init__(self, cfg: Config):
        super().__init__()

        self.cfg = cfg

        # Layer widths: input -> hidden 1 -> hidden 2 -> output.
        in_dim = cfg.n_features
        h1_dim = cfg.layer1_dim
        h2_dim = cfg.layer2_dim
        out_dim = cfg.n_classes

        # Linear layers are created in network order so parameter initialization
        # consumes the RNG in the same sequence as the layers appear.
        hidden1 = t.nn.Linear(in_dim, h1_dim)
        hidden2 = t.nn.Linear(h1_dim, h2_dim)
        output = t.nn.Linear(h2_dim, out_dim)

        self.all_layers = t.nn.Sequential(
            hidden1,
            t.nn.ReLU(),
            hidden2,
            t.nn.ReLU(),
            output,
        )

    def forward(self, x):
        # Collapse everything after the batch dimension into one feature axis.
        flat = x.flatten(start_dim=1)
        return self.all_layers(flat)

### Training

In [None]:
# Build a fresh model and optimizer; re-running this cell restarts training from scratch.
model = MLP(cfg).to(device)
optimizer = t.optim.Adam(model.parameters(), lr=cfg.lr)

n_epochs = cfg.n_epochs

# loss_list receives one value per epoch (see the end of the epoch loop).
loss_list = []
# NOTE(review): these two lists are initialized but never filled in this cell.
train_acc_list, val_acc_list = [], []

for epoch in range(n_epochs):

    model = model.train()
    for batch_idx, (features, labels) in enumerate(train_loader):

        # Forward pass: the model returns logits, which cross_entropy consumes directly.
        logits = model(features)
        loss = F.cross_entropy(logits, labels)

        # Standard update step: clear old gradients, backpropagate, apply the update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log every 70th batch (including batch 0) together with the validation loss.
        if not batch_idx % 70:
            # NOTE(review): compute_total_loss comes from util.util and is not shown here;
            # confirm it disables gradients and leaves the model in training mode.
            val_loss = compute_total_loss(model, val_loader, device=cfg.device)

            print(f"Epoch: {epoch+1:03d}/{n_epochs}"
                        f"| Batch: {batch_idx}/{len(train_loader)}"
                        f"| Train Loss: {loss:.4f}"
                        f"| Val Total Loss: {val_loss:.4f}")

    # NOTE(review): this runs once per epoch and records only the loss of the last
    # batch of that epoch, not an epoch average.
    loss_list.append(loss.item())    




In [None]:
loss_list

Compute Accuracy from Scratch

In [78]:

# Accuracy of the trained model on the train, val and test loaders, in that order,
# using the compute_accuracy helper imported from util.util.
# NOTE(review): compute_total_loss is called with device=cfg.device during training but
# compute_accuracy is called without a device here — confirm the helper's default.
print(compute_accuracy(model, train_loader))
print(compute_accuracy(model, val_loader))
print(compute_accuracy(model, test_loader))