In [48]:
!pip install torch torchvision
# !git clone https://github.com/ensemble-core/NdLinear.git
# !cd NdLinear
# !pip install .
# !pip install ndlinear



In [49]:
# !pip install ndlinear --ignore-requires-python
!pip install torch torchvision torchprofile psutil



In [50]:
!python --version

Python 3.13.2


In [51]:
!pip install matplotlib



In [52]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from ndlinear import NdLinear
from torchprofile import profile_macs
import psutil
import time
import matplotlib.pyplot as plt
import numpy as np
import math

# Load and Prepare Dataset
- Use MNIST for simplicity (28x28 grayscale images, 10 classes).
- Code to load and preprocess:

In [53]:

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

trainset = torchvision.datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

testset = torchvision.datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)

test_loader_256 = torch.utils.data.DataLoader(dataset=testset, batch_size=256, shuffle=False)

# transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.2860,), (0.3530,))])
# train_dataset = torchvision.datasets.FashionMNIST(root='./data', train=True, transform=transform, download=True)
# test_dataset = torchvision.datasets.FashionMNIST(root='./data', train=False, transform=transform)
# train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, drop_last=True)
# test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)
# test_loader_256 = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=256, shuffle=False)

In [54]:
train_dataset

Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: ./data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.286,), std=(0.353,))
           )

# Define Models

- Baseline Model: A CNN with nn.Linear layers.
- NdLinear Model: Same architecture, swapping nn.Linear with NdLinear.
- Sample CNN (2 conv layers + 2 linear layers):

In [55]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Model Architecture

In [58]:
class BaselineCNN(nn.Module):
    """
    Lightweight CNN with a single linear layer:
      (3, 32, 32) -> Conv(3->16) -> Pool -> Linear(4096->512) -> FC(512->10)
    """
    def __init__(self, out_dim=10, dropout_rate=0.3):
        super(BaselineCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(2, 2)
        self.linear = nn.Linear(16 * 16 * 16, 512)  # 16 * 16 * 16 = 4096
        self.bn_linear = nn.BatchNorm1d(512)
        self.fc = nn.Linear(512, out_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # x: (B, 3, 32, 32)
        x = self.pool(F.relu(self.bn1(self.conv1(x))))  # (B, 16, 16, 16)
        x = x.view(x.size(0), -1)  # (B, 4096)
        x = F.relu(self.bn_linear(self.linear(x)))  # (B, 512)
        x = self.dropout(x)
        x = self.fc(x)  # (B, 10)
        return x

# NdLinearCNN for CIFAR-100
class NdLinearCNN(nn.Module):
    """
    Lightweight CNN with a single NdLinear layer:
      (3, 32, 32) -> Conv(3->16) -> Pool -> NdLinear((16,16,16)->(8,8,8)) -> FC(512->10)
    """
    def __init__(self, out_dim=10, dropout_rate=0.3):
        super(NdLinearCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(2, 2)
        self.nd = NdLinear(in_shape=(16, 16, 16), out_shape=(8, 8, 8))  # 8*8*8 = 512
        self.bn_nd = nn.BatchNorm2d(8)
        self.fc = nn.Linear(512, out_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # x: (B, 3, 32, 32)
        batch_size = x.size(0)
        x = self.pool(F.relu(self.bn1(self.conv1(x))))  # (B, 16, 16, 16)
        x = self.nd(x.contiguous().view(batch_size, 16, 16, 16))  # (B, 8, 8, 8)
        x = F.relu(self.bn_nd(x))
        x = x.view(x.shape[0], -1)  # (B, 512)
        x = self.dropout(x)
        x = self.fc(x)  # (B, 10)
        return x

# Train and Evaluate
- Train both models for a few epochs (e.g., 5) to compare performance.
- Track accuracy, parameter count, and inference time.
- Training loop:

In [61]:
# Custom weight initialization for NdLinear
# def init_weights(m):
#     if isinstance(m, NdLinear):
#         for layer in m.align_layers:
#             nn.init.orthogonal_(layer.weight)
#             if layer.bias is not None:
#                 nn.init.zeros_(layer.bias)
                
# Training function
def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs=8):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)
    model.train()
    # model.apply(init_weights)
    
    for epoch in range(num_epochs):
        total_loss, correct, total = 0, 0, 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        scheduler.step()
        avg_loss = total_loss / len(train_loader)
        train_acc = 100 * correct / total
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Train Acc: {train_acc:.2f}%')
        
        # Evaluate on test set
        test_acc = evaluate_model(model, test_loader)
        print(f'Test Acc: {test_acc:.2f}%')
    
    return avg_loss, test_acc

# Evaluation function
def evaluate_model(model, test_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total

# Parameter counting
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)



# Memory usage measurement
def measure_memory_usage(model, test_loader):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    process = psutil.Process()
    with torch.no_grad():
        for images, _ in test_loader:
            images = images.to(device)
            model(images)
            memory = process.memory_info().rss / 1024 / 1024
            break
    return memory

In [62]:
# Training and evaluation loop
models = [
    (BaselineCNN_1M, NdLinearCNN_1M, "CNN"),
    # Add (BaselineCNN_3M, NdLinearCNN_3M, "3M") and (BaselineCNN_5M, NdLinearCNN_5M, "5M") as needed
]

baseline_accs, ndlinear_accs = [], []
baseline_params, ndlinear_params = [], []
baseline_gflops, ndlinear_gflops = [], []
baseline_memory, ndlinear_memory = [], []

for baseline_cls, ndlinear_cls, size in models:
    print(f"\nTraining {size} models...")
    
    # Baseline model
    baseline_model = baseline_cls()
    criterion = nn.CrossEntropyLoss()
    baseline_optimizer = optim.Adam(baseline_model.parameters(), lr=0.001)
    print("\nBaselineCNN:")
    baseline_loss, baseline_acc = train_model(baseline_model, train_loader, test_loader, criterion, baseline_optimizer)
    
    # NdLinear model
    ndlinear_model = ndlinear_cls()
    ndlinear_optimizer = optim.Adam(ndlinear_model.parameters(), lr=0.001)
    # ndlinear_optimizer = optim.Adam([
    #     {'params': conv_bn_params, 'lr': 0.001},
    #     {'params': ndlinear_params_list, 'lr': 0.001}
    # ])
    print("\nNdLinearCNN:")
    ndlinear_loss, ndlinear_acc = train_model(ndlinear_model, train_loader, test_loader, criterion, ndlinear_optimizer)
    
    # Metrics
    baseline_param = count_parameters(baseline_model)
    ndlinear_param = count_parameters(ndlinear_model)
    baseline_mem = measure_memory_usage(baseline_model, test_loader)
    ndlinear_mem = measure_memory_usage(ndlinear_model, test_loader)
    
    baseline_accs.append(baseline_acc)
    ndlinear_accs.append(ndlinear_acc)
    baseline_params.append(baseline_param)
    ndlinear_params.append(ndlinear_param)
    baseline_memory.append(baseline_mem)
    ndlinear_memory.append(ndlinear_mem)
    
    print(f"\nBaseline {size} - Acc: {baseline_acc:.2f}%, Params: {baseline_param}, GFLOPs: {baseline_gflop:.2f}, Memory: {baseline_mem:.2f} MB")
    print(f"NdLinear {size} - Acc: {ndlinear_acc:.2f}%, Params: {ndlinear_param}, GFLOPs: {ndlinear_gflop:.2f}, Memory: {ndlinear_mem:.2f} MB\n")



Training CNN models...

BaselineCNN:
Epoch [1/8], Loss: 0.3650, Train Acc: 86.65%
Test Acc: 89.84%
Epoch [2/8], Loss: 0.2396, Train Acc: 91.10%
Test Acc: 90.70%
Epoch [3/8], Loss: 0.1930, Train Acc: 92.83%
Test Acc: 91.76%
Epoch [4/8], Loss: 0.1356, Train Acc: 95.01%
Test Acc: 92.39%
Epoch [5/8], Loss: 0.1138, Train Acc: 95.79%
Test Acc: 92.67%
Epoch [6/8], Loss: 0.0931, Train Acc: 96.60%
Test Acc: 91.85%
Epoch [7/8], Loss: 0.0588, Train Acc: 98.01%
Test Acc: 93.07%
Epoch [8/8], Loss: 0.0454, Train Acc: 98.53%
Test Acc: 93.06%

NdLinearCNN:
Epoch [1/8], Loss: 0.4582, Train Acc: 84.07%
Test Acc: 87.67%
Epoch [2/8], Loss: 0.2864, Train Acc: 89.60%
Test Acc: 89.12%
Epoch [3/8], Loss: 0.2493, Train Acc: 91.05%
Test Acc: 90.19%
Epoch [4/8], Loss: 0.2066, Train Acc: 92.65%
Test Acc: 90.83%
Epoch [5/8], Loss: 0.1940, Train Acc: 93.01%
Test Acc: 90.94%
Epoch [6/8], Loss: 0.1838, Train Acc: 93.53%
Test Acc: 90.43%
Epoch [7/8], Loss: 0.1600, Train Acc: 94.35%
Test Acc: 91.45%
Epoch [8/8], Loss:

# Measure Metrics
- Parameter Count

In [47]:
print(f'Baseline Parameters: {count_parameters(baseline_model)}')
print(f'NdLinear Parameters: {count_parameters(ndlinear_model)}')

Baseline Parameters: 1631306
NdLinear Parameters: 30506
