# OpenCL Model Runner

This notebook assumes a trained FP32 model checkpoint has already been saved under `model_ocl/bin`.

## Setup

In [1]:
# Record which interpreter and Python version produced the outputs below,
# then silence all warnings for the rest of the session.
import sys
import warnings

print(sys.executable, sys.version, sep="\n")

warnings.filterwarnings("ignore")

c:\Users\kevin\Documents\UT\Fa25\ECE 382V Parallel Algorithms\Project\ParallelConvolution\.venv\Scripts\python.exe
3.12.0 (tags/v3.12.0:0fb18b0, Oct  2 2023, 13:03:39) [MSC v.1935 64 bit (AMD64)]


In [2]:
import torch
import pytorch_ocl
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split

import os
import time

from model_ocl.model import SimpleCNN_OCL

In [3]:
# Smoke test: allocate a random 10x10 tensor directly on the OpenCL device and
# print it, to confirm the backend is working before anything else runs.
# `d` is also the device string the hyperparameter cell passes to torch.device().
# NOTE(review): the device index 1 is hard-coded and presumably machine-specific —
# confirm it before running on another machine.
d = "ocl:1"
a = torch.randn(10, 10, device=d)
print(a)

tensor([[-0.3353, -1.9969,  0.4002, -0.8788,  0.5879,  0.2263,  2.3304, -0.4758,
          0.6576, -0.6822],
        [ 0.8969,  0.5318, -1.6782,  0.5136, -1.2366,  0.0815, -0.2332, -0.5884,
          0.2860,  1.3909],
        [-0.5703,  0.2103,  0.6810,  1.4556, -1.4839,  0.8884,  0.1283,  0.3658,
         -1.5041,  1.4942],
        [ 0.5756, -0.1858, -1.8230, -0.3469, -0.2627, -1.7218,  0.9533, -0.1594,
          0.8503, -0.9109],
        [ 0.9801, -0.3053,  0.6675,  0.1577,  0.9398, -2.0231, -0.2284,  0.0171,
          0.9087, -0.6431],
        [-0.2854, -1.4053,  1.3438, -1.7106,  1.0479, -0.5900, -0.6511,  0.7411,
         -0.7643, -1.1028],
        [-0.3865,  1.3572,  0.8877, -0.5633, -0.0361,  0.6555, -0.8055, -0.0862,
          1.9238, -0.6474],
        [-0.6947,  0.1900, -1.3436,  1.0144,  0.4820, -1.3304,  1.8510, -0.5527,
          0.9221,  1.0242],
        [ 0.1837,  0.9953,  1.8329, -0.6770, -1.2779, -0.2813,  1.3593, -0.3489,
         -0.7107, -0.1647],
        [ 0.6120,  

In [4]:
# Location of the saved checkpoint: the directory, the FP32 checkpoint's file
# name, and the joined path that the Run section loads from.
SAVE_DIR = "model_ocl/bin"

FP32_MODEL_NAME = "CNN-MNIST-OCL1-fp32.pt"
FP32_SAVE_PATH = os.path.join(SAVE_DIR, FP32_MODEL_NAME)

In [5]:
# --- Hyperparameters and Setup ---
DEVICE = torch.device(d)   # <-- USE OPENCL GPU (d is the device string set in the backend smoke-test cell)
LEARNING_RATE = 0.001  # learning rate handed to the Adam optimizer
BATCH_SIZE = 512  # batch size used by the train, validation and test DataLoaders
NUM_EPOCHS = 10  # NOTE(review): not referenced in the visible cells; presumably carried over from a training notebook
TRAIN_VAL_SPLIT_RATIO = 0.9  # fraction of the MNIST training set kept for training; the remainder is validation

In [6]:
# --- 2. Load Data and Define DataLoaders ---
def get_dataloaders():
    """Build the train, validation and test DataLoaders for MNIST.

    The MNIST training set (downloaded to ``./data`` if missing) is split into
    train/validation subsets according to ``TRAIN_VAL_SPLIT_RATIO`` using a
    generator seeded with 42, so the split is the same on every call. Every
    image is converted to a tensor and normalized with mean 0.1307 and
    std 0.3081. Only the training loader shuffles.

    Returns:
        tuple: ``(train_loader, val_loader, test_loader)``, each batching
        ``BATCH_SIZE`` samples.
    """
    norm_mean, norm_std = (0.1307,), (0.3081,)
    preprocessing = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(norm_mean, norm_std)]
    )

    # Both MNIST splits share everything except the `train` flag.
    shared_kwargs = dict(root="./data", transform=preprocessing, download=True)
    train_pool = datasets.MNIST(train=True, **shared_kwargs)
    held_out = datasets.MNIST(train=False, **shared_kwargs)

    n_train = int(len(train_pool) * TRAIN_VAL_SPLIT_RATIO)
    n_val = len(train_pool) - n_train
    split_rng = torch.Generator().manual_seed(42)
    train_part, val_part = random_split(
        train_pool, [n_train, n_val], generator=split_rng
    )

    def _loader(dataset, shuffle):
        # Single place where the batch size is applied.
        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle)

    return (
        _loader(train_part, True),
        _loader(val_part, False),
        _loader(held_out, False),
    )

In [7]:
# Report the target device, then build the data loaders, a fresh model on that
# device, the loss function and the optimizer.
print(f"Using device: {DEVICE}")

train_loader, val_loader, test_loader = get_dataloaders()

model = SimpleCNN_OCL()
model = model.to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Per-epoch metric histories; each name gets its own empty list.
train_loss_history, train_acc_history = [], []
val_loss_history, val_acc_history = [], []

Using device: ocl:1


In [8]:
# --- Helper Functions for Comparison ---
# Helper to print model size
def print_model_size(model, label):
    """Print the serialized size, in MB, of ``model``'s ``state_dict``.

    The state dict is written with ``torch.save`` to a scratch file inside a
    private temporary directory. The directory is removed when the ``with``
    block exits, so no ``temp.pt`` is left in the working directory even if
    saving or measuring raises, and concurrent runs cannot overwrite each
    other's scratch file.

    Args:
        model: Object exposing ``state_dict()`` (e.g. an ``nn.Module``).
        label: Text inserted into the printed message to identify the model.
    """
    import tempfile  # local import so this helper does not depend on the import cell

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = os.path.join(scratch_dir, "temp.pt")
        torch.save(model.state_dict(), scratch_path)
        size_mb = os.path.getsize(scratch_path) / (1024 * 1024)
    print(f"Size of {label} model: {size_mb:.2f} MB")

# Helper to evaluate accuracy
def evaluate_model(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader``, in percent.

    The model is switched to eval mode and moved to ``device`` (both changes
    persist after the call). For each batch the predicted class is the index
    of the largest output along dimension 1, which is compared with the labels.

    Args:
        model: Module mapping a batch of images to per-class scores.
        data_loader: Iterable yielding ``(images, labels)`` batches.
        device: Device on which inference is run.

    Returns:
        float: ``100 * correct / total`` over every sample seen.

    Raises:
        ValueError: If ``data_loader`` yields no samples, since accuracy is
            undefined in that case.
    """
    model.eval()
    model.to(device)

    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device)

            # argmax on the output tensor itself; `.data` is unnecessary
            # because gradient tracking is already disabled here.
            predicted = model(images).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("data_loader yielded no samples; accuracy is undefined")
    return 100 * correct / total

# Helper to measure inference speed (Throughput)
def measure_inference_speed(model, data_loader, device):
    """Measure inference throughput of ``model`` in samples per second.

    The model is switched to eval mode and moved to ``device``. The first
    batch of ``data_loader`` is run through the model 10 times as a warm-up,
    then one full pass over ``data_loader`` is timed. The timed region
    includes fetching each batch from the loader and moving it to ``device``.

    Both the warm-up and the timed pass run under ``torch.no_grad()`` so no
    autograd graph is built for any forward call.

    NOTE(review): the clock is read without any explicit device
    synchronization; if the backend executes kernels asynchronously the
    figure may be optimistic — confirm for the device in use.

    Args:
        model: Module to benchmark.
        data_loader: Iterable yielding ``(images, labels)`` batches; it is
            iterated twice (once for the warm-up batch, once for timing).
        device: Device on which inference is run.

    Returns:
        float: Number of samples processed divided by elapsed seconds.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    model.to(device)

    # Use one batch for the warm-up runs.
    try:
        warmup_input = next(iter(data_loader))[0].to(device)
    except StopIteration:
        raise ValueError(
            "data_loader yielded no batches; cannot measure throughput"
        ) from None

    with torch.no_grad():
        print("  Running warm-up...")
        for _ in range(10):
            model(warmup_input)

        print("  Measuring inference throughput...")
        total_samples = 0
        # perf_counter is the high-resolution clock intended for measuring
        # short intervals, unlike the wall clock returned by time.time().
        start_time = time.perf_counter()
        for images, _ in data_loader:
            model(images.to(device))
            # Count the images actually present in this batch.
            total_samples += images.size(0)
        total_time = time.perf_counter() - start_time

    return total_samples / total_time

# Helper to get model size in bytes
def get_model_size(model):
    """Return the size in bytes of ``model``'s ``state_dict`` once serialized.

    The state dict is written with ``torch.save`` to a scratch file inside a
    private temporary directory. The directory is removed when the ``with``
    block exits, so no ``temp_size_calc.pt`` is left in the working directory
    even if saving or measuring raises.

    Args:
        model: Object exposing ``state_dict()`` (e.g. an ``nn.Module``).

    Returns:
        int: Size of the saved file in bytes.
    """
    import tempfile  # local import so this helper does not depend on the import cell

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = os.path.join(scratch_dir, "temp_size_calc.pt")
        torch.save(model.state_dict(), scratch_path)
        return os.path.getsize(scratch_path)

## Run

In [9]:
# --- Load the FP32 model ---
print(f"Loading FP32 model from '{FP32_SAVE_PATH}'...")
fp32_model = SimpleCNN_OCL().to(DEVICE)
# Deserialize the checkpoint onto the CPU so loading does not depend on the
# device it was saved from; load_state_dict then copies the values into the
# parameters that already live on DEVICE.
# NOTE(review): torch.load unpickles the file. Only load checkpoints you trust,
# and consider weights_only=True once confirmed to work with this backend.
fp32_model.load_state_dict(
    torch.load(FP32_SAVE_PATH, map_location="cpu")
)
fp32_model.eval()

# Accuracy is computed on the test split.
fp32_accuracy = evaluate_model(fp32_model, test_loader, DEVICE)
print(f"FP32 Model Accuracy: {fp32_accuracy:.2f}%")

# Throughput is measured over the validation split, not the test split.
print("Measuring FP32 model...")
fp32_throughput = measure_inference_speed(fp32_model, val_loader, DEVICE)
print(f"-> FP32 Throughput: {fp32_throughput:.2f} samples/sec")

Loading FP32 model from 'model_ocl/bin\CNN-MNIST-OCL1-fp32.pt'...
FP32 Model Accuracy: 99.26%
Measuring FP32 model...
  Running warm-up...
  Measuring inference throughput...
-> FP32 Throughput: 3064.53 samples/sec
