In [166]:
#data objects
#training loop
#eval 

import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim




In [167]:
# Fix the PyTorch RNG seed so weight initialisation and shuffling are repeatable.
torch.manual_seed(42)

# Diagnostics: what a plain CUDA-or-CPU check would choose, and whether the
# Apple MPS backend is usable on this machine.
has_cuda = torch.cuda.is_available()
has_mps = torch.backends.mps.is_available()
print(torch.device('cuda' if has_cuda else 'cpu'))
print(has_mps)

# Final choice, in priority order: Apple GPU (mps), NVIDIA GPU (cuda), CPU fallback.
backend = "mps" if has_mps else ("cuda" if has_cuda else "cpu")
device = torch.device(backend)
print("Using device:", device)



cpu
True
Using device: mps


In [168]:
# Load the dataset from the CSV in the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='fmnist_small.csv')
df.head(n=5)


Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,9,0,0,0,0,0,0,0,0,0,...,0,7,0,50,205,196,213,165,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,142,142,142,21,0,3,0,0,0,0
3,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,0,0,0,0,0,0,0,0,0,...,213,203,174,151,188,10,0,0,0,0


In [169]:
# Separate targets from inputs ahead of the train/test split:
# column 0 holds the class label, every remaining column is a pixel value.
y = df.iloc[:, 0].values
x = df.iloc[:, 1:].values


In [170]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [171]:
# Scale the features by 1/255 so the values are meant to land between 0 and 1.
# NOTE: this reassigns x_train / x_test, so re-running the cell divides by 255 again.
x_train, x_test = x_train / 255.0, x_test / 255.0

In [172]:
#create customdataset class
class CustomDataset(Dataset):
    """In-memory dataset pairing one feature row with one integer class label.

    Args:
        features: array-like of numeric rows; stored as a float32 tensor.
        labels: array-like of class indices; stored as an int64 (long) tensor.

    Raises:
        ValueError: if `features` and `labels` do not contain the same number
            of items (previously this was accepted silently and only surfaced
            later, or not at all, when indexing).
    """

    def __init__(self, features, labels):
        # convert to PyTorch tensors
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

        # __len__ is driven by the features alone, so a length mismatch would
        # otherwise go unnoticed until a label lookup fails or is skipped.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the `(features, label)` pair stored at `index`."""
        return self.features[index], self.labels[index]


In [173]:
# Wrap each split in a CustomDataset so it can be handed to a DataLoader.
train_dataset = CustomDataset(features=x_train, labels=y_train)
test_dataset = CustomDataset(features=x_test, labels=y_test)

In [174]:
#define NN class

class myNN(nn.Module):
    """Fully connected classifier with a configurable stack of hidden blocks.

    Every hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout. A final
    Linear layer maps to `output_dim` raw scores; no softmax is applied here.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of output scores (classes).
        num_hidden_layers: how many hidden blocks to stack; 0 yields a single
            Linear layer from `input_dim` to `output_dim`.
        neurons_per_layer: width of every hidden block.
        dropout_rate: dropout probability used in every hidden block.
    """

    def __init__(self, input_dim, output_dim, num_hidden_layers, neurons_per_layer,dropout_rate):
        super().__init__()

        layers = []
        in_features = input_dim  # width of whatever feeds the next Linear layer

        for _ in range(num_hidden_layers):
            layers.append(nn.Linear(in_features, neurons_per_layer))
            layers.append(nn.BatchNorm1d(neurons_per_layer))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
            in_features = neurons_per_layer

        # Output layer: sized from the tracked width rather than from
        # neurons_per_layer, so it is also correct when there are no hidden blocks.
        layers.append(nn.Linear(in_features, output_dim))

        self.model = nn.Sequential(*layers)  # * unpacks the list into positional args

    def forward(self, x):
        """Run `x` through the layer stack and return the raw output scores."""
        return self.model(x)


In [175]:
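# Disabled: fixed epochs / learning rate. Both are now set inside objective()
# (sampled per trial) and again in the final training cell.
# epochs = 100
# learning_rate = 0.1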


In [182]:

# Objective function: hyperparameter search with Optuna

def _evaluate_accuracy(model, loader):
    """Return the fraction of samples from `loader` that `model` labels correctly.

    Runs under torch.no_grad() and moves every batch to the global `device`.
    The caller is expected to have switched the model to eval mode first.
    """
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_features, batch_labels in loader:
            # move data to the selected device
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

            outputs = model(batch_features)
            _, predicted = torch.max(outputs, 1)

            total += batch_labels.shape[0]
            correct += (predicted == batch_labels).sum().item()

    return correct / total


def objective(trial):
    """Train one myNN configuration sampled from `trial` and return its test accuracy.

    Uses the notebook globals `train_dataset`, `test_dataset`, `device` and `myNN`.

    Args:
        trial: Optuna trial used to sample the hyperparameters below.

    Returns:
        Accuracy (0..1) on `test_dataset`; this is the value Optuna optimises.

    NOTE: because the returned score is computed on `test_dataset`, the test
    split takes part in hyperparameter selection. A separate validation split
    would be needed for an unbiased final estimate.
    """
    # next hyperparameter values from the search space, suggested by Optuna
    num_hidden_layers = trial.suggest_int("num_hidden_layers", 1,5)
    neurons_per_layer = trial.suggest_int('neurons_per_layer',8,128, step=8)
    epochs = trial.suggest_int('epochs', 10,50,step=10)
    learning_rate = trial.suggest_float('learning_rate', 1e-5,1e-1,log=True)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1)
    batch_size = trial.suggest_categorical('batch_size',[16,32,64,128])
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'SGD', 'RMSprop'])
    weight_decay = trial.suggest_float('weight_decay',1e-5,1e-3,log=True)

    # Parameter set kept for reference from the original notebook:
    # {'num_hidden_layers': 2,
    #  'neurons_per_layer': 112,
    #  'epochs': 30,
    #  'learning_rate': 0.0010005061612763301,
    #  'dropout_rate': 0.4,
    #  'batch_size': 32,
    #  'optimizer': 'Adam',
    #  'weight_decay': 0.00019590116325074537}

    # Only request pinned (page-locked) host memory when the target is CUDA;
    # on other devices DataLoader cannot use it and warns about the flag.
    pin_memory = device.type == 'cuda'
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=pin_memory)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

    # model init
    input_dim = 784  # 28x28 picture, flattened
    output_dim = 10  # 10 classes

    model = myNN(input_dim, output_dim, num_hidden_layers, neurons_per_layer, dropout_rate)
    model.to(device)

    # loss and optimizer selection (anything other than Adam/SGD falls back to RMSprop)
    criterion = nn.CrossEntropyLoss()
    optimizer_cls = {'Adam': optim.Adam, 'SGD': optim.SGD}.get(optimizer_name, optim.RMSprop)
    optimizer = optimizer_cls(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # training loop
    for epoch in range(epochs):
        # make sure dropout / batch-norm are in training behaviour
        model.train()
        for batch_features, batch_labels in train_loader:
            # move data to the selected device
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

            # forward pass
            outputs = model(batch_features)

            # calculate loss
            loss = criterion(outputs, batch_labels)

            # backward pass
            optimizer.zero_grad()
            loss.backward()

            # update weights
            optimizer.step()

    # evaluation: dropout off, batch-norm uses its running statistics
    model.eval()

    # TEST ACCURACY
    test_acc = _evaluate_accuracy(model, test_loader)
    print(f"Test Accuracy: {test_acc:.4f}")

    # TRAIN ACCURACY to see if it overfits
    train_acc = _evaluate_accuracy(model, train_loader)
    print(f"train accuracy: {train_acc:.4f}")

    return test_acc

In [177]:
import optuna

# Maximise the value returned by objective(). The sampler is seeded so the
# sequence of suggested hyperparameters does not change from one kernel
# restart to the next (TPE is also what create_study uses when no sampler
# is given, so the search algorithm itself is unchanged).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)


[I 2025-09-03 17:13:13,037] A new study created in memory with name: no-name-d523864f-7af9-40d2-8f51-0f8512983b8d


In [183]:
# Run 10 more trials on the existing study.
study.optimize(objective, n_trials=10)

# A bare expression in the middle of a cell is evaluated and thrown away, so
# the best score is printed explicitly; the best parameters stay as the cell's
# last expression and are rendered as its output.
print(f"Best value: {study.best_value:.4f}")
study.best_params

Test Accuracy: 0.4333


[I 2025-09-03 17:22:01,705] Trial 4 finished with value: 0.43333333333333335 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 56, 'epochs': 30, 'learning_rate': 1.6287448407982486e-05, 'dropout_rate': 0.1, 'batch_size': 32, 'optimizer': 'SGD', 'weight_decay': 6.342657815408323e-05}. Best is trial 4 with value: 0.43333333333333335.


train accuracy: 0.4358


[I 2025-09-03 17:22:02,930] Trial 5 finished with value: 0.19583333333333333 and parameters: {'num_hidden_layers': 2, 'neurons_per_layer': 112, 'epochs': 10, 'learning_rate': 9.55130036090169e-05, 'dropout_rate': 0.4, 'batch_size': 128, 'optimizer': 'SGD', 'weight_decay': 9.830040622224442e-05}. Best is trial 4 with value: 0.43333333333333335.


Test Accuracy: 0.1958
train accuracy: 0.2046
Test Accuracy: 0.8208


[I 2025-09-03 17:22:25,366] Trial 6 finished with value: 0.8208333333333333 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 56, 'epochs': 30, 'learning_rate': 0.0017433009083455172, 'dropout_rate': 0.2, 'batch_size': 16, 'optimizer': 'SGD', 'weight_decay': 3.786037165493035e-05}. Best is trial 6 with value: 0.8208333333333333.


train accuracy: 0.8858


[I 2025-09-03 17:22:38,438] Trial 7 finished with value: 0.6108333333333333 and parameters: {'num_hidden_layers': 4, 'neurons_per_layer': 24, 'epochs': 40, 'learning_rate': 0.0007831608343386735, 'dropout_rate': 0.30000000000000004, 'batch_size': 64, 'optimizer': 'SGD', 'weight_decay': 0.0004038543748289501}. Best is trial 6 with value: 0.8208333333333333.


Test Accuracy: 0.6108
train accuracy: 0.6506
Test Accuracy: 0.7875


[I 2025-09-03 17:23:28,979] Trial 8 finished with value: 0.7875 and parameters: {'num_hidden_layers': 3, 'neurons_per_layer': 40, 'epochs': 50, 'learning_rate': 0.0065266026854516185, 'dropout_rate': 0.5, 'batch_size': 16, 'optimizer': 'SGD', 'weight_decay': 4.064651477973213e-05}. Best is trial 6 with value: 0.8208333333333333.


train accuracy: 0.8071
Test Accuracy: 0.6675


[I 2025-09-03 17:23:38,247] Trial 9 finished with value: 0.6675 and parameters: {'num_hidden_layers': 1, 'neurons_per_layer': 120, 'epochs': 10, 'learning_rate': 0.07292469605213642, 'dropout_rate': 0.5, 'batch_size': 16, 'optimizer': 'Adam', 'weight_decay': 4.3926634148893444e-05}. Best is trial 6 with value: 0.8208333333333333.


train accuracy: 0.6744
Test Accuracy: 0.6692


[I 2025-09-03 17:24:06,478] Trial 10 finished with value: 0.6691666666666667 and parameters: {'num_hidden_layers': 3, 'neurons_per_layer': 24, 'epochs': 50, 'learning_rate': 0.0014115702172650697, 'dropout_rate': 0.5, 'batch_size': 32, 'optimizer': 'SGD', 'weight_decay': 1.3647737406630543e-05}. Best is trial 6 with value: 0.8208333333333333.


train accuracy: 0.7004
Test Accuracy: 0.7833


[I 2025-09-03 17:24:40,856] Trial 11 finished with value: 0.7833333333333333 and parameters: {'num_hidden_layers': 4, 'neurons_per_layer': 80, 'epochs': 40, 'learning_rate': 0.016694309114653434, 'dropout_rate': 0.1, 'batch_size': 32, 'optimizer': 'RMSprop', 'weight_decay': 0.0001786024273150858}. Best is trial 6 with value: 0.8208333333333333.


train accuracy: 0.8296


[I 2025-09-03 17:24:56,941] Trial 12 finished with value: 0.6391666666666667 and parameters: {'num_hidden_layers': 2, 'neurons_per_layer': 40, 'epochs': 50, 'learning_rate': 0.07462100244134426, 'dropout_rate': 0.30000000000000004, 'batch_size': 64, 'optimizer': 'Adam', 'weight_decay': 0.0007824612778337822}. Best is trial 6 with value: 0.8208333333333333.


Test Accuracy: 0.6392
train accuracy: 0.6677


[I 2025-09-03 17:25:01,816] Trial 13 finished with value: 0.2941666666666667 and parameters: {'num_hidden_layers': 5, 'neurons_per_layer': 72, 'epochs': 20, 'learning_rate': 5.854218073765489e-05, 'dropout_rate': 0.5, 'batch_size': 128, 'optimizer': 'Adam', 'weight_decay': 0.0006255731282252999}. Best is trial 6 with value: 0.8208333333333333.


Test Accuracy: 0.2942
train accuracy: 0.2921


{'num_hidden_layers': 1,
 'neurons_per_layer': 56,
 'epochs': 30,
 'learning_rate': 0.0017433009083455172,
 'dropout_rate': 0.2,
 'batch_size': 16,
 'optimizer': 'SGD',
 'weight_decay': 3.786037165493035e-05}

In [184]:
# Hyperparameters for the final training run.
# NOTE(review): these are fixed by hand rather than read from study.best_params —
# presumably copied from an earlier search; confirm they are the intended ones.
num_hidden_layers = 2
neurons_per_layer = 112
epochs = 30
learning_rate = 0.0010005061612763301
dropout_rate = 0.4
batch_size = 32
optimizer_name = 'Adam'
weight_decay = 0.00019590116325074537

# Only request pinned (page-locked) host memory when the target is CUDA;
# on other devices DataLoader cannot use it and warns about the flag.
use_pinned_memory = device.type == 'cuda'
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=use_pinned_memory)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=use_pinned_memory)

# Model: 784 input features, 10 output classes
input_dim = 784
output_dim = 10

model = myNN(input_dim, output_dim, num_hidden_layers, neurons_per_layer, dropout_rate).to(device)

# loss
criterion = nn.CrossEntropyLoss()

# optimizer (anything other than Adam/SGD falls back to RMSprop)
if optimizer_name == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
elif optimizer_name == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
else:
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# training loop: one pass over train_loader per epoch, reporting the mean batch loss
for epoch in range(epochs):
    model.train()  # dropout / batch-norm in training behaviour
    total_epoch_loss = 0

    for batch_features, batch_labels in train_loader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_epoch_loss += loss.item()

    # average over the number of batches, not the number of samples
    avg_loss = total_epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")


# final evaluation on the held-out test split
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch_features, batch_labels in test_loader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
        outputs = model(batch_features)
        _, predicted = torch.max(outputs, 1)  # index of the highest score per row
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

test_accuracy = correct / total
print(f"Final Test Accuracy: {test_accuracy:.4f}")


Epoch 1/30, Loss: 1.1362
Epoch 2/30, Loss: 0.7224
Epoch 3/30, Loss: 0.6375
Epoch 4/30, Loss: 0.5920
Epoch 5/30, Loss: 0.5491
Epoch 6/30, Loss: 0.5307
Epoch 7/30, Loss: 0.5045
Epoch 8/30, Loss: 0.4885
Epoch 9/30, Loss: 0.4751
Epoch 10/30, Loss: 0.4439
Epoch 11/30, Loss: 0.4428
Epoch 12/30, Loss: 0.4151
Epoch 13/30, Loss: 0.4127
Epoch 14/30, Loss: 0.3893
Epoch 15/30, Loss: 0.3982
Epoch 16/30, Loss: 0.3706
Epoch 17/30, Loss: 0.3776
Epoch 18/30, Loss: 0.3673
Epoch 19/30, Loss: 0.3619
Epoch 20/30, Loss: 0.3529
Epoch 21/30, Loss: 0.3494
Epoch 22/30, Loss: 0.3350
Epoch 23/30, Loss: 0.3253
Epoch 24/30, Loss: 0.3220
Epoch 25/30, Loss: 0.3084
Epoch 26/30, Loss: 0.3127
Epoch 27/30, Loss: 0.2891
Epoch 28/30, Loss: 0.3031
Epoch 29/30, Loss: 0.2989
Epoch 30/30, Loss: 0.3031
Final Test Accuracy: 0.8525
