In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.optim as optim
import optuna
from torch.utils.data import ConcatDataset



In [34]:
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
print(torch.backends.mps.is_available())

if torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple GPU
elif torch.cuda.is_available():
    device = torch.device("cuda") # NVIDIA GPU
else:
    device = torch.device("cpu")  # fallback
print("Using device:", device)



cpu
True
Using device: mps


In [35]:
df = pd.read_csv('fashion-mnist_train.csv')
df.head()
df.shape


(60000, 785)

In [36]:
#train test split data
x = df.iloc[:,1:].values #pixel values starting from col 1
y = df.iloc[:,0].values # y is the label


In [37]:
x_train_full , x_test, y_train_full, y_test = train_test_split(x,y, test_size=0.2, random_state=42)
print("x_train_full shape:", x_train_full.shape)
print("y_train_full shape:", y_train_full.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)



x_train_full shape: (48000, 784)
y_train_full shape: (48000,)
x_test shape: (12000, 784)
y_test shape: (12000,)


In [38]:
x_train, x_val, y_train, y_val = train_test_split(
    x_train_full, y_train_full, test_size=0.1, random_state=42
)

#scaling the features ; vals 0 to 1
x_train = x_train / 255.0
x_test = x_test / 255.0
x_val = x_val / 255.0


In [39]:
from torchvision import transforms

# data augmentation // transforming the training dataset 
train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomRotation(10),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomAffine(0, translate=(0.1,0.1)),
    transforms.ToTensor()
])

test_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor()
])

In [40]:
#create customdataset class
class CustomDataset(Dataset):

    def __init__(self, features, labels,transform = None):
        #convert to pytroch tensors
        self.features = torch.tensor(features, dtype=torch.float32).reshape(-1,1,28,28) ## change -1 
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.transform = transform

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        feature, label = self.features[index], self.labels[index]
        if self.transform:
            feature = self.transform(feature.squeeze(0).numpy()) #apply transformation
        return feature, label


In [41]:
# create train dataset object
train_dataset = CustomDataset(x_train,y_train, transform=train_transform)
#create test dataset object
test_dataset = CustomDataset(x_test,y_test, transform=test_transform)
val_dataset = CustomDataset(x_val,y_val,transform=test_transform)

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, pin_memory=True)


In [43]:
class myNN(nn.Module):
    def __init__(self,num_conv_layers, num_filters, kernel_size, num_fc_layers,fc_layer_size, dropout_rate):
        super().__init__()
        layers=[]
        in_channels = 1 #grayscale img

        #conv layers
        for _ in range(num_conv_layers):
            layers.append(nn.Conv2d(in_channels, num_filters, kernel_size=kernel_size,padding='same'))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm2d(num_filters))
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            in_channels=num_filters #for next layer

        self.features = nn.Sequential(*layers)

        #FCC / ANN:
        fc_layers = [nn.Flatten()]
        input_size = num_filters * (28//(2**num_conv_layers)) **2
        for _ in range(num_fc_layers):
            fc_layers.append(nn.Linear(input_size, fc_layer_size))
            fc_layers.append(nn.ReLU())
            fc_layers.append(nn.Dropout(dropout_rate))
            input_size=fc_layer_size
        fc_layers.append(nn.Linear(input_size,10)) #output layer , 10 classes

        self.classifier = nn.Sequential(*fc_layers)

    def forward(self,x):
        x = self.features(x)
        x = self.classifier(x)
        return x


In [None]:
# optuna obj func:

def objective(trial):
    num_conv_layers = trial.suggest_int('num_conv_layers', 1, 3)
    num_filters = trial.suggest_categorical('num_filters', [16, 32, 64, 128])
    kernel_size = trial.suggest_categorical('kernel_size', [3, 5])
    num_fc_layers = trial.suggest_int('num_fc_layers', 1, 3)
    fc_layer_size = trial.suggest_categorical('fc_layer_size', [64, 128, 256])
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
    optimizer_name = trial.suggest_categorical('optimizer', ['SGD', 'Adam', 'RMSprop'])
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    num_epochs = trial.suggest_int('num_epochs', 10, 30)

    model = myNN(num_conv_layers, num_filters, kernel_size, num_fc_layers,fc_layer_size, dropout_rate).to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)


    #optimizer = optim.SGD(model.parameters(), lr = 0.1 , weight_decay=1e-4)

    if optimizer_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    elif optimizer_name == 'SGD':
       optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    else:
        optimizer =optim.RMSprop(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    #loss func
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()

        for batch_features, batch_labels in train_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)

            optimizer.zero_grad()
            outputs = model(batch_features)
            loss = criterion(outputs,batch_labels)
            loss.backward()

            optimizer.step()

    
    #validation loop // eval
    model.eval()
 
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for batch_features, batch_labels in val_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            outputs = model(batch_features)
            _, predicted = torch.max(outputs, 1)
            val_total += batch_labels.size(0)
            val_correct += (predicted == batch_labels).sum().item()

    val_acc = val_correct / val_total
    print(f"Validation Accuracy: {val_acc:.4f}")
    


   # return test_acc
    return val_acc
    

In [45]:
#run optuna

pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize', pruner = pruner)
study.optimize(objective, n_trials=3)


[I 2025-09-05 16:23:32,268] A new study created in memory with name: no-name-d201cfe3-ca84-47e3-a1fe-abeefd3e8d51
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.2, 0.5)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)


Validation Accuracy: 0.8640


[I 2025-09-05 16:25:03,456] Trial 0 finished with value: 0.8639583333333334 and parameters: {'num_conv_layers': 2, 'num_filters': 16, 'kernel_size': 5, 'num_fc_layers': 2, 'fc_layer_size': 64, 'dropout_rate': 0.3577402833039748, 'weight_decay': 0.0010636238928688735, 'learning_rate': 0.0020469845104683533, 'optimizer': 'RMSprop', 'batch_size': 128, 'num_epochs': 14}. Best is trial 0 with value: 0.8639583333333334.


train accuracy: 0.8448
Validation Accuracy: 0.8894


[I 2025-09-05 16:30:14,013] Trial 1 finished with value: 0.889375 and parameters: {'num_conv_layers': 2, 'num_filters': 128, 'kernel_size': 3, 'num_fc_layers': 3, 'fc_layer_size': 128, 'dropout_rate': 0.49309874513333896, 'weight_decay': 0.0026066014121061028, 'learning_rate': 0.005087549409132183, 'optimizer': 'SGD', 'batch_size': 32, 'num_epochs': 16}. Best is trial 1 with value: 0.889375.


train accuracy: 0.8742
Validation Accuracy: 0.9073


[I 2025-09-05 16:38:42,919] Trial 2 finished with value: 0.9072916666666667 and parameters: {'num_conv_layers': 3, 'num_filters': 64, 'kernel_size': 5, 'num_fc_layers': 2, 'fc_layer_size': 256, 'dropout_rate': 0.43200349107299785, 'weight_decay': 5.158005301677908e-05, 'learning_rate': 0.0013249628145140544, 'optimizer': 'SGD', 'batch_size': 32, 'num_epochs': 30}. Best is trial 2 with value: 0.9072916666666667.


train accuracy: 0.9036


In [None]:
study.best_params


{'num_conv_layers': 3,
 'num_filters': 64,
 'kernel_size': 5,
 'num_fc_layers': 2,
 'fc_layer_size': 256,
 'dropout_rate': 0.43200349107299785,
 'weight_decay': 5.158005301677908e-05,
 'learning_rate': 0.0013249628145140544,
 'optimizer': 'SGD',
 'batch_size': 32,
 'num_epochs': 30}

In [47]:
study.best_value

0.9072916666666667

In [49]:
best_params = study.best_params


# merge train+val for final training
final_train_dataset = ConcatDataset([train_dataset, val_dataset])
final_train_loader = DataLoader(final_train_dataset,
                                batch_size=best_params["batch_size"],
                                shuffle=True)

test_loader = DataLoader(test_dataset, batch_size=best_params['batch_size'], shuffle=False, pin_memory=True)

model = myNN(num_conv_layers=best_params['num_conv_layers'],
    num_filters=best_params['num_filters'],
    kernel_size=best_params['kernel_size'],
    num_fc_layers=best_params['num_fc_layers'],
    fc_layer_size=best_params['fc_layer_size'],
    dropout_rate=best_params['dropout_rate']
).to(device)

if best_params['optimizer'] == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
elif best_params['optimizer'] == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
else:
    optimizer = optim.RMSprop(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])

#loss function
criterion = nn.CrossEntropyLoss()

#training loop
epochs = best_params['num_epochs']
for epoch in range(epochs):
    model.train()
    total_epoch_loss = 0

    for batch_features, batch_labels in final_train_loader:
        batch_features, batch_labels = batch_features.to(device) , batch_labels.to(device)

        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_epoch_loss += loss.item()

    avg_loss = total_epoch_loss / len(final_train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")



    model.eval()
#TRAIN ACCURACY
    train_correct, train_total = 0, 0
    with torch.no_grad():
        for batch_features, batch_labels in final_train_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            outputs = model(batch_features)
            _, predicted = torch.max(outputs, 1)
            train_total += batch_labels.size(0)
            train_correct += (predicted == batch_labels).sum().item()
    train_acc = train_correct / train_total

    #TEST ACCURACY
    test_correct, test_total = 0, 0
    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
            outputs = model(batch_features)
            _, predicted = torch.max(outputs, 1)
            test_total += batch_labels.size(0)
            test_correct += (predicted == batch_labels).sum().item()
    test_acc = test_correct / test_total

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

    #train mode for next epoch
    model.train()


Epoch 1/30, Loss: 1.3444
Epoch 1/30, Loss: 1.3444, Train Acc: 0.7475, Test Acc: 0.7678
Epoch 2/30, Loss: 0.7396
Epoch 2/30, Loss: 0.7396, Train Acc: 0.7964, Test Acc: 0.8088
Epoch 3/30, Loss: 0.6099
Epoch 3/30, Loss: 0.6099, Train Acc: 0.8226, Test Acc: 0.8355
Epoch 4/30, Loss: 0.5391
Epoch 4/30, Loss: 0.5391, Train Acc: 0.8411, Test Acc: 0.8523
Epoch 5/30, Loss: 0.4975
Epoch 5/30, Loss: 0.4975, Train Acc: 0.8495, Test Acc: 0.8573
Epoch 6/30, Loss: 0.4657
Epoch 6/30, Loss: 0.4657, Train Acc: 0.8598, Test Acc: 0.8719
Epoch 7/30, Loss: 0.4382
Epoch 7/30, Loss: 0.4382, Train Acc: 0.8666, Test Acc: 0.8759
Epoch 8/30, Loss: 0.4196
Epoch 8/30, Loss: 0.4196, Train Acc: 0.8708, Test Acc: 0.8807
Epoch 9/30, Loss: 0.4058
Epoch 9/30, Loss: 0.4058, Train Acc: 0.8705, Test Acc: 0.8821
Epoch 10/30, Loss: 0.3861
Epoch 10/30, Loss: 0.3861, Train Acc: 0.8822, Test Acc: 0.8901
Epoch 11/30, Loss: 0.3790
Epoch 11/30, Loss: 0.3790, Train Acc: 0.8838, Test Acc: 0.8901
Epoch 12/30, Loss: 0.3689
Epoch 12/30, 

In [None]:
#results:
# Epoch 28/30, Loss: 0.2829, Train Acc: 0.8969, Test Acc: 0.9042
# Epoch 29/30, Loss: 0.2846
# Epoch 29/30, Loss: 0.2846, Train Acc: 0.9110, Test Acc: 0.9158
# Epoch 30/30, Loss: 0.2805
# Epoch 30/30, Loss: 0.2805, Train Acc: 0.9117, Test Acc: 0.9134