# MNIST Handwritten Digit Recognizer using Deep CNN

# 1. Importing the libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader, TensorDataset, random_split
from torchvision import datasets, transforms

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg  # similar to keras image reading

%matplotlib inline

np.random.seed(2)
torch.manual_seed(2)

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools

# Torch versions
print(f"Torch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
device = 'cuda'

Torch Version: 2.8.0+cu128
CUDA Available: True
Device: cuda


# 2. Preparing the Dataset

Load the data from files

In [2]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

print(df_train.shape)

(42000, 785)


Separate features and target values and convert to pytorch tensor

In [3]:
y_train = torch.tensor(df_train['label'].values)
x_train = torch.tensor(df_train.drop(labels=['label'],axis=1).values)
x_test = torch.tensor(df_test.values)

In [4]:
x_train = torch.nan_to_num(x_train, nan=0)
x_test = torch.nan_to_num(x_test, nan=0)

In [5]:
x_train = x_train/255
x_test = x_test/255

In [6]:
x_train=x_train.view(-1,1,28,28)
x_test = x_test.view(-1,1,28,28)

In [7]:
random_seed=2
x_train, x_cv, y_train, y_cv = train_test_split(x_train, y_train, test_size=0.1, random_state=random_seed)
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=10)
valid_ds = TensorDataset(x_cv, y_cv)
valid_dl = DataLoader(valid_ds, batch_size = 20)

# 3. Model Building

In [8]:
# CNN Architecture is IN -> [[Conv2D -> relu] * 2 -> MaxPool2D -> Dropout] * 2 -> #Flatten -> Dense -> Dropout ->Out
class Lambda(nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        return self.func(x)

model = nn.Sequential(
    # C1: Conv layer
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1,padding=0),
    nn.ReLU(),

    # S2: Avg Pooling
    nn.AvgPool2d(kernel_size=2, stride=2),

    # C3: Conv layer
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1,padding=0),
    nn.ReLU(),

    # S4: Avg Pooling
    nn.AvgPool2d(kernel_size=2, stride=2),

    # Flatten
    nn.Flatten(),

    # Dense Layers
    nn.Linear(16 * 4 * 4, 120),  # after 2 poolings: 28 -> 14 -> 7
    nn.ReLU(),
    nn.Linear(120,84),
    nn.ReLU(),
    nn.Linear(84, 10)
)
print(model)



Sequential(
  (0): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (3): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (4): ReLU()
  (5): AvgPool2d(kernel_size=2, stride=2, padding=0)
  (6): Flatten(start_dim=1, end_dim=-1)
  (7): Linear(in_features=256, out_features=120, bias=True)
  (8): ReLU()
  (9): Linear(in_features=120, out_features=84, bias=True)
  (10): ReLU()
  (11): Linear(in_features=84, out_features=10, bias=True)
)


# 4. Model fitting

In [9]:
loss_func = F.cross_entropy
epoch = 10
def preprocess(x, y):
    return x.to(device), y.to(device)

class WrappedDataLoader:
    def __init__(self, dl, func):
        self.dl = dl
        self.func = func

    def __len__(self):
        return len(self.dl)

    def __iter__(self):
        for b in self.dl:
            yield (self.func(*b))

def loss_batch(model, loss_func, xb, yb, opt=None):
    loss = loss_func(model(xb), yb)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item(), len(xb)
    
def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)

        model.eval()
        with torch.no_grad():
            losses, nums = zip(
                *[loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl]
            )
        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)

        print(epoch, val_loss)


model.to(device)

# Define optimizer (RMSprop)
optimizer = optim.RMSprop(
    model.parameters(),  # trainable parameters of your model
    lr=0.001,
    alpha=0.9,
    eps=1e-08
)

train_dl = WrappedDataLoader(train_dl, preprocess)
valid_dl = WrappedDataLoader(valid_dl, preprocess)

In [10]:
fit(epoch, model, loss_func, optimizer, train_dl, valid_dl)

0 0.12417017679489661
1 0.08994748872750538
2 0.10796662219838275
3 0.09858448090999947
4 0.11602350172746721
5 0.10561334942956305
6 0.11071384791142243
7 0.07840433444547792
8 0.10672140037368513
9 0.07109496178111527


# 5. Predicting the test data

In [11]:
# predict result
x_test = x_test.to(device)
results = torch.argmax(model(x_test), dim=1)
result_numpy = results.detach().cpu().numpy()
df = pd.DataFrame(result_numpy)
df.index = df.index + 1
df.to_csv("submission.csv", index_label="ImageId", header = ["Label"])