# Neural network for digit localization and classification
Input is an image of a number, output is the number on the image
Framework used: PyTorch
For classification, we use a LeNet CNN architecture, trained on MNIST dataset
For localization, we use a simple CNN architecture, trained on dataset composed of images from MNIST dataset, with bounding boxes around digits

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as tfs
import matplotlib.pyplot as plt
import numpy as np
import os
import cv2
import pandas as pd

In [3]:
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

# Load MNIST dataset
data_tfs = tfs.Compose([
    tfs.ToTensor(),
    tfs.Normalize(0.5, 0.5)
])

# install for train and test
root = './'
train_dataset = MNIST(root, train=True,  transform=data_tfs, download=True)
val_dataset  = MNIST(root, train=False, transform=data_tfs, download=True)

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, pin_memory=True)
valid_dataloader = DataLoader(val_dataset, batch_size=128, shuffle=False, pin_memory=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./MNIST\raw\train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./MNIST\raw\train-images-idx3-ubyte.gz to ./MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./MNIST\raw\train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./MNIST\raw\train-labels-idx1-ubyte.gz to ./MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./MNIST\raw\t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./MNIST\raw\t10k-images-idx3-ubyte.gz to ./MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST\raw\t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./MNIST\raw\t10k-labels-idx1-ubyte.gz to ./MNIST\raw



In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
# Define LeNet CNN architecture
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square conv kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.pool2 = nn.MaxPool2d(2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.activation(self.conv1(x))
        x = self.pool1(x)
        x = self.activation(self.conv2(x))
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.fc3(x)
        return x

In [6]:
model_cls = LeNet().to(device)

criterion_cls = nn.CrossEntropyLoss()
optimizer_cls = torch.optim.Adam(model_cls.parameters())

loaders = {"train": train_dataloader, "valid": valid_dataloader}

In [7]:
# Train LeNet CNN
def train(model, criterion, optimizer, loaders, epochs=10, device="cpu"):
    accuracy = {"train": [], "valid": []}
    for epoch in range(epochs):
        for k, dataloader in loaders.items():
            epoch_correct = 0
            epoch_all = 0
            for x_batch, y_batch in dataloader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                if k == "train":
                    model.train()
                    optimizer.zero_grad()
                    outp = model(x_batch)
                else:
                    model.eval()
                    with torch.no_grad():
                        outp = model(x_batch)

                preds = outp.argmax(-1)
                correct = (preds == y_batch).sum()
                all = len(preds)
                epoch_correct += correct.item()
                epoch_all += all
                if k == "train":
                    loss = criterion(outp, y_batch)
                    loss.backward()
                    optimizer.step()
            if k == "train":
                print(f"Epoch: {epoch+1}")
            print(f"Loader: {k}. Accuracy: {epoch_correct/epoch_all}")
            accuracy[k].append(epoch_correct/epoch_all)

In [8]:
train(model_cls, criterion_cls, optimizer_cls, loaders, epochs=10, device=device)

Epoch: 1
Loader: train. Accuracy: 0.8733333333333333
Loader: valid. Accuracy: 0.9574
Epoch: 2
Loader: train. Accuracy: 0.9645333333333334
Loader: valid. Accuracy: 0.9718
Epoch: 3
Loader: train. Accuracy: 0.9759666666666666
Loader: valid. Accuracy: 0.9771
Epoch: 4
Loader: train. Accuracy: 0.9812166666666666
Loader: valid. Accuracy: 0.9834
Epoch: 5
Loader: train. Accuracy: 0.9844
Loader: valid. Accuracy: 0.9836
Epoch: 6
Loader: train. Accuracy: 0.9866833333333334
Loader: valid. Accuracy: 0.9841
Epoch: 7
Loader: train. Accuracy: 0.9887666666666667
Loader: valid. Accuracy: 0.9853
Epoch: 8
Loader: train. Accuracy: 0.9898833333333333
Loader: valid. Accuracy: 0.9861
Epoch: 9
Loader: train. Accuracy: 0.99115
Loader: valid. Accuracy: 0.9869
Epoch: 10
Loader: train. Accuracy: 0.9920333333333333
Loader: valid. Accuracy: 0.9859


In [9]:
# Save model
torch.save(model_cls.state_dict(), "model_cls.pth")

In [27]:
# Create dataset for digit localization
def create_localization_dataset(dataset):
    if not os.path.exists("localization_dataset"):
        os.mkdir("localization_dataset")

    for i in range(len(dataset)):
        img, label = train_dataset[i]
        img = img.squeeze().numpy()
        img = (img * 255).astype(np.uint8)
        img = cv2.resize(img, (28, 28))
        _, thresh = cv2.threshold(img, 127, 255, 0)
        contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
        cnt = contours[0]
        x,y,w,h = cv2.boundingRect(cnt)
        yield i, x, y, w, h

In [28]:
loc_train_dataset = list(create_localization_dataset(train_dataset))
loc_val_dataset = list(create_localization_dataset(val_dataset))

In [None]:
# Save dataset
pd.DataFrame(loc_train_dataset).to_csv("loc_train_dataset.csv", index=False)
pd.DataFrame(loc_val_dataset).to_csv("loc_val_dataset.csv", index=False)

In [46]:
# Load dataset
loc_train_dataset = pd.read_csv("loc_train_dataset.csv").values
loc_val_dataset = pd.read_csv("loc_val_dataset.csv").values

In [47]:
def split_xy(dataset, img_dataset):
    new_dataset = []
    for i, x, y, w, h in dataset:
        new_dataset.append((img_dataset[i][0], (x, y, w, h)))
    return new_dataset

In [48]:
loc_train_dataset = split_xy(loc_train_dataset, train_dataset)
loc_val_dataset = split_xy(loc_val_dataset, val_dataset)

In [49]:
loc_train_dataset[0]

(tensor([[[-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000],
          [-1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
           -1.0000, -1.0000, -1.000

In [50]:
loc_train_dataloader = DataLoader(loc_train_dataset, batch_size=128, shuffle=True, pin_memory=True)
loc_valid_dataloader = DataLoader(loc_val_dataset, batch_size=128, shuffle=False, pin_memory=True)

In [51]:
# Define CNN architecture for digit localization for images of size 140x140
class LocalizationNet(nn.Module):
    def __init__(self):
        super(LocalizationNet, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square conv kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.pool2 = nn.MaxPool2d(2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 4)
        self.activation = nn.ReLU()

    def forward(self, x):
        x = self.activation(self.conv1(x))
        x = self.pool1(x)
        x = self.activation(self.conv2(x))
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.fc3(x)
        return x

In [52]:
model_loc = LocalizationNet().to(device)

criterion_loc = nn.MSELoss()
optimizer_loc = torch.optim.Adam(model_loc.parameters())

loaders = {"train": loc_train_dataloader, "valid": loc_valid_dataloader}

In [53]:
# Train CNN for digit localization
def train_loc(model, criterion, optimizer, loaders, epochs=10, device="cpu"):
    for epoch in range(epochs):
        for k, dataloader in loaders.items():
            epoch_loss = 0
            for x_batch, y_batch in dataloader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                if k == "train":
                    model.train()
                    optimizer.zero_grad()
                    outp = model(x_batch)
                else:
                    model.eval()
                    with torch.no_grad():
                        outp = model(x_batch)

                loss = criterion(outp, y_batch)
                epoch_loss += loss.item()
                if k == "train":
                    loss.backward()
                    optimizer.step()
            if k == "train":
                print(f"Epoch: {epoch+1}")
            print(f"Loader: {k}. Loss: {epoch_loss/len(dataloader)}")

In [54]:
train_loc(model_loc, criterion_loc, optimizer_loc, loaders, epochs=10, device=device)

AttributeError: 'list' object has no attribute 'to'

In [None]:
# Save model
torch.save(model_loc.state_dict(), "model_loc.pth")

In [None]:
# Define function to predict
def predict(model_cls, model_loc, img, device="cpu"):
    img = img.unsqueeze(0).to(device)
    model_cls.eval()
    with torch.no_grad():
        outp_cls = model_cls(img)
    preds_cls = outp_cls.argmax(-1)
    model_loc.eval()
    with torch.no_grad():
        outp_loc = model_loc(img)
    x, y, w, h = outp_loc.squeeze().cpu().numpy()
    x, y, w, h = int(x), int(y), int(w), int(h)
    return preds_cls.item