<a href="https://colab.research.google.com/github/vintagedeek/mnist/blob/master/kaggle_mnist_cnn_v1_983.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np

from collections import OrderedDict, namedtuple
from itertools import product
import time
from IPython.display import display, clear_output
import json

import torch 
from torchvision.transforms import Normalize
import torch.nn as nn 
import torch.optim as optim
import torch.nn.functional as F

In [16]:
train_url = "https://raw.githubusercontent.com/wehrley/Kaggle-Digit-Recognizer/master/train.csv"
df_train = pd.read_csv(train_url)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [17]:
test_url = 'https://raw.githubusercontent.com/wehrley/Kaggle-Digit-Recognizer/master/test.csv'
df_test = pd.read_csv(test_url)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 167.5 MB


# Data Normalization

In [18]:
df_train_copy = df_train.copy()
df_train_copy = torch.tensor(np.array(df_train_copy), dtype=torch.float32)
pixels_only = torch.narrow(df_train_copy, 1, 1, 784) # start at 1 to 1 +  = 42000 total samples
df_train_copy_mean = pixels_only.sum()/(42000*28*28)
df_train_copy_var = (1 / (42000 * 28 * 28)) * ((pixels_only - df_train_copy_mean)**2).sum()
df_train_copy_std = df_train_copy_var.sqrt()

train_set = df_train_copy[0:29400]
train = torch.narrow(train_set, 1, 1, 784)
train = (train - df_train_copy_mean) / df_train_copy_std
train_labels = torch.narrow(train_set, 1, 0, 1)
train_labels = torch.tensor(train_labels, dtype=torch.int64).squeeze() # they are 2D if you don't squeeze

val_set = df_train_copy[29400:]
val = torch.narrow(val_set, 1, 1, 784)
val = (val - df_train_copy_mean) / df_train_copy_std
val_labels = torch.narrow(val_set, 1, 0, 1)
val_labels = torch.tensor(val_labels, dtype=torch.int64).squeeze() # they are 2D if you don't squeeze

print(df_train_copy_mean, df_train_copy_std, train.mean(), train.std(), val.mean(), val.std())

tensor(33.4023) tensor(78.4047) tensor(-7.0261e-05) tensor(1.0030) tensor(0.0010) tensor(1.0046)


  if sys.path[0] == '':


In [19]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [20]:
val.shape

torch.Size([12600, 784])

In [21]:
train = train.reshape(29400, 1, 28, 28)
val = val.reshape(12600, 1, 28, 28)
print(train.shape, val.shape)

torch.Size([29400, 1, 28, 28]) torch.Size([12600, 1, 28, 28])


In [26]:
validation_set = val.to(device)
validation_labels = val_labels.to(device) 

class RunBuilder():
    @staticmethod
    def get_runs(params):

        Run = namedtuple('Run', params.keys())

        runs = []
        for v in product(*params.values()):
            runs.append(Run(*v))

        return runs

class RunManager():
    def __init__(self):
        self.epoch_count = 0
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.val_correct = 0
        self.epoch_start_time = None
        
        self.run_params = None
        self.run_count = 0
        self.run_data = []
        self.run_start_time = None
        
        self.network = None

       
    def begin_run(self, run, network):
        self.run_start_time = time.time()
        
        self.epoch_count = 0
        self.run_params = run
        self.run_count += 1
        
        self.network = network
        
    def end_run(self):
        self.epoch_count = 0
    
    def begin_epoch(self):
        self.epoch_start_time = time.time()
        self.epoch_count += 1
        self.epoch_loss = 0
        self.epoch_num_correct = 0
        self.val_correct = 0
    
    def end_epoch(self):
        epoch_duration = time.time() - self.epoch_start_time
        run_duration = time.time() - self.run_start_time
        
        loss = self.epoch_loss / len(train)
        train_accuracy = self.epoch_num_correct / len(train)

        val_accuracy = self.val_correct/len(validation_set)

            
        results = OrderedDict()
        results["run"] = self.run_count
        results['epoch'] = self.epoch_count
        results['loss'] = loss
        results['train_accuracy'] = train_accuracy
        results['val_accuracy'] = val_accuracy
        results['epoch duration'] = epoch_duration
        results['run duration'] = run_duration
        for k, v in self.run_params._asdict().items(): results[k] = v
        self.run_data.append(results)
        df = pd.DataFrame.from_dict(self.run_data, orient='columns')
        clear_output(wait=True)
        display(df)
    
    def track_loss(self, loss):
        self.epoch_loss += loss.item() * 10
    
    def track_num_correct(self, preds, labels):
        self.epoch_num_correct += self._get_num_correct(preds, labels)
      
    def track_val_accuracy(self, model):
        with torch.no_grad():
            validation_preds = model(validation_set)
            self.val_correct += validation_preds.argmax(dim=1).eq(validation_labels).sum().item()
   
    @torch.no_grad()
    def _get_num_correct(self, preds, labels):
        return preds.argmax(dim=1).eq(labels).sum().item()
    
    def save(self, fileName): # save in json and csv
        
        pd.DataFrame.from_dict(self.run_data, orient='columns').to_csv(f'{fileName}.csv')
        
        with open(f'{fileName}.json', 'w', encoding='utf-8') as f:
            json.dump(self.run_data, f, ensure_ascii=False, indent=4)


params = OrderedDict(
    lr = [0.01],
    batch_size = [20],
    device = ['cuda'],
    weight_decay = [0.0])

m = RunManager()
for run in RunBuilder.get_runs(params):

    network = nn.Sequential(
        nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1),
        nn.BatchNorm2d(6),
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.ReLU(),
        nn.Flatten(start_dim=1),
        nn.Linear(in_features=6 * 12 * 12, out_features=10),
        nn.Sigmoid()).to(device)
    
    mini_batches = [train[k:k + run.batch_size] for k in range(0, train_labels.shape[0], run.batch_size)]
    mini_labels = [train_labels[k:k + run.batch_size] for k in range(0, train_labels.shape[0], run.batch_size)]
    optimizer = optim.Adam(network.parameters(), lr=run.lr, weight_decay=run.weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    m.begin_run(run, network)
    for epoch in range(25):
        m.begin_epoch()
        for b in range(0, len(mini_batches)):
            images = mini_batches[b].to(device)
            labels = mini_labels[b].to(device)
            preds = network(images) # pass batch
            loss = F.cross_entropy(preds, labels) # calculate loss
            optimizer.zero_grad() # zero gradients
            loss.backward() # calculate gradients
            optimizer.step() # update weights

            m.track_loss(loss)
            m.track_num_correct(preds, labels)
        m.track_val_accuracy(network)
        m.end_epoch()
        scheduler.step()
    m.end_run()

pd.DataFrame.from_dict(m.run_data).sort_values('val_accuracy', ascending=False)[0:15]

Unnamed: 0,run,epoch,loss,train_accuracy,val_accuracy,epoch duration,run duration,lr,batch_size,device,weight_decay
0,1,1,0.811674,0.836633,0.958413,2.880777,2.88078,0.01,20,cuda,0.0
1,1,2,0.749572,0.966973,0.974048,2.820071,5.724782,0.01,20,cuda,0.0
2,1,3,0.746136,0.97415,0.970556,2.778045,8.526617,0.01,20,cuda,0.0
3,1,4,0.744352,0.975476,0.975952,2.790708,11.340928,0.01,20,cuda,0.0
4,1,5,0.743827,0.976667,0.972857,2.7547,14.120637,0.01,20,cuda,0.0
5,1,6,0.741985,0.979626,0.974048,2.812637,16.957066,0.01,20,cuda,0.0
6,1,7,0.741817,0.979592,0.978016,2.765887,19.745248,0.01,20,cuda,0.0
7,1,8,0.741206,0.980782,0.977302,2.807077,22.576421,0.01,20,cuda,0.0
8,1,9,0.741409,0.980374,0.976587,2.794776,25.395977,0.01,20,cuda,0.0
9,1,10,0.740507,0.981905,0.976587,2.789986,28.210655,0.01,20,cuda,0.0


Unnamed: 0,run,epoch,loss,train_accuracy,val_accuracy,epoch duration,run duration,lr,batch_size,device,weight_decay
19,1,20,0.735411,0.987687,0.982302,2.787405,56.375055,0.01,20,cuda,0.0
20,1,21,0.735269,0.988095,0.982063,2.764401,59.165885,0.01,20,cuda,0.0
24,1,25,0.735151,0.987925,0.981984,2.769091,70.395626,0.01,20,cuda,0.0
23,1,24,0.735169,0.987925,0.981984,2.786614,67.598051,0.01,20,cuda,0.0
22,1,23,0.73519,0.988061,0.981905,2.782071,64.782331,0.01,20,cuda,0.0
21,1,22,0.735218,0.988129,0.981905,2.778339,61.972239,0.01,20,cuda,0.0
18,1,19,0.735521,0.987551,0.981587,2.769684,53.560291,0.01,20,cuda,0.0
17,1,18,0.73563,0.987755,0.981508,2.787733,50.763219,0.01,20,cuda,0.0
16,1,17,0.735787,0.987381,0.98127,2.822498,47.946937,0.01,20,cuda,0.0
14,1,15,0.736132,0.987075,0.981111,2.815995,42.276937,0.01,20,cuda,0.0


### Notes
- Adding regularization seems to harm performance.
- Batch size initially optimized at 1,000 with 97% acc (beat lower batch sizes). After optimizing LR at 0.01 (from 0.1), lowered batch to 20 and improved val accuracy from 97.8% to 98.3%

In [27]:
m.save('kaggle_mnist_cnn_v1_best_run') 
# to download, click folder (3rd icon down) on left pane