In [1]:
### General libraries useful for python ###

import os
import sys
import argparse
from tqdm import tqdm
import json
import random
import pickle
import copy

In [10]:
### Finding where you clone your repo, so that code upstream can work automatically ####
machine_path = os.getcwd()
work_dir = os.getcwd()
print('Your working directory is :%s'%work_dir)

Your working directory is :/net/storage001.ib.cluster/om2/user/smadan/Harvard_BAI/assignment_1


In [11]:
### Libraries for visualizing our results and data ###
from PIL import Image
import matplotlib.pyplot as plt

In [12]:
### Import PyTorch ###
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim

In [8]:
### Making helper code under the folder res available. This includes loaders, models, etc. ###
sys.path.append('%s/res/'%work_dir)
from models.models import get_model
from loader.loader import get_loader

Models are being loaded from: /net/storage001.ib.cluster/om2/user/smadan/Harvard_BAI/assignment_1
Loaders are being loaded from: /net/storage001.ib.cluster/om2/user/smadan/Harvard_BAI/assignment_1


In [7]:
### This code base uses Weights and Biases (wandb.ai) for result visualization. ###
### Please make an account at wandb.ai, and follow the steps to login to your account here: ###
import wandb
wandb.login()

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mspandanmadan[0m (use `wandb login --relogin` to force relogin)


True

In [13]:
### Specifying settings/hyperparameters for our code below ###
wandb_config = {}
wandb_config['batch_size'] = 10
wandb_config['base_lr'] = 0.01
wandb_config['model_arch'] = 'CustomCNN'
wandb_config['num_classes'] = 10
wandb_config['run_name'] = 'TEST_1'
wandb_config['use_gpu'] = 1
wandb_config['num_epochs'] = 2
wandb_config['work_dir'] = work_dir

In [9]:
### Load MNIST which can be done easily with PyTorch. The first time you run it, dataset gets downloaded.###
if not os.path.isdir('%s/datasets'%work_dir):
    os.mkdir('%s/datasets'%work_dir)

    
data_transforms = {}
data_transforms['train'] = torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))])

data_transforms['test'] = torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))])
    
mnist_dataset = {}
mnist_dataset['train'] = torchvision.datasets.MNIST('%s/datasets'%work_dir, train = True, download = True, transform = data_transforms['train'])
mnist_dataset['test'] = torchvision.datasets.MNIST('%s/datasets'%work_dir, train = False, download = True, transform = data_transforms['test'])

In [10]:
train_loader = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST('datasets/', train=True, download=True,
                             transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ])),
  batch_size=10, shuffle=True)

In [11]:
data_loaders = {}
data_loaders['train'] = torch.utils.data.DataLoader(mnist_dataset['train'], batch_size = wandb_config['batch_size'], shuffle = True)
data_loaders['test'] = torch.utils.data.DataLoader(mnist_dataset['test'], batch_size = wandb_config['batch_size'], shuffle = False)

In [60]:
data_sizes = {}
data_sizes['train'] = len(mnist_dataset['train'])
data_sizes['test'] = len(mnist_dataset['test'])

In [15]:
model = get_model(wandb_config['model_arch'], wandb_config['num_classes'])

In [16]:
# for data in data_loaders['train']:
#     inputs, targets = data
#     break

# inputs = inputs.cuda()

# inputs.shape

# outputs = model_arch(inputs)

# outputs.shape

# targets.shape

In [61]:
def train_model(model, criterion, optimizer, dset_loaders, dset_sizes, configs):
    print('Starting training epoch...')
    best_model = model
    best_acc = 0.0

    model.train()
    running_loss = 0.0
    running_corrects = 0
    iters = 0
    
    for data in tqdm(dset_loaders['train']):
        inputs, labels = data
        if configs.use_gpu:
            inputs = inputs.float().cuda()
            labels = labels.long().cuda()
        else:
            print('WARNING: NOT USING GPU!')
            inputs = inputs.float()
            labels = labels.long()


        optimizer.zero_grad()
        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)

        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()
        
        iters += 1
        running_loss += loss.item()
        running_corrects += torch.sum(preds == labels.data)
        wandb.log({"train_running_loss": running_loss/float(iters*len(labels.data))})
        wandb.log({"train_running_corrects": running_corrects/float(iters*len(labels.data))})

    epoch_loss = float(running_loss) / dset_sizes['train']
    epoch_acc = float(running_corrects) / float(dset_sizes['train'])
    wandb.log({"train_accuracy": epoch_acc})
    wandb.log({"train_loss": epoch_loss})
    return model



In [80]:
def test_model(model, best_acc, best_model, dset_loaders, dset_sizes, configs):
    print('Starting testing epoch...')
    model.eval()

    running_corrects = 0
    iters = 0   
    for data in tqdm(dset_loaders['test']):
        inputs, labels = data
        if configs.use_gpu:
            inputs = inputs.float().cuda()
            labels = labels.long().cuda()
        else:
            print('WARNING: NOT USING GPU!')
            inputs = inputs.float()
            labels = labels.long()

        
        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)
        
        iters += 1
        running_corrects += torch.sum(preds == labels.data)
        wandb.log({"train_running_corrects": running_corrects/float(iters*len(labels.data))})


    epoch_acc = float(running_corrects) / float(dset_sizes['test'])

    wandb.log({"test_accuracy": epoch_acc})
    
    if epoch_acc > best_acc:
        best_acc = epoch_acc
        best_model = copy.deepcopy(model)
    wandb.log({"best_accuracy": best_acc})
    
    return best_acc, best_model
    

In [81]:
def model_pipeline(model, criterion, optimizer, dset_loaders, dset_sizes, hyperparameters):
    with wandb.init(project="HARVAR_BAI", config=hyperparameters):
        if hyperparameters['run_name']:
            wandb.run.name = hyperparameters['run_name']
        config = wandb.config
        best_model = model
        best_acc = 0.0
        
        print(config)
        
        print(config.num_epochs)
        for epoch_num in range(config.num_epochs):
            wandb.log({"Current Epoch": epoch_num})
            model = train_model(model, criterion, optimizer, dset_loaders, dset_sizes, config)
            best_acc, best_model = test_model(model, best_acc, best_model, dset_loaders, dset_sizes, config)
    
    return best_model



In [82]:
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.Adam(model.parameters(), lr = wandb_config['base_lr'])

if wandb_config['use_gpu']:
    criterion.cuda()
    model.cuda()

In [83]:
if not os.path.isdir("%s/saved_models/"%work_dir):
    os.mkdir("%s/saved_models/"%work_dir)

In [78]:
###########################################################
best_final_model = model_pipeline(model, criterion, optimizer_ft, data_loaders, data_sizes, wandb_config)

  0%|          | 10/6000 [00:00<01:04, 93.46it/s]

{'use_gpu': 1, 'base_lr': 0.01, 'num_epochs': 2, 'num_classes': 10, 'run_name': 'TEST_1', 'model_arch': 'CustomCNN', 'batch_size': 10}
2
Starting training epoch...


100%|██████████| 6000/6000 [00:44<00:00, 134.19it/s]
  3%|▎         | 32/1000 [00:00<00:03, 312.35it/s]

Starting testing epoch...


100%|██████████| 1000/1000 [00:03<00:00, 309.20it/s]
  0%|          | 13/6000 [00:00<00:49, 120.89it/s]

Starting training epoch...


100%|██████████| 6000/6000 [00:44<00:00, 133.62it/s]
  3%|▎         | 32/1000 [00:00<00:03, 317.39it/s]

Starting testing epoch...


100%|██████████| 1000/1000 [00:03<00:00, 305.09it/s]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_step,26009.0
Current Epoch,1.0
_timestamp,1611867175.0
_runtime,102.0
train_loss,0.18569
test_accuracy,0.5742
train_accuracy,0.56663
best_accuracy,0.6309
train_running_corrects,0.5742
train_running_loss,0.18569


0,1
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
Current Epoch,▁█
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
test_accuracy,█▁
train_loss,▁█
train_accuracy,█▁
best_accuracy,▁▁
train_running_corrects,▆▅▄▄▄▄▄▅▆▅▅▆▆▆▆▅▅█▇█▅▅▅▃▂▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄
train_running_loss,▁▂▃▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▃▂▂▃▅▇▇███▇▇▇▇▇▆▆▆▆▆▅


In [79]:
save_path = '%s/saved_models/%s_final.pt'%(work_dir, wandb_config['run_name'])

with open(save_path,'wb') as F:
    torch.save(best_final_model,F)