# In this demo we will set up a launch agent that

* retrains your model nightly

* retrains your model when a git commit is pushed

In [1]:
import typing
from datetime import datetime
import PIL
import os

import pprint
import torch
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
from torchvision import datasets, transforms

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [5]:
!python3 -m pip uninstall -y wandb
!python3 -m pip install "wandb[launch]" -qqq

Found existing installation: wandb 0.12.20
Uninstalling wandb-0.12.20:
  Successfully uninstalled wandb-0.12.20
[0m

In [6]:
import wandb
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
import json

<class 'dict'>
<class 'dict'>


In [7]:
def get_transform():
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307,), (0.3081,))])
    return transform


def build_dataset(batch_size=100, train=True):
    dataset = datasets.MNIST(".", train=train, download=False,
        transform=get_transform())
    if batch_size is None:
        batch_size = dataset.data.shape[0]
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    return loader


def build_network(fc_layer_size, dropout):
    network = nn.Sequential(  # fully-connected, single hidden layer
        nn.Flatten(),
        nn.Linear(784, fc_layer_size), nn.ReLU(),
        nn.Dropout(dropout),
        nn.Linear(fc_layer_size, 10),
        nn.LogSoftmax(dim=1))
    return network.to(DEVICE)
        

def build_optimizer(network, optimizer, learning_rate):
    optimizer = optim.Adam(network.parameters(),
        lr=learning_rate)
    return optimizer


def train_epoch(network, loader, optimizer):
    cumu_loss = 0
    for _, (data, target) in enumerate(loader):
        data, target = data.to(DEVICE), target.to(DEVICE)
        optimizer.zero_grad()
        loss = F.nll_loss(network(data), target)
        cumu_loss += loss.item()
        loss.backward()
        optimizer.step()
        wandb.log({"batch loss": loss.item()})
    return cumu_loss / len(loader)


def train(project='mnist_train', config=None):
    with wandb.init(project=project, config=config):
        config = wandb.config
        loader = build_dataset(config.batch_size)
        network = build_network(config.fc_layer_size, config.dropout)
        optimizer = build_optimizer(network, config.optimizer, config.learning_rate)
        for epoch in range(config.epochs):
            avg_loss = train_epoch(network, loader, optimizer)
            wandb.log({"loss": avg_loss, "epoch": epoch})
    return network

In [10]:
entity = 'dpaiton'
project = 'launch-mnist'

# setup a git repo for the demo
# push requirements (from docker dotfiles), 
# push a training script (above) & dataset script
    # download mnist & upload as an artifact in a create_dataset run
    # in the train script get the artifact
# setup a cronjob shell script that runs the launch command nightly
    # use my docker container
    # retrain on mnist

In [7]:
# workaround to fetch MNIST data
if not os.path.exists('./MNIST'):
    !wget www.di.ens.fr/~lelarge/MNIST.tar.gz
    !tar -zxvf MNIST.tar.gz

--2022-06-21 19:54:02--  http://www.di.ens.fr/~lelarge/MNIST.tar.gz
Resolving www.di.ens.fr (www.di.ens.fr)... 129.199.99.14
Connecting to www.di.ens.fr (www.di.ens.fr)|129.199.99.14|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.di.ens.fr/~lelarge/MNIST.tar.gz [following]
--2022-06-21 19:54:03--  https://www.di.ens.fr/~lelarge/MNIST.tar.gz
Connecting to www.di.ens.fr (www.di.ens.fr)|129.199.99.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/x-gzip]
Saving to: ‘MNIST.tar.gz’

MNIST.tar.gz            [             <=>    ]  33.20M  11.3MB/s    in 2.9s    

2022-06-21 19:54:07 (11.3 MB/s) - ‘MNIST.tar.gz’ saved [34813078]

MNIST/
MNIST/raw/
MNIST/raw/train-labels-idx1-ubyte
MNIST/raw/t10k-labels-idx1-ubyte.gz
MNIST/raw/t10k-labels-idx1-ubyte
MNIST/raw/t10k-images-idx3-ubyte.gz
MNIST/raw/train-images-idx3-ubyte
MNIST/raw/train-labels-idx1-ubyte.gz
MNIST/raw/t10k-images-idx3-ubyte
MNIST/raw/tra

In [8]:
config = {
    'fc_layer_size': 256,
    'dropout': 0.5,
    'epochs': 50,
    'learning_rate': 0.005,
    'batch_size': 128,
    'optimizer': 'adam'
}

In [9]:
network = train(config=config)

[34m[1mwandb[0m: Currently logged in as: [33mdpaiton[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch loss,▄▅▃▄▃▃▃▄▄▂▃▁▃▅▃▃▂▃▃▄▁▃▃▃▂▃▃▂▂▁▃▃▄▂▅█▁▁▃▂
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
loss,█▅▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▁▂▁▁▂▂▁▁▁▁▁▁▁▁

0,1
batch loss,0.14563
epoch,49.0
loss,0.1713


In [12]:
test_loader = build_dataset(batch_size=None, train=False)
test_images, test_labels = next(iter(test_loader))
inferred_labels = network(test_images.to(DEVICE))

_, predicted = torch.max(inferred_labels.data, 1)
correct = (predicted == test_labels.to(DEVICE)).sum().item()
accuracy = correct / test_labels.size(0)

print(f'Network accuracy is {100 * accuracy}%')

Network accuracy is 94.26%
