In [None]:
# Interactive ipympl backend, so the training figure can be redrawn in place
%matplotlib widget

Check the current GPU usage. Please try to be nice!

In [None]:
# "!" runs the command in the system shell; the listing shows the current GPU usage
!nvidia-smi

> **WARNING**: The card numbers here are *not* the same as in CUDA. You have been warned.

## Imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import time
import torch

# Python 3 standard library
from pathlib import Path

### Set up local parameters

In [None]:
# Number of epochs to train for (handed to trainNet)
n_epochs = 80

# Run name: used for the output folder and as the prefix of every saved file
name = 'Aug_16_120000_2layer'

# Path of the output folder, named after the run (change if you want);
# the folder itself is created in a later cell
output = Path(name)

# This is the input file to read in
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere
datafile = Path('/data/schreihf/PvFinder/Aug_15_140000.npz')

# Size of batches
batch_size = 32

# How fast to learn (given to the Adam optimizer as `lr`)
learning_rate = 1e-3

Make the output directory if it does not exist:

In [None]:
# parents=True keeps this working if `output` is changed to a nested path;
# exist_ok=True makes re-running the cell harmless
output.mkdir(parents=True, exist_ok=True)

## Get the helper functions

Add the directory with the model
definitions to the path so we can import from it:

> When you type `import X`,
Python searches `sys.path` for a python
file named `X.py` to import. So we need to add the model directory to the path.

In [None]:
import sys

# Directory holding the model definitions, relative to the notebook's
# working directory
model_dir = '../model'

# Only add the entry once, so re-running this cell does not pile up
# duplicate entries on sys.path
if model_dir not in sys.path:
    sys.path.append(model_dir)

In [None]:
# From model/collectdata.py
from collectdata import DataCollector

# From model/loss.py
from loss import Loss

# From model/training.py
from training import trainNet, select_gpu

# From model/models.py
from models import SimpleCNN2Layer as Model

Set up Torch device configuration. All tensors and model parameters need to know where to be put.
This takes a BUS ID number: the BUS ID is the card number shown in the `nvidia-smi` listing at the top of this notebook.

In [None]:
# Index of the GPU to train on, handed to select_gpu
# NOTE(review): per the notebook text this is the bus ID from nvidia-smi,
# not the CUDA device number -- confirm against training.py
gpu_bus_id = 1
device = select_gpu(gpu_bus_id)

## Loading data

Load the dataset, split into parts, then move to device (see `collectdata.py` in the `../model` directory)

In [None]:
# Sizes requested for the training and validation loaders, named once so the
# collector and the two loaders cannot drift apart.
# presumably the 2nd/3rd DataCollector arguments are these same two sizes
# -- TODO confirm against collectdata.py
n_train = 120_000
n_val = 10_000

collector = DataCollector(datafile, n_train, n_val)

# Training batches are shuffled; validation batches keep their order
train_loader = collector.get_training(batch_size, n_train, device=device, shuffle=True)
val_loader = collector.get_validation(batch_size, n_val, device=device, shuffle=False)

## Preparing the model

Prepare a model, use multiple GPUs if they are VISIBLE, and move the model to the device.

In [None]:
# Network to train (SimpleCNN2Layer, imported as Model)
model = Model()

# Cost function handed to the training loop
loss = Loss()

# Adam over every parameter of the model, using the configured learning rate
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=learning_rate,
)

In [None]:
# Number of CUDA devices PyTorch can see
n_gpus = torch.cuda.device_count()
print("Let's use", n_gpus, "GPUs!")

# With more than one visible GPU, wrap the model in DataParallel
if n_gpus > 1:
    model = torch.nn.DataParallel(model)

Let's move the model's weight matrices to the GPU:

In [None]:
# Place the model on the selected device and keep the returned module as `model`
model = model.to(device)

## Train

The body of this loop runs once per epoch. `results` is a named tuple of values (loss per epoch for training and validation, and the time each took). Start by setting up a plot:

In [None]:
# Empty train/validation curves; the training loop below fills them in
fig, ax = plt.subplots()
lines_train, = ax.plot([], [], 'o-', label='Train')
lines_val, = ax.plot([], [], 'o-', label='Validation')
ax.set_xlabel('Epochs')
ax.set_ylabel('Cost')

# Set the log scale on this axes explicitly instead of on pyplot's
# "current axes", so it cannot land on a different figure
ax.set_yscale('log')
ax.legend();

In [None]:
# Run the epochs, using progress instead of range(n_epochs)
# Train for n_epochs; the loop body runs once for each results tuple that
# trainNet hands back
for results in trainNet(model, optimizer, loss,
                        train_loader, val_loader,
                        n_epochs,
                        notebook=True):

    train_cost = results.cost
    val_cost = results.val

    # Update the plot above
    lines_train.set_data(np.arange(len(train_cost)), train_cost)
    lines_val.set_data(np.arange(len(val_cost)), val_cost)

    # Leave the first training cost out of the upper limit (it can be really
    # large), unless it is the only value available so far
    if len(train_cost) < 2:
        upper_candidates = train_cost
    else:
        upper_candidates = train_cost[1:]
    max_cost = max(max(upper_candidates), max(val_cost))
    min_cost = min(min(train_cost), min(val_cost))

    # The plot limits need updating too: 10% headroom on each side in y,
    # half a unit of padding around the plotted points in x
    ax.set_ylim(min_cost*.9, max_cost*1.1)
    ax.set_xlim(-.5, len(train_cost) - .5)

    # Redraw the figure
    fig.canvas.draw()

    # Save each model state dictionary
    # NOTE(review): if the model was wrapped in DataParallel, the saved keys
    # carry a "module." prefix -- confirm the loading code expects that
    checkpoint_path = output / f'{name}_{results.epoch}.pyt'
    torch.save(model.state_dict(), checkpoint_path)

## Results

Let's save some results. (If you have not changed the code above, the model has already been saved after every epoch.)

Go ahead and save the final model (even though it was also saved above):

In [None]:
# Final weights go next to the per-epoch checkpoints in the output folder
final_model_path = output / f'{name}_final.pyt'
torch.save(model.state_dict(), final_model_path)

Save the output results:

In [None]:
# Store every field of the last results tuple under its field name
stats_path = output / f'{name}_stats.npz'
np.savez(stats_path, **results._asdict())

Save the plot above:

In [None]:
# Write the cost figure into the output folder as a PNG
plot_path = output / f'{name}_stats_a.png'
fig.savefig(str(plot_path))