In [1]:
%matplotlib notebook  

Check the current GPU usage. Please try to be nice!

In [2]:
!nvidia-smi

Fri Jun 26 19:30:22 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN V             Off  | 00000000:03:00.0 Off |                  N/A |
| 28%   31C    P8    23W / 250W |     12MiB / 12066MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:83:00.0 Off |                    0 |
| N/A   31C    P0    27W / 250W |     10MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN V             Off  | 00000000:84:00.0 Off |                  N/

> **WARNING**: The card numbers here are *not* the same as in CUDA. You have been warned.

## Imports

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import time
import torch
import pandas as pd
import mlflow
import hiddenlayer as HL

# Python 3 standard library
from pathlib import Path

from torchsummary import summary

## Get the helper functions

Add the directory with the model
definitions to the path so we can import from it:

> When you type `import X`,
Python searches `sys.path` for a python
file named `X.py` to import. So we need to add the model directory to the path.

In [4]:
# From model/collectdata.py
from model.collectdata_mdsA import collect_data

# From model/loss.py
##from loss import Loss
from model.alt_loss_A import Loss

# From model/training.py
from model.training import trainNet, select_gpu

# From model/models.py
##  will start with model from TwoFeatures_CNN6Layer_A in the first instance
##  see relevant cell below

## From model/utilities.py
from model.utilities import count_parameters
## from model.utilities import load_first_layer, load_full_state

from model.model_26June2020_A import UNet as Model

from model.training import trainNet, select_gpu, Results
from model.plots import dual_train_plots, replace_in_ax

from mlflow import pytorch

### Set up local parameters

In [5]:
class Params:
    """Plain container for the hyper-parameters used throughout this notebook.

    Parameters
    ----------
    batch_size : number of events per batch handed to the data loaders
    epochs     : number of epochs to train for
    lr         : learning rate handed to the optimizer
    epoch_start: epoch number to start counting from (defaults to 0)

    The attributes are created in the order epoch_start, batch_size, epochs,
    lr, which is the order `vars(args)` reports them in.
    """

    def __init__(self, batch_size, epochs, lr, epoch_start=0):
        # A single tuple assignment binds its targets left to right, so the
        # attribute creation order is exactly the one listed in the docstring.
        self.epoch_start, self.batch_size, self.epochs, self.lr = (
            epoch_start, batch_size, epochs, lr)


# Settings for this run: batch size 64, 200 epochs, learning rate 5e-5.
args = Params(batch_size=64, epochs=200, lr=5e-5)

Set up Torch device configuration. All tensors and model parameters need to know where to be put.
This takes a BUS ID number: The BUS ID is the same as the listing at the top of this script.

In [6]:
# Ask the project helper (model.training.select_gpu) for device index 0.
# NOTE(review): the recorded output below reports a single available GPU even
# though nvidia-smi listed several cards -- presumably GPU visibility is
# restricted for this kernel; TODO confirm.
device = select_gpu(0)

1 available GPUs (initially using device 0):
  0 TITAN V


## Loading data

Load the dataset, split into parts, then move to device (see `collectdata.py` in the `../model` directory)

In [7]:
## Training dataset. Any number of files may be listed here.
## NOTE(review): an earlier version of this comment described a DEMO that used
## only one 80K training set (the model starts from well-trained weights, and a
## smaller training set reduces both the load time and the time per epoch);
## three files are listed now.
train_files = (
    '/share/lazy/sokoloff/ML-data_AA/Aug14_80K_train.h5',
    '/share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5',
    '/share/lazy/sokoloff/ML-data_AA/Oct03_40K_train.h5',
##  'dataAA/Oct03_80K2_train.h5',
)

## Options common to the training and validation loaders.
## Set load_XandXsq=True to use both KDE and KDE^2 as input features.
loader_opts = dict(
    batch_size=args.batch_size,
    masking=True,
    load_XandXsq=False,
    load_xy=False,
)

## device=device is intentionally NOT passed for the training set.
## Original author's note: with a larger dataset (240K events with the datasets
## above, and 11 GB of GPU memory) the dataset will overflow the GPU memory;
## device=device will allow the data to move back and forth between the CPU and
## GPU memory. While this allows use of a larger dataset, it slows down
## performance by about 10%, so it is left out when not needed.
train_loader = collect_data(*train_files, shuffle=True, **loader_opts)

## Validation dataset. `slice` is passed to reduce its size.
## (dataAA -> /share/lazy/sokoloff/ML-data_AA/ ; an alternative validation file
##  used previously was 'dataAA/HLT1CPU_1kevts_val.h5'.)
val_loader = collect_data(
    '/share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5',
    slice=slice(256 * 39),
    device=device,
    shuffle=False,
    **loader_opts,
)

Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/Aug14_80K_train.h5 in 13.82 s
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5 in 3.384 s


KeyboardInterrupt: 

# Preparing the model

Prepare a model, use multiple GPUs if they are VISIBLE, and move the model to the device.

In [None]:
# `Model` is the UNet class imported from model.model_26June2020_A.
model = Model()
# Earlier setting kept for reference (tagged "mds 200121" by its author):
##  mds 200121 loss = Loss(epsilon=1e-5,coefficient=1.0)
# `Loss` comes from model.alt_loss_A; the coefficient is 2.5 here versus 1.0 in
# the earlier setting above.
loss = Loss(epsilon=1e-5,coefficient=2.5)

In [None]:
# Query the CUDA device count once and reuse it for the message and the check.
n_visible_gpus = torch.cuda.device_count()
print("Let's use", n_visible_gpus, "GPUs!")

# Only wrap the model in DataParallel when more than one GPU is reported.
if n_visible_gpus > 1:
    model = torch.nn.DataParallel(model)

Let's move the model's weight matrices to the GPU:

In [None]:
# Rebind `model` to the result of .to(device); `device` is the one returned by
# select_gpu earlier in the notebook.
model = model.to(device)

## Create Experiment

The first line sets the location to save the experiment. This will usually appear as a "0", "1", and so on (depending on how many experiments you have already created in the directory) in the specified folder. The next line sets the experiment name, which will be visible when launching MLFlow through localhost. This will be a piece of metadata and does not create a directory or file.

In [None]:
# Where MLflow stores its runs (a `file:` URI) and the experiment they are
# grouped under.
TRACKING_URI = 'file:/share/lazy/pv-finder_model_repo'
EXPERIMENT_NAME = 'Modified UNet'

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

## Train 

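The first `for` loop logs the run's parameters (the fields of `args`) to the experiment so that they can be viewed in MLFlow.

The body of the second `for` loop runs once per epoch. Each result is a named tuple of values (loss per epoch for training and validation, and the time for each); these values are logged as metrics in the MLflow experiment set in the previous cell. After training, the model and optimizer state are saved and attached to the run as an artifact so that they may be accessed in MLFlow.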

In [None]:
# Build a fresh model and optimizer for this run.
# NOTE(review): this replaces the `model` prepared in the earlier cells, so the
# torch.nn.DataParallel wrapper applied there (when more than one GPU is
# visible) is not used for training -- confirm that this is intended.
model = Model().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

we_are_training_from_scratch = True

if not we_are_training_from_scratch:
    # `load_full_state` is not imported by the notebook's import cell (its
    # import is commented out there), so bring it into scope here; without
    # this the branch fails with a NameError.
    from model.utilities import load_full_state

    # If we are not training from scratch, this should be the path to the
    # "run_stats.pyt" file in the artifacts directory of whatever run you are
    # using as a baseline. You can find the path in the MLFlow UI.
    # It should end in /artifacts/run_stats.pyt
    PATH = '/share/lazy/pv-finder_model_repo/0/48ce52a07acd43ed8cf8316aff07aad9/artifacts/run_stats.pyt'

    # Pass the checkpoint path string (PATH), not the pathlib `Path` class.
    # NOTE(review): the returned value is not fed into trainNet below, which
    # still starts at args.epoch_start -- confirm whether resuming a run should
    # update args.epoch_start.
    epoch_start = load_full_state(model, optimizer, PATH)

run_name = 'Unmodified, More Data'

with mlflow.start_run(run_name=run_name) as run:

    # Record every hyper-parameter held by `args`, plus the model's parameter count.
    for key, value in vars(args).items():
        mlflow.log_param(key, value)

    mlflow.log_param('Parameters', count_parameters(model))

    # `result` keeps the metrics of the most recent epoch as a dict; it stays
    # None if trainNet never yields.
    result = None
    for epoch_result in trainNet(model, optimizer, loss,
                                 train_loader, val_loader,
                                 args.epochs+args.epoch_start, epoch_start=args.epoch_start,
                                 notebook=True):

        # Log metrics
        result = epoch_result._asdict()
        mlflow.log_metric('Efficiency', result['eff_val'].eff_rate, result['epoch'])
        mlflow.log_metric('False Positive Rate',  result['eff_val'].fp_rate, result['epoch'])
        mlflow.log_metric('Validation Loss', result['val'], result['epoch'])
        mlflow.log_metric('Training Loss',  float(result['cost']), int(result['epoch']))

    # Everything below needs the last epoch's metrics; fail with a clear
    # message instead of a NameError when no epoch ran.
    if result is None:
        raise RuntimeError('trainNet produced no epochs; nothing to tag or save '
                           '(check args.epochs and args.epoch_start).')

    # Log tags
    mlflow.set_tag('Optimizer', 'Adam')
    mlflow.set_tag('Activation', 'ReLU')
    # A run is tagged as stalled when the final validation efficiency is <= 0.05.
    stalled = result['eff_val'].eff_rate <= 0.05
    mlflow.set_tag('Stalled', 'True' if stalled else 'False')

    # Save model AND optimizer state_dict AND epoch number.
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': args.epochs+result['epoch']
        }, 'run_stats.pyt')
    mlflow.log_artifact('run_stats.pyt')

    # Save a diagram of the architecture.
    # The original cell also created an
    # HL.transforms.Fold("Conv > BatchNorm > ReLU", "ConvBNReluDrop") object on
    # a line of its own (inside a discarded tuple); it was never passed to
    # build_graph and so had no effect. It is dropped here, and build_graph is
    # called exactly as before.
    HL.build_graph(model, torch.zeros([args.batch_size, 1, 4000]).to(device)).save('architecture', format='png')
    mlflow.log_artifact('architecture.png')

In [None]:
##quit()