In [1]:
%matplotlib notebook  

Check the current GPU usage. Please try to be nice!

In [2]:
!nvidia-smi

Wed Jun 24 09:39:00 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN V             Off  | 00000000:03:00.0 Off |                  N/A |
| 28%   31C    P8    23W / 250W |   1697MiB / 12066MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:83:00.0 Off |                    0 |
| N/A   31C    P0    27W / 250W |     10MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN V             Off  | 00000000:84:00.0 Off |                  N/A |
| 29%   

> **WARNING**: The card numbers here are *not* the same as in CUDA. You have been warned.

## Imports

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import time
import torch
import pandas as pd
import mlflow

# Python 3 standard library
from pathlib import Path

from torchsummary import summary

## Get the helper functions

Add the directory with the model
definitions to the path so we can import from it:

> When you type `import X`,
Python searches `sys.path` for a python
file named `X.py` to import. So we need to add the model directory to the path.

In [4]:
# From model/collectdata.py
from model.collectdata_mdsA import collect_data

# From model/loss.py
##from loss import Loss
from model.alt_loss_A import Loss

# From model/training.py
from model.training import trainNet, select_gpu

# From model/models.py
##  will start with model from TwoFeatures_CNN6Layer_A in the first instance
##  see relevant cell below


from model.models_mds_01June20 import SimpleCNN5Layer_Ca as Model

from model.training import trainNet, select_gpu, Results
from model.plots import dual_train_plots, replace_in_ax

from mlflow import pytorch

KeyboardInterrupt: 

# U-Net Architecture

In [None]:
# my model 
from torch import nn
class Conv(nn.Module):
    """Down-sampling block: Conv1d => BatchNorm1d => MaxPool1d(2) => ReLU => Dropout(0.4).

    Args:
        in_channels: number of channels of the incoming tensor.
        out_channels: number of channels produced by the convolution.
        kernel_size: width of the 1D convolution kernel (stride is fixed to 1).
        padding: zero padding applied on both sides by the convolution.

    With the default kernel_size/padding pair the convolution keeps the sequence
    length, and the max-pool then halves it.
    """

    def __init__(self, in_channels, out_channels, kernel_size=7, padding=3):
        super().__init__()

        # Layer order matters: it fixes the state_dict keys ("conv.0", "conv.1", ...).
        stages = [
            nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, padding=padding),
            nn.BatchNorm1d(out_channels),
            nn.MaxPool1d(2),
            nn.ReLU(),
            nn.Dropout(0.4),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        pooled = self.conv(x)
        return pooled

class Up(nn.Module):
    """Up-sampling block: ConvTranspose1d(k=2, s=2) => Conv1d(k=7) => BatchNorm1d => ReLU => Dropout(0.4).

    Args:
        in_channels: number of channels of the incoming tensor.
        out_channels: number of channels after the transposed convolution
            (the following Conv1d keeps that channel count).

    The transposed convolution doubles the sequence length; the Conv1d that
    follows (kernel 7, padding 3, stride 1) leaves the length unchanged.
    No skip connection is applied: the block only sees its single input.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()

        # Layer order matters: it fixes the state_dict keys ("up.0", "up.1", ...).
        stages = [
            nn.ConvTranspose1d(in_channels, out_channels, kernel_size=2, stride=2),
            nn.Conv1d(out_channels, out_channels, kernel_size=7, stride=1, padding=3),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
            nn.Dropout(0.4),
        ]
        self.up = nn.Sequential(*stages)

    def forward(self, x1):
        """Up-sample ``x1`` and return the result (no concatenation with encoder features)."""
        upsampled = self.up(x1)
        return upsampled

class UNet(nn.Module):
    """Encoder/decoder 1D network built from ``Conv`` (down) and ``Up`` blocks.

    The encoder applies five ``Conv`` blocks (``inc``, ``down1`` .. ``down4``),
    each of which halves the sequence length; the decoder applies five ``Up``
    blocks, each of which doubles it. A final Conv1d maps to one channel and a
    Softplus keeps the output non-negative. There are no skip connections.

    Input:  tensor of shape (batch, 1, length) -- ``inc`` expects one channel.
    Output: tensor of shape (batch, length) when ``length`` is divisible by 32
            (e.g. (batch, 4000) for length 4000).
    """

    def __init__(self):
        super().__init__()
        self.inc = Conv(1, 25, kernel_size = 25, padding=12)
        self.down1 = Conv(25, 64)
        self.down2 = Conv(64, 64)
        self.down3 = Conv(64, 64)
        self.down4 = Conv(64, 64)
        # down5 is not used in forward(); it is kept so that the parameter set
        # (and therefore saved state_dicts) stays compatible with earlier runs.
        self.down5 = Conv(64, 64)
        self.up1 = Up(64, 64)
        self.up2 = Up(64, 64)
        self.up3 = Up(64, 64)
        self.up4 = Up(64, 64)
        self.up5 = Up(64, 64)
        self.outc = nn.Conv1d(64, 1, 3, padding=1)

    def forward(self, x):
        """Return the Softplus-activated prediction, flattened to (batch, length)."""
        # Encoder: each stage halves the sequence length.
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)

        # Decoder: each stage doubles the sequence length.
        x = self.up1(x5)
        x = self.up2(x)
        x = self.up3(x)
        x = self.up4(x)
        x = self.up5(x)
        logits_x0 = self.outc(x)

        activated = torch.nn.Softplus()(logits_x0)
        # Drop the single channel axis using the actual batch size. The previous
        # hard-coded reshape(64, 4000) raised for any batch that did not contain
        # exactly 64 events (e.g. batch_size=128, a short final batch, or
        # torchsummary's probe batch).
        return activated.reshape(activated.shape[0], -1)

### Set up local parameters

In [None]:
class Params:
    """Plain container for the training hyperparameters used in this notebook.

    Attributes are stored in the order batch_size, epochs, lr, which is also the
    order in which ``vars(args)`` iterates them.
    """

    def __init__(self, batch_size, epochs, lr):
        self.batch_size, self.epochs, self.lr = batch_size, epochs, lr


# Hyperparameters for this session: batch size, number of epochs, learning rate.
args = Params(batch_size=128, epochs=5, lr=0.0001)

Set up Torch device configuration. All tensors and model parameters need to know where to be put.
This takes a BUS ID number: The BUS ID is the same as the listing at the top of this script.

In [None]:
# Choose the device used for the rest of the notebook. `select_gpu` is imported from
# model/training.py; its result is passed to `.to(device)` and to `collect_data(device=...)` below.
# NOTE(review): the nvidia-smi cell warns that its card numbers differ from CUDA's --
# confirm that index 0 selects the intended card.
device = select_gpu(0)

## Loading data

Load the dataset, split into parts, then move to device (see `collectdata.py` in the `../model` directory)

In [None]:
## Training dataset. Every path in this tuple is passed to collect_data; list as many
## files as desired. Fewer files mean less time to load the data and to train an epoch.
train_files = (
    '/share/lazy/sokoloff/ML-data_AA/Aug14_80K_train.h5',
    '/share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5',
    ## 'dataAA/Oct03_80K2_train.h5',
)

## Keyword options common to the training and validation loaders.
## Set load_XandXsq=True to use both KDE and KDE^2 as input features.
shared_loader_options = dict(
    batch_size=args.batch_size,
    masking=True,
    load_XandXsq=False,
    load_xy=False,
)

## Author's note on device=device for the training loader: with a larger dataset
## (240K events on a GPU with 11 GB of memory) the data overflows GPU memory;
## passing device=device lets the data move back and forth between CPU and GPU
## memory at a cost of about 10% in performance, so it is left commented out here.
train_loader = collect_data(
    *train_files,
    shuffle=True,
    ## device=device,
    **shared_loader_options,
)

# Validation dataset. The slice argument reduces its size.
## dataAA -> /share/lazy/sokoloff/ML-data_AA/
## (alternative validation file used previously: 'dataAA/HLT1CPU_1kevts_val.h5')
val_loader = collect_data(
    '/share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5',
    slice=slice(256 * 39),
    device=device,
    shuffle=False,
    **shared_loader_options,
)

# Preparing the model

Prepare a model, use multiple GPUs if they are VISIBLE, and move the model to the device.

In [None]:
# Instantiate the imported model class (`Model` is the alias given to
# SimpleCNN5Layer_Ca in the import cell).
# NOTE(review): a later cell rebinds `model` to `UNet().to(device)`, so this instance
# is not the one that gets trained -- confirm whether it is still needed.
model = Model()
##  mds 200121 loss = Loss(epsilon=1e-5,coefficient=1.0)
# Loss object from model/alt_loss_A.py; it is handed to trainNet in the training cell.
loss = Loss(epsilon=1e-5,coefficient=2.5)

In [None]:
# Report how many CUDA devices torch can see and, when there is more than one,
# wrap the current `model` in torch.nn.DataParallel.
# NOTE(review): the next code cell rebinds `model` to `UNet().to(device)`, so this
# wrapper is discarded -- confirm whether multi-GPU training is actually intended.
print("Let's use", torch.cuda.device_count(), "GPUs!")
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

Let's create the U-Net and move its weight matrices to the GPU:

In [None]:
# Build a fresh UNet (defined in the "U-Net Architecture" cell) and move its parameters
# to `device`. This rebinds `model`, replacing whatever the previous cells assigned to it.
model = UNet().to(device)

## Create Experiment

The first line sets the location to save the experiment. This will usually appear as a "0", "1", and so on (depending on how many experiments you have already created in the directory) in the specified folder. The next line sets the experiment name, which will be visible when launching MLFlow through localhost. This will be a piece of metadata and does not create a directory or file.

In [None]:
# Point MLflow at the shared file-based tracking directory; runs started later in
# this notebook are recorded there.
mlflow.tracking.set_tracking_uri('file:/share/lazy/pv-finder_model_repo')
# Select the experiment (by name) under which the runs below are grouped.
mlflow.set_experiment('Weird U-Net')

In [None]:
# Used for looking at U-net architecture in terms of layer (type), output shape, and Param #
#summary(UNet().to('cuda:0'), (1, 4000))

## Train 

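The first `for` loop logs the hyperparameters stored in `args` (batch size, number of epochs, learning rate) to the MLflow run so that they can be inspected in the MLflow UI.

The body of the second loop (the one over `trainNet`) runs once per epoch. Results is a named tuple of values (loss per epoch for training and validation, time each). Log this data into the mlflow experiment set in the previous cell. After training, the model and optimizer state are saved and attached to the run as an artifact.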

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

we_are_training_from_scratch = True

if not we_are_training_from_scratch:
    # If we are not training from scratch, this path should be the path to the "run_stats" file
    # in the artifacts directory of whatever run you are using as a baseline.
    # You can find the path in the MLFlow UI. It should end in /artifacts/run_stats
    # NOTE(review): the value below is the repository root, not a checkpoint file --
    # replace it with the actual artifact path before enabling this branch.
    PATH = '/share/lazy/pv-finder_model_repo'

    # Load the model and optimizer state_dict, and the total number of epochs trained so far.
    # The optimizer state matters when training is split over several sessions with
    # momentum and/or learning rate decay: loading it carries that state over.
    checkpoint = torch.load(PATH)
    model.load_state_dict(checkpoint['model'])
    epoch_start = checkpoint['epoch']

    # Override the stored learning rate with the one requested for this session. No
    # learning rate scheduler is used here; the intent is to tune the learning rate by
    # hand after training has gotten past the stalling.
    for param_group in checkpoint['optimizer']['param_groups']:
        param_group['lr'] = args.lr
    optimizer.load_state_dict(checkpoint['optimizer'])

else:
    epoch_start = 0

# Total number of epochs completed once this session finishes. This is both the
# upper bound passed to trainNet and the value stored in the checkpoint, so that a
# later session resumes at the right epoch.
epoch_end = epoch_start + args.epochs

run_name = 'Run 1'

with mlflow.start_run(run_name = run_name) as run:

    # Record the optimizer once and every hyperparameter held in `args`.
    mlflow.set_tag('Optimizer', 'Adam')
    for key, value in vars(args).items():
        mlflow.log_param(key, value)

    # trainNet yields one named tuple per epoch.
    for result in trainNet(model, optimizer, loss,
                            train_loader, val_loader,
                            epoch_end, epoch_start=epoch_start,
                            notebook=True):

        result = result._asdict()
        step = int(result['epoch'])
        efficiency = result['eff_val']

        mlflow.log_metric('Efficiency', efficiency.eff_rate, step)
        mlflow.log_metric('False Positive Rate', efficiency.fp_rate, step)
        # NOTE(review): this logs 2 * result['cost'] under the name 'Validation Loss' --
        # confirm that 'cost' is the validation loss and that the factor 2 is intended.
        mlflow.log_metric('Validation Loss', (2. * float(result['cost'])), step)

        # An efficiency of exactly zero is taken as the sign of a stalled training.
        mlflow.set_tag('Stalled', 'True' if efficiency.eff_rate == 0 else 'False')

    # Save model AND optimizer state_dict AND epoch number.
    # The epoch stored is the total trained so far (epoch_start + args.epochs); the
    # previous expression args.epochs + result['epoch'] double-counted this session's
    # epochs and failed with a NameError when trainNet yielded nothing.
    torch.save({
        'model':model.state_dict(),
        'optimizer':optimizer.state_dict(),
        'epoch':epoch_end
        }, 'run_stats.pyt')
    mlflow.log_artifact('run_stats.pyt')

In [None]:
##quit()