In [1]:
##%matplotlib widget
## with %matplotlib notebook: seems to require ipympl as part of environment, either
## part of the conda environment or "pip install ipympl"
## otherwise, does not show ANY plots in notebook, plt.savefig() works
%matplotlib notebook  
##%matplotlib inline    ## --plt.savefig()  works, but re-sizing does NOT

This notebook is a short demo to illustrate execution.   For odd historical reasons, it uses "toy Monte Carlo" (simulated data)for "training" and "full LHCB MC" for validation.

The network architecture is a "simple" model that uses 1 input channel (the KDE [kernel density estimator] but from the track parameters) feeding 5 convolutional layers followed by a fully connected layer.

In today's version, the network will start with weights from a previously trained version.
 

Check the current GPU usage. Please try to be nice!

In [2]:
!nvidia-smi

Tue Jul  7 15:29:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN V             Off  | 00000000:03:00.0 Off |                  N/A |
| 28%   32C    P8    23W / 250W |   6387MiB / 12066MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:83:00.0 Off |                    0 |
| N/A   32C    P0    27W / 250W |     10MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN V             Off  | 00000000:84:00.0 Off |                  N/

> **WARNING**: The card numbers here are *not* the same as in CUDA. You have been warned.

## Imports

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import time
import torch
import pandas as pd
import mlflow

# Python 3 standard library
from pathlib import Path

from torchsummary import summary

'''
HELPER FUNCTIONS
'''
# From model/collectdata.py
from model.collectdata_mdsA import collect_data

# From model/loss.py
##from loss import Loss
from model.alt_loss_A import Loss

# From model/training.py
from model.training import trainNet, select_gpu, Results

# From model/models.py
##  will start with model from TwoFeatures_CNN6Layer_A in the first instance
##  see relevant cell below

from model.models_mds_07July2020 import All_CNN6Layer_A as ModelA
from model.models_mds_07July2020 import All_CNN6Layer_B as ModelB
from model.models_mds_07July2020 import All_CNN6Layer_C as ModelC
from model.models_mds_07July2020 import All_CNN6Layer_D as ModelD
from model.models_mds_07July2020 import All_CNN6Layer_E as ModelE

# From model/utilities.py
from model.utilities import load_full_state, count_parameters, Params

from model.plots import dual_train_plots, replace_in_ax

## adds image of model architecture
import hiddenlayer as HL

Set up Torch device configuration. All tensors and model parameters need to know where to be put.
This takes a BUS ID number: The BUS ID is the same as the listing at the top of this script.

In [4]:
device = select_gpu(2)

1 available GPUs (initially using device 0):
  0 TITAN V


## Loading data

Load the dataset, split into parts, then move to device (see `collectdata.py` in the `../model` directory)

In [5]:
## newer vernacular
## Training dataset. You can put as many files here as desired.

## in this DEMO example we use only one 80K training set -- the model starts with well-trained weights,
## and using a smaller training set reduces both the time to load the data and the time to train an epoch
##  set the option load_XandXsq = True to use both DKE and KDE^2 as input features
train_loader = collect_data('/share/lazy/sokoloff/ML-data_AA/Aug14_80K_train.h5',
                            '/share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5',
                            #'/share/lazy/sokoloff/ML-data_AA/Oct03_80K2_train.h5',
                             batch_size=128,
## if we are using a larger dataset (240K events, with the datasets above, and 11 GB  of GPU memory),
## not the dataset will overflow the GPU memory; device=device will allow the data to move back
## and forth between the CPU and GPU memory. While this allows use of a larger dataset, it slows
## down performance by about 10%.  So comment out when not needed.
                            device=device,
                            masking=True, shuffle=True,
                            load_XandXsq=False,
                            load_xy=False)

# Validation dataset. You can slice to reduce the size.
## dataAA -> /share/lazy/sokoloff/ML-data_AA/
val_loader = collect_data('/share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5',
## mds val_loader = collect_data('dataAA/HLT1CPU_1kevts_val.h5',

                          batch_size=128,
                          slice=slice(256 * 39),
                          device=device,
                          masking=True, shuffle=False,
                          load_XandXsq=False,
                          load_xy=False)

Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/Aug14_80K_train.h5 in 12.75 s
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5 in 11.21 s
Constructing 160000 event dataset took 4.69 s
Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5 in 2.852 s
Constructing 9984 event dataset took 0.08995 s


### Set up local parameters

In [6]:
# Name is the output file name
##  190810  mds
folder = '02July2020_AllCNN_01'
name = '02July2020_AllCNN_01'

# Make an output folder named "name" (change if you want)

## Special instructions for those working on goofy at UC
## Please be very careful to make sure that your folder
## does not live in a subdirectory of your home directory
## this disk has very little capacity. Instead, use 
## a subdirectory in /share/lazy with a symbolic link to
## it in this (the notebooks) subdirectory
folder = '/share/lazy/pv-finder_model_repo/ML/' + folder
output = Path(folder)

Make the output directory if it does not exist:

In [7]:
output.mkdir(exist_ok=True)

# Preparing the model

Prepare a model, use multiple GPUs if they are VISIBLE, and move the model to the device.

In [8]:
model = ModelA()

##summary(model, input_size=(4, 4000))
##print(model.parameters)

mlflow.tracking.set_tracking_uri('file:/share/lazy/pv-finder_model_repo')
mlflow.set_experiment('ALLCNN')

## add the following code to allow the user to freeze the some of the weights corresponding 
## to those taken from an earlier model trained with the original target histograms
## presumably -- this leaves either the perturbative filter "fixed" and lets the 
## learning focus on the non-perturbative features, so get started faster, or vice versa
# model
ct = 0
for child in model.children():
    print('ct, child = ',ct, "  ", child)
    if ct < 0:
        print("     About to set param.requires_grad=False for ct = ", ct, "params")
        for param in child.parameters():
            param.requires_grad = False 
    ct += 1

Traceback (most recent call last):
  File "/home/michael24peters/.local/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 197, in list_experiments
    experiment = self._get_experiment(exp_id, view_type)
  File "/home/michael24peters/.local/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 260, in _get_experiment
    meta = read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/home/michael24peters/.local/lib/python3.7/site-packages/mlflow/utils/file_utils.py", line 167, in read_yaml
    raise MissingConfigException("Yaml file '%s' does not exist." % file_path)
mlflow.exceptions.MissingConfigException: Yaml file '/share/lazy/pv-finder_model_repo/ML/meta.yaml' does not exist.


ct, child =  0    Conv1d(1, 20, kernel_size=(25,), stride=(1,), padding=(12,))
ct, child =  1    Conv1d(20, 10, kernel_size=(15,), stride=(1,), padding=(7,))
ct, child =  2    Conv1d(10, 10, kernel_size=(15,), stride=(1,), padding=(7,))
ct, child =  3    Conv1d(10, 10, kernel_size=(15,), stride=(1,), padding=(7,))
ct, child =  4    Conv1d(10, 1, kernel_size=(5,), stride=(1,), padding=(2,))
ct, child =  5    Conv1d(1, 1, kernel_size=(91,), stride=(1,), padding=(45,))
ct, child =  6    Dropout(p=0.15, inplace=False)
ct, child =  7    Dropout(p=0.15, inplace=False)
ct, child =  8    Dropout(p=0.15, inplace=False)
ct, child =  9    Dropout(p=0.15, inplace=False)
ct, child =  10    Dropout(p=0.15, inplace=False)
ct, child =  0    Conv1d(1, 16, kernel_size=(25,), stride=(1,), padding=(12,))
ct, child =  1    Conv1d(16, 9, kernel_size=(15,), stride=(1,), padding=(7,))
ct, child =  2    Conv1d(9, 9, kernel_size=(15,), stride=(1,), padding=(7,))
ct, child =  3    Conv1d(9, 9, kernel_size=(15,),

In [9]:
print("Let's use", torch.cuda.device_count(), "GPUs!")
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

Let's use 1 GPUs!


Let's move the model's weight matricies to the GPU:

In [10]:
print('output = ',output)

##  use the first five layers from a pre-existing model
##  see example at https://discuss.pytorch.org/t/how-to-load-part-of-pre-trained-model/1113

##   ML -> /share/lazy/sokoloff/ML
pretrained_dict = torch.load('/share/lazy/pv-finder_model_repo/ML/02July2020_From_SimpleCNN_toAllCNN_01/02July2020_From_SimpleCNN_toAllCNN_01_final.pyt')
model_dict = model.state_dict()
## mds 190725 for debugging
print("for model_dict")
index = 0
for k,v in model_dict.items():
    print("index, k =  ",index,"  ",k)
    index = index+1
    
print(" \n","  for pretrained_dict")
index = 0
for k,v in pretrained_dict.items():
    print("index, k =  ",index,"  ",k)
    index = index+1
## mds  

print("model_dict instantiated")
# 1. filter out unnecessary keys
pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}

print(" \n","  for 'reduced' pretrained_dict")
index = 0
for k,v in pretrained_dict.items():
    print("index, k =  ",index,"  ",k)
    index = index+1
## mds  
print("pretrained_dict iterated")
# 2. overwrite entries in the existing state dict
model_dict.update(pretrained_dict) 
print('model_dict updated')
##print('model_dict =    ', model_dict)
##
#   when starting from a model with a fully connected last layer rather than a convolutional layer
# 3. load the new state dict
#   need to use strict=False as the two models state model attributes do not agree exactly
#   see https://pytorch.org/docs/master/_modules/torch/nn/modules/module.html#Module.load_state_dict

model.load_state_dict(model_dict,strict=False)

## print('model_dict =    ', model_dict)

output =  /share/lazy/pv-finder_model_repo/ML/02July2020_AllCNN_01
for model_dict
index, k =   0    conv1.weight
index, k =   1    conv1.bias
index, k =   2    conv2.weight
index, k =   3    conv2.bias
index, k =   4    conv3.weight
index, k =   5    conv3.bias
index, k =   6    conv4.weight
index, k =   7    conv4.bias
index, k =   8    conv5.weight
index, k =   9    conv5.bias
index, k =   10    finalFilter.weight
index, k =   11    finalFilter.bias
 
   for pretrained_dict
index, k =   0    conv1.weight
index, k =   1    conv1.bias
index, k =   2    conv2.weight
index, k =   3    conv2.bias
index, k =   4    conv3.weight
index, k =   5    conv3.bias
index, k =   6    conv4.weight
index, k =   7    conv4.bias
index, k =   8    conv5.weight
index, k =   9    conv5.bias
index, k =   10    finalFilter.weight
index, k =   11    finalFilter.bias
model_dict instantiated
 
   for 'reduced' pretrained_dict
index, k =   0    conv1.weight
index, k =   1    conv1.bias
index, k =   2    conv2.we

RuntimeError: Error(s) in loading state_dict for All_CNN6Layer_B:
	size mismatch for conv1.weight: copying a param with shape torch.Size([20, 1, 25]) from checkpoint, the shape in current model is torch.Size([16, 1, 25]).
	size mismatch for conv1.bias: copying a param with shape torch.Size([20]) from checkpoint, the shape in current model is torch.Size([16]).
	size mismatch for conv2.weight: copying a param with shape torch.Size([10, 20, 15]) from checkpoint, the shape in current model is torch.Size([9, 16, 15]).
	size mismatch for conv2.bias: copying a param with shape torch.Size([10]) from checkpoint, the shape in current model is torch.Size([9]).
	size mismatch for conv3.weight: copying a param with shape torch.Size([10, 10, 15]) from checkpoint, the shape in current model is torch.Size([9, 9, 15]).
	size mismatch for conv3.bias: copying a param with shape torch.Size([10]) from checkpoint, the shape in current model is torch.Size([9]).
	size mismatch for conv4.weight: copying a param with shape torch.Size([10, 10, 15]) from checkpoint, the shape in current model is torch.Size([9, 9, 15]).
	size mismatch for conv4.bias: copying a param with shape torch.Size([10]) from checkpoint, the shape in current model is torch.Size([9]).
	size mismatch for conv5.weight: copying a param with shape torch.Size([1, 10, 5]) from checkpoint, the shape in current model is torch.Size([1, 9, 5]).

In [None]:
# params order - batch size, epochs, lr, epoch_start (which is usually set to 0)
# creating runs allows for multiple runs to be run sequentially (sep. by commas)
runs = [
    (model.to(device), Params(128, 5, 1e-3, 19))
]    

## Train 



The body of this loop runs once per epoch. Results is a named tuple of values (loss per epoch for training and validation, time each). Start by setting up a plot first:

In [None]:
# Loop through models in runs dictionary
for (model, args) in runs:
    
    ax, tax, lax, lines = dual_train_plots()
    fig = ax.figure
    plt.tight_layout()
    # This gets built up during the run - do not rerun this cell
    results = pd.DataFrame([], columns=Results._fields)
    
    print('for model: ', model)
    ##  mds 200121 loss = Loss(epsilon=1e-5,coefficient=1.0)
    loss = Loss(epsilon=1e-5,coefficient=2.5)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    
    run_name = 'ACNN'
    # Create an mlflow run
    with mlflow.start_run(run_name=run_name) as run:
        # Log parameters of the model
        for key, value in vars(args).items():
            print(key, value)
            mlflow.log_param(key, value)
        
        # Log parameter count in the model
        mlflow.log_param('Parameters', count_parameters(model))
        
        # Begin run
        for result in trainNet(model, optimizer, loss,
                                train_loader, val_loader,
                                args.epochs+args.epoch_start, epoch_start=args.epoch_start,
                                notebook=True, device=device):
    
            result = result._asdict()
            results = results.append(pd.Series(result), ignore_index=True)
            xs = results.index
    
            # Update the plot above
            lines['train'].set_data(results.index, results.cost)
            lines['val'].set_data(results.index, results.val)
    
            #filter first cost epoch (can be really large)
            max_cost = max(max(results.cost if len(results.cost)<2 else results.cost[1:]), max(results.val))
            min_cost = min(min(results.cost), min(results.val))
    
            # The plot limits need updating too
            ax.set_ylim(min_cost*.9, max_cost*1.1)  
            ax.set_xlim(-.5, len(results.cost) - .5)
    
            replace_in_ax(lax, lines['eff'], xs, results['eff_val'].apply(lambda x: x.eff_rate))
            replace_in_ax(tax, lines['fp'], xs, results['eff_val'].apply(lambda x: x.fp_rate))
    
            # Redraw the figure
            fig.canvas.draw()
            plt.tight_layout()
            fig.savefig('plot.png')
            
            
            ## MLFLOW ##
            # Log metrics
            mlflow.log_metric('Efficiency', result['eff_val'].eff_rate, result['epoch'])
            mlflow.log_metric('False Positive Rate',  result['eff_val'].fp_rate, result['epoch'])
            mlflow.log_metric('Validation Loss',  result['val'], result['epoch'])
            mlflow.log_metric('Training Loss',  result['cost'], result['epoch'])
            
            # Log tags
#            mlflow.set_tag('Optimizer', 'Adam')
#            mlflow.set_tag('Kernel size', 'Mixed')
#            mlflow.set_tag('Skip connections', '4')
#            mlflow.set_tag('Activation', 'Softplus')
#            mlflow.set_tag('Mid Activation', 'Relu')

            # Save model state dictionary, optimizer state dictionary, and epoch number
            torch.save({
                'model':model.state_dict(),
                'optimizer':optimizer.state_dict(),
                'epoch':args.epochs+result['epoch']
                }, 'run_stats.pyt')
            # Save the run stats into mlflow
            mlflow.log_artifact('run_stats.pyt')
            
            # Save a diagram of the architecture
            HL.transforms.Fold("Conv", "Conv"),
            HL.build_graph(model, torch.zeros([args.batch_size, 1, 4000]).to(device)).save('architecture', format='png')
            mlflow.log_artifact('architecture.png')
        
            # log the code for the model architecture
#            mlflow.log_artifact('architecture.txt')
        
            # save plot
            mlflow.log_artifact('plot.png')
            
            # Save each model state dictionary
            torch.save(model.state_dict(), (output / f'{name}_{args.epochs}.pyt'))
    
    # Go ahead and save the final model (even though it was also saved above):
    torch.save(model.state_dict(), output / f'{name}_final.pyt')
    # Save the output results:
    results.to_hdf(f'{name}_stats.hdf5', 'results')
    
    # Save the plot above:
#    plt.tight_layout()
#    plt.savefig(str(output / f'{name}_stats_a.png'))

In [None]:
##quit()