In [2]:
%matplotlib widget

Check the current GPU usage. Please try to be nice!

In [3]:
!nvidia-smi

Wed Dec 19 15:42:48 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44                 Driver Version: 396.44                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN V             On   | 00000000:03:00.0 Off |                  N/A |
| 28%   32C    P8    24W / 250W |      0MiB / 12066MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  On   | 00000000:83:00.0 Off |                    0 |
| N/A   33C    P0    30W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN V             On   | 00000000:84:00.0 Off |                  N/

> **WARNING**: The card numbers here are *not* the same as in CUDA. You have been warned. However, these numbers are correct if you use the `select_gpu` helper function.

## Imports

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import time
import torch
import pandas as pd

# Python 3 standard library
from pathlib import Path

## Get the helper functions

In [5]:
from model.collectdata import collect_data
from model.loss import Loss
from model.training import trainNet, select_gpu, Results
from model.plots import dual_train_plots, replace_in_ax

from model.models import SimpleCNN2Layer as Model

### Set up local parameters

In [6]:
# Per-epoch training history, one column per field of the `Results` named tuple.
# The training loop below adds one row per epoch and passes `len(results)` as
# `epoch_start`, so re-running this cell throws away the accumulated history -
# do not rerun it in the middle of a run.
results = pd.DataFrame([], columns=Results._fields)

In [7]:
# --- Run configuration ---------------------------------------------------
n_epochs = 10          # number of training epochs
batch_size = 128       # events per batch
learning_rate = 4e-6   # step size handed to the optimizer

# Tag used for the output folder and as the prefix of every file written to it
name = f'Dec_6_mask_120000_2layer_{learning_rate}'

# Output folder, named after the run (change if you want)
output = Path(name)


Make the output directory if it does not exist:

In [8]:
output.mkdir(exist_ok=True)

Set up Torch device configuration. All tensors and model parameters need to know where to be put.
`select_gpu` takes a bus ID number; it matches the card number shown in the `nvidia-smi` listing at the top of this notebook.

In [9]:
device = select_gpu(0)

1 available GPUs (initially using device 0):
  0 TITAN V


## Loading data

Load the dataset, split it into parts, and move it to the device if `device=device` is present. If that line is commented out, the data is instead loaded as the calculation progresses, which allows larger datasets and plays nicer with memory, but is very slightly slower. See `collectdata.py` in the `../model` directory for the source. The datasets are listed in the model directory README and repeated here:

|        From       |          To         |         Events          |
|-------------------|---------------------|-------------------------|
| `kernel_20181003` | `Oct03_20K_val`     | 1,2                     |
| `kernel_20181003` | `Oct03_20K_test`    | 3,4                     |
| `kernel_20181003` | `Oct03_40K_train`   | 5,6,7,8                 |
| `kernel_20181003` | `Oct03_80K_train`   | 9,10,11,12,13,14,15,16  |
| `kernel_20181003` | `Oct03_80K2_train`  | 17,18,19,20,21,22,23,24 |
| `kernel_20180814` | `Aug14_80K_train`   | 1,2,3,4,5,6,7,8         |

In [10]:
# Keyword arguments common to both loaders.
loader_options = dict(batch_size=batch_size, device=device, masking=True)

# Training dataset (shuffled). You can put as many files here as desired.
train_loader = collect_data(
    'data/Oct03_80K_train.h5',
    'data/Oct03_80K2_train.h5',
    shuffle=True,
    **loader_options
)

# Validation dataset (not shuffled). The slice reduces its size.
val_loader = collect_data(
    'data/Oct03_20K_val.h5',
    slice=slice(256 * 39),
    shuffle=False,
    **loader_options
)

Loading data...
Loaded data/Oct03_80K_train.h5 in 11.97 s
Loaded data/Oct03_80K2_train.h5 in 12.58 s
Constructing 160000 event dataset took 18.44 s
Loading data...
Loaded data/Oct03_20K_val.h5 in 2.488 s
Constructing 9984 event dataset took 0.2516 s


In [11]:
print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x7f8d381769e8>


# Preparing the model

Prepare a model, use multiple GPUs if they are VISIBLE, and move the model to the device.

In [12]:
# Fresh network instance (`Model` is the alias for SimpleCNN2Layer imported above).
model = Model()
# Cost function. NOTE(review): the meaning of `epsilon` is defined in
# model/loss.py, which is not visible from this notebook.
loss = Loss(epsilon=1e-5)
# Adam over all model parameters, using the learning rate chosen above.
# NOTE(review): the optimizer is built before the model is wrapped and moved to
# the device in the following cells; the torch.optim documentation advises
# moving the model first - confirm this ordering is intentional.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

This should support multi-gpu, but doesn't work very well.

In [13]:
# Wrap the model in DataParallel only when more than one GPU is visible.
gpu_count = torch.cuda.device_count()
print("Let's use", gpu_count, "GPUs!")
if gpu_count > 1:
    model = torch.nn.DataParallel(model)

Let's use 1 GPUs!


Let's move the model's weight matrices to the GPU:

In [14]:
model = model.to(device)

## Train

The body of this loop runs once per epoch. Each `result` is a `Results` named tuple holding that epoch's values (training and validation loss, and the time the epoch took, among others). Start by setting up a plot first:

In [15]:
# Create the training plots that the loop in the next cell keeps updating.
# `ax` carries the cost curves, `lax` and `tax` receive the efficiency and
# false-positive series respectively, and `lines` maps names
# ('train', 'val', 'eff', 'fp') to the plotted artists.
ax, tax, lax, lines = dual_train_plots()
fig = ax.figure  # kept so the training loop can redraw the canvas
plt.tight_layout()

In [16]:
# One iteration per epoch: trainNet yields a `Results` named tuple after each
# epoch. `epoch_start=len(results)` lets a re-run of this cell continue the
# epoch numbering from the rows already stored.
for result in trainNet(model, optimizer, loss,
                       train_loader, val_loader,
                       n_epochs, epoch_start=len(results),
                       notebook=True):

    # Store this epoch as a new row. DataFrame.append was removed in
    # pandas 2.0, so build a one-row frame and concatenate it instead.
    results = pd.concat([results, pd.DataFrame([result._asdict()])],
                        ignore_index=True)

    xs = results.index

    # Update the plot above
    lines['train'].set_data(xs, results.cost)
    lines['val'].set_data(xs, results.val)

    # Filter the first cost epoch (can be really large) out of the upper
    # limit, unless it is the only epoch available so far.
    shown_cost = results.cost if len(results.cost) < 2 else results.cost[1:]
    max_cost = max(max(shown_cost), max(results.val))
    min_cost = min(min(results.cost), min(results.val))

    # The plot limits need updating too
    ax.set_ylim(min_cost*.9, max_cost*1.1)
    ax.set_xlim(-.5, len(results.cost) - .5)

    # Refresh the efficiency and false-positive series on their own axes
    replace_in_ax(lax, lines['eff'], xs, results['eff_val'].apply(lambda x: x.eff_rate))
    replace_in_ax(tax, lines['fp'], xs, results['eff_val'].apply(lambda x: x.fp_rate))

    # Redraw the figure
    fig.canvas.draw()

    # Save each model state dictionary
    torch.save(model.state_dict(), output / f'{name}_{result.epoch}.pyt')

Number of batches: train = 1250, val = 78


Epoch 0: train=1214.72, val=1089.28, took 15.328 s
  Validation Found 315 of 54700, added 9500 (eff 0.58%) (0.951 FP/event)


Epoch 1: train=834.706, val=540.163, took 14.254 s
  Validation Found 0 of 54700, added 0 (eff 0.00%) (0.0 FP/event)


Epoch 2: train=240.019, val=40.7405, took 14.154 s
  Validation Found 0 of 54700, added 0 (eff 0.00%) (0.0 FP/event)


Epoch 3: train=20.2519, val=13.0246, took 14.123 s
  Validation Found 0 of 54700, added 0 (eff 0.00%) (0.0 FP/event)


Epoch 4: train=11.5608, val=10.7032, took 14.103 s
  Validation Found 0 of 54700, added 0 (eff 0.00%) (0.0 FP/event)


Epoch 5: train=10.3343, val=10.0971, took 14.688 s
  Validation Found 0 of 54700, added 0 (eff 0.00%) (0.0 FP/event)


Epoch 6: train=9.95728, val=9.87553, took 14.014 s
  Validation Found 0 of 54700, added 0 (eff 0.00%) (0.0 FP/event)


Epoch 7: train=9.80974, val=9.78262, took 14.195 s
  Validation Found 0 of 54700, added 0 (eff 0.00%) (0.0 FP/event)


Epoch 8: train=9.74596, val=9.7413, took 13.973 s
  Validation Found 0 of 54700, added 0 (eff 0.00%) (0.0 FP/event)


Epoch 9: train=9.71725, val=9.72256, took 14.004 s
  Validation Found 0 of 54700, added 0 (eff 0.00%) (0.0 FP/event)



## Results

Let's save some results (even though, if you have not changed the code above, the model has already been saved after every epoch):

In [50]:
print(results)

  epoch         cost          val       time                        eff_val  \
0     0  1214.723770  1089.277004  15.327566  (315, 312, 54385, 9500, 9985)   
1     1   834.706196   540.162888  14.254317         (0, 0, 54700, 0, 9985)   
2     2   240.018768    40.740535  14.154000         (0, 0, 54700, 0, 9985)   
3     3    20.251921    13.024568  14.123296         (0, 0, 54700, 0, 9985)   
4     4    11.560848    10.703176  14.103109         (0, 0, 54700, 0, 9985)   
5     5    10.334343    10.097114  14.687786         (0, 0, 54700, 0, 9985)   
6     6     9.957279     9.875535  14.013991         (0, 0, 54700, 0, 9985)   
7     7     9.809738     9.782617  14.195371         (0, 0, 54700, 0, 9985)   
8     8     9.745959     9.741298  13.972584         (0, 0, 54700, 0, 9985)   
9     9     9.717245     9.722561  14.004029         (0, 0, 54700, 0, 9985)   

                                                outA  
0  [[[tensor([-0.0012, -0.0012, -0.0012,  ..., -0...  
1  [[[tensor([-0.001

In [21]:
# Pull the `outA` column out of the results table as a NumPy array of objects,
# so that B[i] is whatever was stored in that column for row (epoch) i.
A = results.outA
B = A.values

In [22]:
# First and second entries stored in `outA` for row 9
# (epoch 9, the last row of the table printed above).
L1 = B[9][0]
L2 = B[9][1]

In [51]:
# NOTE(review): this cell did not run cleanly - its recorded output is
# "RuntimeError: Can't call numpy() on Variable that requires grad".
# Things to sort out before it can work:
#   * `L1` comes straight from `results.outA`; per the error message it needs
#     `.detach()` (and presumably `.cpu()`) before matplotlib can use it.
#   * `Y` and `Z` are not defined in any cell visible in this notebook.
#   * `ax` is presumably still the axes returned by dual_train_plots() rather
#     than a 3D axes - confirm; plot_surface/set_zlim/zaxis need a 3D projection.
#   * `cm` and `LinearLocator` are only imported in a later cell, and no import
#     of `FormatStrFormatter` is visible.
# Plot the surface.
surf = ax.plot_surface(L1, Y, Z, cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)

# Customize the z axis.
ax.set_zlim(-1.01, 1.01)
ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)

plt.show()

RuntimeError: Can't call numpy() on Variable that requires grad. Use var.detach().numpy() instead.

In [44]:
import numpy as np
import math
import matplotlib.pyplot as plot
import mpl_toolkits.mplot3d.axes3d as axes3d
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator

def cube_marginals(cube, normalize=False):
    """Collapse a 3-D array along each of its first three axes in turn.

    Parameters
    ----------
    cube : array-like
        Data to reduce; it needs at least three axes.
    normalize : bool, optional
        When true the reduction is ``np.mean``; otherwise it is ``np.sum``.

    Returns
    -------
    tuple
        The reductions over axis 0, axis 1 and axis 2, in that order
        (the ``xy``, ``xz`` and ``yz`` marginals).
    """
    if normalize:
        reduce_fn = np.mean
    else:
        reduce_fn = np.sum
    return tuple(reduce_fn(cube, axis=axis) for axis in (0, 1, 2))

def plotcube(cube,x=None,y=None,z=None,normalize=False,plot_front=False):
    """Plot the three marginals of a 3-D array as surfaces in one 3-D figure.

    Parameters
    ----------
    cube : numpy.ndarray
        Array whose shape unpacks as ``(Z, Y, X)``.
    x, y, z : array-like, optional
        Coordinates along each axis. Each defaults to ``np.arange`` of the
        matching dimension of ``cube``.
    normalize : bool, optional
        Forwarded to ``cube_marginals`` (mean instead of sum).
    plot_front : bool, optional
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    None
        The figure is created and shown as a side effect.
    """
    (Z,Y,X) = cube.shape
    (xy,xz,yz) = cube_marginals(cube,normalize=normalize)

    # Test with `is None`: `== None` on an array is evaluated elementwise and
    # makes the `if` raise for any array with more than one element.
    x = np.arange(X) if x is None else np.asarray(x)
    y = np.arange(Y) if y is None else np.asarray(y)
    z = np.arange(Z) if z is None else np.asarray(z)

    fig = plot.figure()
    # fig.gca(projection='3d') is rejected by matplotlib >= 3.6; add_subplot
    # with an explicit position works on old and new versions alike.
    ax = fig.add_subplot(111, projection='3d')

    xy_surface = ax.plot_surface(x[None,:].repeat(Y,axis=0), y[:,None].repeat(X,axis=1), xy,
                                 cmap=plot.cm.coolwarm, alpha=0.75)
    ax.plot_surface(x[None,:].repeat(Z,axis=0), xz, z[:,None].repeat(X,axis=1),
                    cmap=plot.cm.coolwarm, alpha=0.75)
    ax.plot_surface(yz, y[None,:].repeat(Z,axis=0), z[:,None].repeat(Y,axis=1),
                    cmap=plot.cm.coolwarm, alpha=0.75)

    # The colour bar previously read a global `surf` that this function never
    # defines. plot_surface colours a surface by its third argument, so the xy
    # surface is the one whose colours encode marginal values; use that one.
    fig.colorbar(xy_surface, shrink=0.5, aspect=5)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    plot.show()

# Plotting

In [45]:
# Marginal plots for two stored outputs: row 9 / entry 1, then row 5 / entry 0.
for row, entry in ((9, 1), (5, 0)):
    plotcube(B[row][entry].cpu().detach().numpy())

In [52]:
# NOTE(review): this cell failed with "ValueError: shape mismatch: objects cannot
# be broadcast to a single shape" (recorded below). The three slices of `C` are
# taken along different axes, so they only share a shape when all three
# dimensions of `C` are equal.
# `C` is assigned in the last cell of the notebook as shown here, so this cell
# also relies on out-of-order execution.
fig = plt.figure()
ax = fig.gca(projection='3d')  # NOTE(review): keyword form of gca() is rejected by matplotlib >= 3.6
n = 1  # index of the slice taken along each axis
surf = ax.plot_surface(C[n,:,:], C[:,n,:],C[:,:,n], cmap=cm.coolwarm,
                       linewidth=0, antialiased=False)
plt.show()


ValueError: shape mismatch: objects cannot be broadcast to a single shape

Go ahead and save the final model (even though it was also saved above):

In [30]:
torch.save(model.state_dict(), output / f'{name}_final.pyt')

Save the output results:

In [31]:
# Write the per-epoch history into the run's output folder.
# NOTE(review): with the default orient, `results.to_dict()` is a dict of
# {column: {row: value}} dicts, so each archive entry is presumably stored as a
# pickled object array and needs `allow_pickle=True` when read back - confirm
# this is the intended on-disk format.
np.savez(output / f'{name}_stats.npz', **results.to_dict())

Save the plot (remake the plot just in case the one above has broken):

In [32]:
# Rebuild the summary figure from the stored history and write it to disk.
eff_rates = results['eff_val'].apply(lambda stats: stats.eff_rate)
fp_rates = results['eff_val'].apply(lambda stats: stats.fp_rate)
dual_train_plots(results.index, results.cost, results.val, eff_rates, fp_rates)
plt.tight_layout()
plt.savefig(str(output / f'{name}_stats_a.png'))

Quit the kernel (try to be nice to other users)

In [16]:
quit()

In [33]:
# Convert the stored tensor to NumPy once and reuse it (it used to be converted
# twice, once for each statement).
# NOTE(review): `C` is read by the surface-plot cell earlier in the notebook, so
# this cell has to be executed before that one.
C = B[5][0].cpu().detach().numpy()
# Indices of the non-zero entries, one index array per axis of C.
z, x, y = C.nonzero()