In [1]:
%%capture _ 
# ^ %%capture stores this cell's stdout/stderr in `_`, so import-time warnings are not displayed
import torch
import mlflow
# import hiddenlayer as HL

from model.collectdata_poca_KDE import collect_data_poca_ATLAS as collect_data_poca
from model.alt_loss_A import Loss
from model.training import trainNet, select_gpu
from model.utilities import load_full_state, count_parameters, Params, save_to_mlflow

from model.autoencoder_models import UNet
from model.autoencoder_models import UNetPlusPlus

In [2]:
# Run configuration shared by the cells below; individual values are read
# back with args['<name>'].
args = Params(
    batch_size=64,
    # NOTE(review): the meaning of -1 is defined by select_gpu — confirm
    # (presumably "pick a device automatically").
    device=select_gpu(-1),
    epochs=100,
    lr=0.01,
    experiment_name="ATLAS UNet++",  # used as the MLflow experiment name
    asymmetry_parameter=0,           # passed to Loss(coefficient=...)
)

In [3]:
# HDF5 file that both the training and the validation split are read from.
DATA_FILE = '/share/lazy/ekauffma/ATLAS_PVFinderData.h5'

events = 3000    # total number of events used (also logged with the run)
n_train = 2400   # events [0, n_train) train the model; [n_train, events) validate it


def _load_split(event_slice, shuffle):
    """Build a data loader over one contiguous range of events.

    Args:
        event_slice: ``slice`` object selecting the events to load.
        shuffle: forwarded to ``collect_data_poca`` as its ``shuffle`` argument.

    Returns:
        Whatever ``collect_data_poca`` returns for that range (the loader
        handed to ``trainNet`` later in the notebook).
    """
    return collect_data_poca(DATA_FILE,
                             batch_size=args['batch_size'],
                             device=args['device'],
                             shuffle=shuffle,
                             load_A_and_B=True,
                             load_xy=True,
                             slice=event_slice)


## This is used when training with the new KDE
train_loader = _load_split(slice(0, n_train), shuffle=True)

# Validation data is deliberately not shuffled: the order of events should
# not matter for evaluation, and a fixed order keeps the batch composition
# (and therefore per-batch validation numbers) identical from epoch to epoch.
val_loader = _load_split(slice(n_train, events), shuffle=False)

Loading data...
Loaded /share/lazy/ekauffma/ATLAS_PVFinderData.h5 in 716.7 s
Constructing 600 event dataset took 0.02162 s


In [4]:
# Point MLflow at the shared file-based store, then make the experiment named
# in the run configuration the active one.
# NOTE(review): the recorded output of this cell shows MissingConfigException
# tracebacks about '/share/lazy/pv-finder_model_repo/ML/meta.yaml'; the later
# cells still ran, so this looks like a logged warning about a stray directory
# in the store rather than a failure — confirm and clean up the store.
TRACKING_URI = 'file:/share/lazy/pv-finder_model_repo'
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(args['experiment_name'])

Traceback (most recent call last):
  File "/data/home/ekauffma/.local/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 256, in list_experiments
    experiment = self._get_experiment(exp_id, view_type)
  File "/data/home/ekauffma/.local/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 336, in _get_experiment
    meta = read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/data/home/ekauffma/.local/lib/python3.7/site-packages/mlflow/utils/file_utils.py", line 175, in read_yaml
    raise MissingConfigException("Yaml file '%s' does not exist." % file_path)
mlflow.exceptions.MissingConfigException: Yaml file '/share/lazy/pv-finder_model_repo/ML/meta.yaml' does not exist.
Traceback (most recent call last):
  File "/data/home/ekauffma/.local/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 256, in list_experiments
    experiment = self._get_experiment(exp_id, view_type)
  File "/data/home/ekauffma/.local/lib/python3

In [5]:
# Seed PyTorch's global random number generator before the model is built so
# that weight initialisation (and any later draws from the global generator)
# start from the same state on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Network, optimiser and loss used by the training cell below.
model = UNetPlusPlus().to(args['device'])
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'], betas=(0.9, 0.999))
loss = Loss(epsilon=1e-5, coefficient=args['asymmetry_parameter'])

# Number of trainable parameters (elements of every tensor with
# requires_grad=True); logged with the run as 'Param: Parameters'.
parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

In [6]:
# Print the module tree of the network as a quick sanity check of its layers.
print(model)

UNetPlusPlus(
  (rcbn1): ConvBNrelu(
    (0): Conv1d(4, 64, kernel_size=(25,), stride=(1,), padding=(12,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.25, inplace=False)
  )
  (rcbn2): ConvBNrelu(
    (0): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.25, inplace=False)
  )
  (rcbn3): ConvBNrelu(
    (0): Conv1d(64, 64, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.25, inplace=False)
  )
  (rcbn4): ConvBNrelu(
    (0): Conv1d(64, 64, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.25, inplace=False)
  )
  (rcbn5): ConvBNrelu

In [None]:
# Optional: resume from a previous run.
# NOTE(review): the path below ends in 'train.ipynb', i.e. a notebook artifact
# rather than a saved state — confirm the intended file before re-enabling.
#load_full_state(model, optimizer, '/share/lazy/pv-finder_model_repo/24/9a2b98a397eb404497b26ab5eaa091a5/artifacts/train.ipynb')

run_name = 'second pass with ATLAS data - increased learning rate'

# tune kernel based on gpu
#torch.backends.cudnn.benchmark=True

# trainNet is consumed as an iterable that yields one result per epoch;
# enumerate() supplies the epoch index used as the MLflow step.
train_iter = enumerate(trainNet(model, optimizer, loss, train_loader, val_loader, args['epochs'], notebook=True))
with mlflow.start_run(run_name = run_name) as run:
    # Keep a copy of this notebook with the run for provenance.
    mlflow.log_artifact('train.ipynb')

    # Record the optimiser settings once per run: the run name refers to a
    # changed learning rate, so it must be queryable alongside the metrics.
    mlflow.log_param('Learning rate', args['lr'])
    mlflow.log_param('Batch size', args['batch_size'])

    for i, result in train_iter:
        print(result.cost)

        # Snapshot the whole model after every epoch; the file is overwritten
        # locally and re-uploaded as a run artifact each time.
        torch.save(model, 'run_stats.pyt')
        mlflow.log_artifact('run_stats.pyt')

        # Keys carry a 'Metric: ' / 'Param: ' prefix as expected by
        # save_to_mlflow; step=i ties the values to the current epoch.
        save_to_mlflow({
            'Metric: Training loss':result.cost,
            'Metric: Validation loss':result.val,
            'Metric: Efficiency':result.eff_val.eff_rate,
            'Metric: False positive rate':result.eff_val.fp_rate,
            'Param: Parameters':parameters,
            'Param: Events':events,
            'Param: Asymmetry':args['asymmetry_parameter'],
            'Param: Epochs':args['epochs'],
        }, step=i)


Number of batches: train = 38, val = 10


HBox(children=(FloatProgress(value=0.0, description='Epochs', layout=Layout(flex='2'), style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=38.0, style=Progr…