In [1]:
import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from ttjjana import prepdata

from flowmatching import *

# Read data and reshape

In [None]:
rawinputs, normedinputs, inputmeans, inputsigma, ncat_per_feature = prepdata()

In [3]:
inputtmp

Unnamed: 0,met,ht,pt5,pt6,njet,nbtag
0,84.002205,765.415955,96.491432,83.413086,7,2
1,95.233681,924.504883,71.028915,68.117470,11,2
2,54.860222,1668.847900,76.102409,46.142303,10,3
3,37.822800,355.041229,38.004093,36.965599,8,3
4,29.449005,555.668457,62.723263,45.919170,7,2
...,...,...,...,...,...,...
227371,76.066948,356.712494,40.887901,25.034538,7,2
227372,112.630661,982.286682,64.555443,62.528595,9,2
227373,8.826685,555.999023,38.395134,34.182259,9,2
227374,41.213657,687.215576,58.986603,51.859745,9,2


In [4]:
# this contains how many conditional features are in each category
ncat_per_feature

[0, 0, 0, 0, 3, 2]

In [5]:
normedinputs.shape

(227376, 9)

In [6]:
#conditional dimensions
cdim = np.sum(ncat_per_feature)
M = normedinputs.shape[1] - cdim
if cdim > 0:
    print(f"Conditional dimension: {cdim}")
    isconditional = True
else:
    print("No conditional dimension")
    isconditional = False

config = Config(
        M=M,
        nhidden=1024,        # Reduced hidden size for faster example
        nlayers=2,          # Number of layers (tune)
        batch_size=128,    # Adjust based on VRAM
        learning_rate=1e-3, # Tune learning rate
        epochs=100,         # Number of training epochs
        time_embed_dim=64, # Dimension for time embedding
        ode_steps=50,       # Number of steps for sampling ODE solver
        epsilon=1e-5,        # Small offset for time sampling
        conditional=isconditional,   # Enable conditional training
        conditional_dim=cdim    # Dimension of conditional variable
    )

Conditional dimension: 5
Using device: cuda
Automatic Mixed Precision (AMP) enabled: True


In [7]:
data_cpu = torch.from_numpy(normedinputs).float()

# 3. Move Entire Dataset to Target Device (GPU if available)
data_on_device = None
if config.device.type == 'cuda':
    print(f"Attempting to move dataset ({data_cpu.nelement() * data_cpu.element_size() / 1024**2:.2f} MB) to {config.device}...")
    try:
        data_on_device = data_cpu.to(config.device)
        print(f"Successfully moved dataset to {data_on_device.device}")
    except RuntimeError as e:
        print(f"\n----- ERROR moving full dataset to GPU: {e}. Exiting. -----")
        exit()
else:
    print("Running on CPU, keeping data on CPU.")
    data_on_device = data_cpu


Attempting to move dataset (7.81 MB) to cuda...
Successfully moved dataset to cuda:0


# Train model

In [8]:
# --- Training ---
train_model_flag = True
config_dir = get_config_directory(config)
os.makedirs(config_dir, exist_ok=True)
model_filename = "model_fm.pth" # Flow Matching model filename
epoch_losses = []

if train_model_flag:
    print("\n--- Starting Flow Matching Training ---")
    # Call the flow matching training function
    trained_model, epoch_losses = train_flow_matching(data_on_device, config)

    if trained_model:
        save_model(trained_model, config, filename=model_filename)
        if epoch_losses:
            plt.figure(figsize=(10, 5))
            plt.plot(epoch_losses, label='Training Loss (Flow Matching)')
            plt.xlabel('Epoch')
            plt.ylabel('Loss (MSE)')
            plt.yscale('log')
            plt.grid(True, which='both', linestyle='--', linewidth=0.5)
            plt.legend()
            plt.title(f'Flow Matching Training Loss\n{get_config_description(config)}')
            loss_fig = plt.gcf()
            save_plot(loss_fig, config, "training_loss_fm.png")
    else:
        print("Training failed, model not saved.")



--- Starting Flow Matching Training ---
Training Flow Matching model with data on device: cuda:0
DataLoader created with batch size 128, num_workers=0, pin_memory=False
Initialized VelocityMLP with 2 layers, hidden size 1024, time_embed_dim 64, conditional True, conditional_dim 5
Starting Flow Matching training on cuda for 100 epochs...


Training FM (Epochs): 100%|██████████| 100/100 [15:32<00:00,  9.33s/epoch, avg_loss=1.197127]



Training finished.
Model saved to FM_M4_nh1024_nl2_ted64_BS128_LR1e-03_E100\model_fm.pth
Plot saved to FM_M4_nh1024_nl2_ted64_BS128_LR1e-03_E100\training_loss_fm.png


In [10]:
condlist = [
            [[1., 0., 0.,   1., 0., ]],
            [[0., 1., 0.,   1., 0., ]],
            [[0., 0., 1.,   1., 0., ]],
            [[1., 0., 0.,   0., 1., ]],
            [[0., 1., 0.,   0., 1., ]],
            [[0., 0., 1.,   0., 1., ]]
        ]
select0 = (rawinputs['njet']==7) & (rawinputs['nbtag']==2)
select1 = (rawinputs['njet']==8) & (rawinputs['nbtag']==2)
select2 = (rawinputs['njet']>=9) & (rawinputs['nbtag']==2)
select3 = (rawinputs['njet']==7) & (rawinputs['nbtag']>=3)
select4 = (rawinputs['njet']==8) & (rawinputs['nbtag']>=3)
select5 = (rawinputs['njet']>=9) & (rawinputs['nbtag']>=3)
select_data = [select0, select1, select2, select3, select4, select5]

plottextlist=[
    f'$N_j=7, N_b=2$',
    f'$N_j=8, N_b=2$',
    f'$N_j\geq 9, N_b=2$',
    f'$N_j=7, N_b\geq 3$',
    f'$N_j=8, N_b\geq 3$',
    f'$N_j\geq 9, N_b\geq 3$'
]
njlist = [7, 8, 9, 7, 8, 9]
nblist = [2, 2, 2, 3, 3, 3]

NameError: name 'rawinputs' is not defined

In [11]:
cond = torch.tensor(condlist[0]*4000).to(config.device).float()

In [12]:
cond

tensor([[1., 0., 0., 1., 0.],
        [1., 0., 0., 1., 0.],
        [1., 0., 0., 1., 0.],
        ...,
        [1., 0., 0., 1., 0.],
        [1., 0., 0., 1., 0.],
        [1., 0., 0., 1., 0.]], device='cuda:0')