# 1. Create the training sample

In [11]:
%load_ext autoreload
%autoreload 2

import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from tqdm.auto import tqdm
import sys
# Make the project-local `utilities` module importable; this append has to run
# before the `import utilities` on the next line.
# NOTE(review): absolute, user-specific path — the notebook only runs where this
# directory exists. Consider deriving it from a configurable base directory.
sys.path.append('/home/belle/zhangboy/inclusive_R_D/')
import utilities as util
# Use the colours of matplotlib's tab20 colormap as the default colour cycle.
plt.rcParams["axes.prop_cycle"] = plt.cycler("color", plt.cm.tab20.colors)

# Both lists come from `utilities`: `training_variables` is used further down as
# the feature columns, `columns` as the branch names kept when reading ROOT files.
training_variables = util.training_variables
columns = util.all_relevant_variables

Welcome to JupyROOT 6.26/14


In [12]:
# Read the signal and generic MC ntuples (ROOT files, tree `B0`) into pandas
# DataFrames, keeping only events inside the signal region. No shuffling happens
# here; the rows are shuffled later when the training sample is assembled.

# Selection applied while reading; the individual cuts are AND-ed together.
signal_region = ' & '.join([
    '(1.855<D_M)',
    '(D_M<1.885)',
    '(B0_roeMbc_my_mask>5)',
    '(B0_roeDeltae_my_mask<2)',
    '(B0_CMS2_weMbc>4.3)',
    '(-3<B0_CMS0_weDeltae)',
    '(B0_CMS0_weDeltae<2)',
])


def load_mc(file_pattern, cut, branches):
    """Read every file matching `file_pattern` into a dict of NumPy arrays.

    Parameters
    ----------
    file_pattern : str
        Path or glob in uproot's ``"<files>:<tree>"`` form.
    cut : str
        Selection expression; only entries passing it are returned.
    branches : container of str
        Names of the branches to read; all other branches are skipped.

    Returns
    -------
    dict
        Mapping of branch name to NumPy array, as returned by
        ``uproot.concatenate(..., library="np")``.
    """
    return uproot.concatenate([file_pattern],
                              library="np",
                              cut=cut,
                              filter_branch=lambda branch: branch.name in branches)


sig_mc = load_mc('../Samples/Generic_MC15ri/e_channel/sigDDst_quaxo_2.root:B0',
                 signal_region, columns)
generic_mc = load_mc('../Samples/Generic_MC15ri/e_channel/MC15ri_local_200fb_control/*.root:B0',
                     signal_region, columns)

df_sig = pd.DataFrame(sig_mc)
df_generic = pd.DataFrame(generic_mc)

In [13]:
# Split both MC samples into their components
# ('e' presumably selects the electron channel — confirm against `utilities`).
sig_samples = util.get_dataframe_samples_new(df_sig, 'e', template=False)
bkg_samples = util.get_dataframe_samples_new(df_generic, 'e', template=False)

# Training composition: the signal component taken from the signal MC, followed
# by six background components taken from the generic MC (insertion order kept).
signal_key = r'$D\tau\nu$'
background_keys = ['bkg_FakeD', 'bkg_combinatorial', 'bkg_continuum',
                   'bkg_singleBbkg', 'bkg_TDFl', 'bkg_fakeTracks']

train_dic = {signal_key: sig_samples[signal_key]}
train_dic.update({key: bkg_samples[key] for key in background_keys})
# Keep only a reproducible random half of the fake-D component.
train_dic['bkg_FakeD'] = train_dic['bkg_FakeD'].sample(frac=0.5, random_state=0)

# Sanity check: 'mode' value of the first row and number of rows per component.
for name, df in train_dic.items():
    print(name, df.iloc[0]['mode'], len(df))

$D\tau\nu$ 8 57718
bkg_FakeD 1 421298
bkg_combinatorial 4 91828
bkg_continuum 3 69691
bkg_singleBbkg 5 36815
bkg_TDFl 2 26843
bkg_fakeTracks 0 25743


In [14]:
# Create the training sample: stack all components and shuffle the rows.
# The 'mode' column is used as the training label.
# A fixed random_state keeps the row order reproducible, so the file written
# from this frame (and the train/validation split derived from it) is identical
# on every run.
df_train = pd.concat(list(train_dic.values())).sample(frac=1, random_state=0)

In [5]:
# Number of training events per 'mode' label, most frequent first.
df_train['mode'].value_counts()

mode
1    421298
4     91828
3     69691
8     57718
5     36815
2     26843
0     25743
Name: count, dtype: int64

In [5]:
# Count missing values per column once, then print only the columns that have any.
nan_counts = df_train.isna().sum()
print(nan_counts[nan_counts != 0])

B0_mcDaughter_0_PDG    26546
B0_mcDaughter_1_PDG    26546
B0_mcPDG               26546
D_K_mcPDG               4840
D_mcPDG                21614
ell_mcPDG               3405
dtype: int64


In [30]:
# Columns kept for training: the label first, followed by the feature variables.
target_column = ['mode']
feature_column = training_variables
selected_columns = target_column + feature_column

# Independent copy restricted to those columns; the shuffled index of df_train
# is discarded in favour of a fresh 0..N-1 index.
df_train_sub = df_train[selected_columns].copy()
df_train_sub = df_train_sub.reset_index(drop=True)

In [32]:
# Write the training table as tree `B0` into a newly created ROOT file.
train_file_path = '../AutogluonModels/train.root'
with uproot.recreate(train_file_path) as file:
    file['B0'] = df_train_sub

# 2. Set up AutoGluon and perform the training

In [1]:
from autogluon.tabular import TabularPredictor
import uproot
import pandas as pd

# Read the training table back from the ROOT file. The 'index' branch is left
# out (presumably the DataFrame index stored when the file was written).
train_sub = uproot.concatenate(['../AutogluonModels/train.root:B0'], library="np")
feature_arrays = {branch: values for branch, values in train_sub.items() if branch != 'index'}
df_train_sub = pd.DataFrame(feature_arrays)

# Reproducible 80/20 split: a seeded random 80% of the rows is used for training,
# and the rows that were not drawn form the validation set.
train_data = df_train_sub.sample(frac=0.8, random_state=0)
validation_data = df_train_sub.drop(index=train_data.index)

In [2]:
# Hyperparameters handed to AutoGluon, one entry per model family.
# NOTE(review): confirm that every key below (e.g. "early_stop_patience",
# "lr_scheduler", "lr_scheduler_params" for NN_TORCH) is recognised by the
# installed AutoGluon version.
hyperpar = {
    "GBM": {
        "num_boost_round": 500,
        "early_stopping_rounds": 20,
        "learning_rate": 0.05,  # Lower learning rate
        "min_data_in_leaf": 20,
        "max_depth": 6,
        # lambda_l1 / lambda_l2 were previously listed twice (1, then 1e-2) with a
        # missing comma in between, which made the whole cell a SyntaxError.
        # A single pair is kept, using the value that the later (winning)
        # duplicate would have produced.
        "lambda_l1": 1e-2,      # L1 regularization
        "lambda_l2": 1e-2,      # L2 regularization
    },
    "CAT": {
        "iterations": 500,
        "od_type": "Iter",
        "od_wait": 20,
    },
    "XGB": {
        "learning_rate": 0.05,
        "n_estimators": 500,
        "early_stopping_rounds": 20,
        "reg_alpha": 1e-2,
        "reg_lambda": 1e-2,
    },
    "NN_TORCH": {
        "num_epochs": 50,
        "num_layers": 2,
        "hidden_size": 128,
        "early_stop_patience": 5,
        "dropout_prob": 0.5,       # Add dropout to reduce overfitting
        "batch_size": 32,
        "weight_decay": 1e-4,      # Apply L2 regularization
        "learning_rate": 0.001,    # Adjust learning rate
        "lr_scheduler": "CosineAnnealingLR",  # or other supported schedulers
        "lr_scheduler_params": {
            "T_max": 50,
        },
    },
}

# Settings for AutoGluon's hyperparameter tuning.
hyperpar_tuner = {
    "num_trials": 20,
    "scheduler": "local",
    "searcher": "random",
}

# Classifier on the 'mode' label, scored with the 'f1_macro' metric.
ag = TabularPredictor(label='mode', eval_metric='f1_macro') # many more configuration

# Everything passed to fit() besides the training frame itself.
fit_options = dict(
    presets='good_quality',
    save_bag_folds=True,
    excluded_model_types=['FASTAI'],
    time_limit=600,
    hyperparameters=hyperpar,
    hyperparameter_tune_kwargs=hyperpar_tuner,
)
predictor = ag.fit(train_data, **fit_options)

No path specified. Models will be saved in: "AutogluonModels/ag-20241122_083841"
Presets specified: ['good_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by setting `save_bag_folds=True`.
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked

# 3. Load and inspect model

In [7]:
# Folder of a previously trained predictor (timestamped AutoGluon run directory).
model_dir = "../AutogluonModels/ag-20241122_085044"
predictor = TabularPredictor.load(model_dir)

In [4]:
# Leaderboard of the trained models, scored on the validation split; two extra
# metrics are reported next to the predictor's own evaluation metric.
extra_metrics = ['roc_auc_ovo_macro', 'balanced_accuracy']
predictor.leaderboard(validation_data, extra_metrics=extra_metrics)

Unnamed: 0,model,score_test,roc_auc_ovo_macro,balanced_accuracy,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM_BAG_L1_FULL,0.393774,0.832627,0.375637,,f1_macro,0.832892,,28.590117,0.832892,,28.590117,1,True,8
1,WeightedEnsemble_L2_FULL,0.393774,0.832627,0.375637,,f1_macro,0.85618,,36.702713,0.023288,,8.112596,2,True,9
2,WeightedEnsemble_L3_FULL,0.393774,0.832627,0.375637,,f1_macro,0.859019,,43.920925,0.026127,,15.330808,3,True,12
3,LightGBMXT_BAG_L1,0.392849,0.834741,0.375076,0.394411,f1_macro,15.29967,1530.492718,2805.307317,15.29967,1530.492718,2805.307317,1,True,1
4,LightGBM_BAG_L1,0.392524,0.833781,0.374904,0.395633,f1_macro,6.421356,563.666807,388.340183,6.421356,563.666807,388.340183,1,True,2
5,WeightedEnsemble_L2,0.392524,0.833781,0.374904,0.395633,f1_macro,6.44296,563.799349,396.452779,0.021604,0.132542,8.112596,2,True,3
6,WeightedEnsemble_L3,0.392524,0.833781,0.374904,0.395633,f1_macro,6.466001,563.796647,403.670991,0.044645,0.129839,15.330808,3,True,6
7,LightGBMXT_BAG_L1_FULL,0.392194,0.833554,0.374755,,f1_macro,1.42346,,50.589496,1.42346,,50.589496,1,True,7
8,LightGBM_BAG_L2_FULL,0.391896,0.835256,0.378817,,f1_macro,3.202905,,118.387522,0.946553,,39.20791,2,True,11
9,LightGBM_BAG_L2,0.390768,0.837974,0.378746,0.392154,f1_macro,29.169917,2103.727971,3435.610131,7.448891,9.568446,241.962631,2,True,5


In [8]:
# Leaderboard on the validation split (same call as the previous leaderboard cell).
predictor.leaderboard(
    data=validation_data,
    extra_metrics=['roc_auc_ovo_macro', 'balanced_accuracy'],
)

Unnamed: 0,model,score_test,roc_auc_ovo_macro,balanced_accuracy,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI_BAG_L2,0.413156,0.841535,0.396298,0.412699,f1_macro,16.515323,217.615295,3374.677931,5.700277,84.533147,825.772531,2,True,4
1,WeightedEnsemble_L3,0.413156,0.841535,0.396298,0.412699,f1_macro,16.543453,217.746156,3393.703473,0.028129,0.130861,19.025541,3,True,7
2,LightGBM_BAG_L2,0.389424,0.842302,0.379818,0.391058,f1_macro,13.010087,136.24393,2580.149384,2.195041,3.161782,31.243984,2,True,6
3,LightGBMXT_BAG_L2,0.387497,0.842365,0.378457,0.388879,f1_macro,14.283112,242.426528,2886.443301,3.468066,109.34438,337.537901,2,True,5
4,LightGBMXT_BAG_L1,0.385262,0.831248,0.367961,0.385665,f1_macro,5.967541,14.706898,74.360766,5.967541,14.706898,74.360766,1,True,2
5,WeightedEnsemble_L2,0.385262,0.831248,0.367961,0.385665,f1_macro,6.003815,14.836629,82.575466,0.036275,0.129731,8.2147,2,True,3
6,LightGBM_BAG_L2_FULL,0.385091,0.840151,0.376995,,f1_macro,1.702802,,492.900146,0.290777,,307.495451,2,True,13
7,LightGBMXT_BAG_L2_FULL,0.384012,0.84005,0.376155,,f1_macro,1.935358,,1142.132465,0.523333,,956.72777,2,True,12
8,LightGBMXT_BAG_L1_FULL,0.383193,0.830669,0.366397,,f1_macro,0.767725,,21.813005,0.767725,,21.813005,1,True,9
9,WeightedEnsemble_L2_FULL,0.383193,0.830669,0.366397,,f1_macro,0.813939,,30.027705,0.046214,,8.2147,2,True,10


In [5]:
# Name of the model the predictor reports as its best one.
predictor.model_best

'LightGBM_BAG_L1_FULL'

In [6]:
# Permutation-based feature importance, evaluated on the validation split.
predictor.feature_importance(data=validation_data)

Computing feature importance via permutation shuffling for 44 features using 5000 rows with 5 shuffle sets...
	13.39s	= Expected runtime (2.68s per shuffle set)
	6.23s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B0_roeDeltae_my_mask,0.054594,0.006399,2.2e-05,5,0.067769,0.041418
B0_roeMbc_my_mask,0.043414,0.004393,1.2e-05,5,0.052459,0.03437
D_daughterInvM_0_1,0.029828,0.002667,8e-06,5,0.03532,0.024336
D_daughterInvM_1_2,0.02831,0.005647,0.00018,5,0.039936,0.016683
D_A1FflightDistanceSig_IP,0.026893,0.012865,0.004743,5,0.053382,0.000405
B0_vtxDDSig,0.026874,0.009656,0.001697,5,0.046755,0.006993
B0_R2,0.023055,0.002275,1.1e-05,5,0.027738,0.018371
B0_KSFWV14,0.021329,0.005471,0.000477,5,0.032594,0.010065
D_pi1_pValue,0.016919,0.003883,0.000311,5,0.024914,0.008924
B0_vtxReChi2,0.013495,0.003449,0.000471,5,0.020597,0.006392


In [9]:
# Feature importance for one specific model instead of the default one.
# NOTE(review): in the recorded run this call raised a TypeError from
# `TrainEvalCallback` ('bool' object is not callable); the cause is not visible
# in this notebook and needs to be investigated before relying on this cell.
predictor.feature_importance(data=validation_data, model='WeightedEnsemble_L3')

TypeError: Exception occured in `TrainEvalCallback` when calling event `before_fit`:
	'bool' object is not callable

In [6]:
# Score the predictor on the validation split and print the resulting metrics.
results = predictor.evaluate(data=validation_data)
print(results)

{'f1_macro': 0.40615379527114787, 'accuracy': 0.8117369607421595, 'balanced_accuracy': 0.38936295971515233, 'mcc': 0.5181652000337745}


# 4. Apply model

In [51]:
# Map each 'mode' class label to a readable probability-column name.
class_to_column = {
    0: 'fakeTracks_prob',
    1: 'fakeD_prob',
    2: 'fakeL_prob',
    3: 'continuum_prob',
    4: 'combinatorial_prob',
    5: 'singleBbkg_prob',
    8: 'sig_prob',
}

# Per-class probabilities for every validation event, with the columns renamed.
pred = predictor.predict_proba(validation_data).rename(columns=class_to_column)

In [56]:
# Display the table of predicted class probabilities.
pred

Unnamed: 0,fakeTracks_prob,fakeD_prob,fakeL_prob,continuum_prob,combinatorial_prob,singleBbkg_prob,sig_prob
1,0.006789,0.777897,0.024496,0.010685,0.114710,0.030317,0.035106
4,0.006194,0.145342,0.035548,0.035458,0.249365,0.071790,0.456304
15,0.012899,0.885186,0.016199,0.047766,0.017444,0.003973,0.016532
22,0.012091,0.316573,0.067048,0.049029,0.398908,0.061314,0.095037
26,0.004274,0.388828,0.011429,0.521257,0.043394,0.003799,0.027018
...,...,...,...,...,...,...,...
729913,0.018933,0.520465,0.029479,0.418423,0.006764,0.002547,0.003390
729920,0.012418,0.867310,0.023997,0.008962,0.039717,0.043069,0.004527
729924,0.013535,0.171850,0.045695,0.112353,0.564511,0.053809,0.038247
729925,0.013976,0.809941,0.024437,0.007302,0.080575,0.060410,0.003358


In [59]:
# Attach the predicted probabilities to the validation events. `df_pred` is
# built here because this is the first cell that uses it; without this line the
# cell fails with a NameError when the notebook is run top to bottom.
df_pred = pd.concat([validation_data, pred], axis=1)
# Number of validation events per true 'mode' label.
df_pred['mode'].value_counts()

mode
1    84008
4    18590
3    13926
8    11672
5     7330
2     5431
0     5030
Name: count, dtype: int64

In [60]:
# Join the labelled validation events with their predicted class probabilities,
# then count the true 'mode' labels of the events passing the sig_prob cut.
df_pred = pd.concat([validation_data, pred], axis=1)
signal_like = df_pred.query('sig_prob>0.4')
signal_like['mode'].value_counts()

mode
8    4690
4     905
1     799
5     455
3     339
2     300
0     101
Name: count, dtype: int64