# 1. Create the training sample

In [1]:
%load_ext autoreload
%autoreload 2

import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from tqdm.auto import tqdm
import sys
# Make the analysis helper module importable.
# NOTE(review): absolute, user-specific path — the notebook only runs where this directory exists.
sys.path.append('/home/belle/zhangboy/inclusive_R_D/')
import utilities as util
# Use the colours of the "tab20" colormap as the default colour cycle for plots.
plt.rcParams["axes.prop_cycle"] = plt.cycler("color", plt.cm.tab20.colors)

# Variable lists taken from the helper module:
# `training_variables` are the classifier input features,
# `columns` are the branches kept when reading the ROOT files.
training_variables = util.training_variables
columns = util.all_relevant_variables

Welcome to JupyROOT 6.26/14


In [2]:
# Read the B0 tree of the signal MC and generic MC ROOT files into pandas DataFrames.
# Only branches listed in `columns` are kept, and only candidates passing the
# signal-region selection below are loaded.
signal_region = ' & '.join([
    '(1.855<D_M)',
    '(D_M<1.885)',
    '(B0_roeMbc_my_mask>5)',
    '(B0_roeDeltae_my_mask<2)',
    '(B0_CMS2_weMbc>4.3)',
    '(-3<B0_CMS0_weDeltae)',
    '(B0_CMS0_weDeltae<2)',
])


def _read_selected_tree(tree_path):
    """Return the selected branches of `tree_path` as a dict of numpy arrays.

    `tree_path` is a "<file or glob>:<tree>" specification passed to
    uproot.concatenate; the module-level `signal_region` cut and the
    `columns` branch filter are applied.
    """
    def keep_branch(branch):
        return branch.name in columns

    return uproot.concatenate([tree_path],
                              library="np",
                              cut=signal_region,
                              filter_branch=keep_branch)


sig_mc = _read_selected_tree('../Samples/Generic_MC15ri/e_channel/sigDDst_quaxo_2.root:B0')
generic_mc = _read_selected_tree('../Samples/Generic_MC15ri/e_channel/MC15ri_MVA_200fb_quaxo/*.root:B0')

df_sig, df_generic = pd.DataFrame(sig_mc), pd.DataFrame(generic_mc)

In [3]:
# Split the signal and generic MC into their components using the helper module.
sig_samples = util.get_dataframe_samples_new(df_sig, 'e', template=False)
bkg_samples = util.get_dataframe_samples_new(df_generic, 'e', template=False)

# The FakeD and fakeTracks components are divided by the continuum flag:
# non-continuum events form the fakeD class, continuum events join the continuum class.
flag_split_parts = [bkg_samples['bkg_FakeD'], bkg_samples['bkg_fakeTracks']]

fakeD_sample = pd.concat(
    [part.query('B0_isContinuumEvent==0') for part in flag_split_parts]
)

fakeB_sample = pd.concat(
    [bkg_samples[key] for key in ('bkg_combinatorial', 'bkg_singleBbkg')]
)

continuum_sample = pd.concat(
    [part.query('B0_isContinuumEvent==1') for part in flag_split_parts]
    + [bkg_samples['bkg_continuum']]
)

# One entry per training class; the insertion order fixes the integer label
# assigned to each class later on. Only 60% of the fakeD sample is kept.
train_dic = {}
train_dic[r'$D\tau\nu$'] = sig_samples[r'$D\tau\nu$']
train_dic['fakeD'] = fakeD_sample.sample(frac=0.6, random_state=0)
train_dic['fakeB'] = fakeB_sample
train_dic['continuum'] = continuum_sample

In [4]:
# Give every class an integer label (its position in train_dic) and a per-event
# weight: the fakeD sample size divided by the class size, rounded to an integer.
reference_size = len(train_dic['fakeD'])
for class_index, class_name in enumerate(train_dic):
    class_df = train_dic[class_name]
    class_df['__weight__'] = round(reference_size / len(class_df))
    class_df['target'] = class_index
    print(class_name, 'target =', class_index, 'size =', len(class_df),
          'weight =', class_df.iloc[0]['__weight__'])

$D\tau\nu$ target = 0 size = 57718 weight = 6
fakeD target = 1 size = 374984 weight = 1
fakeB target = 2 size = 130082 weight = 3
continuum target = 3 size = 315294 weight = 1


In [5]:
# Create the training sample: stack all classes and shuffle the rows.
# The integer 'target' column (set per class above) is the training label.
# A fixed random_state keeps the shuffle, and therefore the file written from it
# and the later train/validation split, reproducible across kernel restarts.
df_train = pd.concat(list(train_dic.values())).sample(frac=1, random_state=0)

In [9]:
# Number of events per class label in the shuffled training sample.
df_train['target'].value_counts()

target
1    372579
3    317064
2    128643
0     57718
Name: count, dtype: int64

In [10]:
# List the columns that contain missing values, with the number of NaNs in each.
nan_counts = df_train.isna().sum()
print(nan_counts[nan_counts != 0])

B0_mcDaughter_0_PDG    19419
B0_mcDaughter_1_PDG    19419
B0_mcPDG               19419
D_K_mcPDG               3406
D_mcPDG                15236
ell_mcPDG               2643
dtype: int64


In [6]:
# Columns written to the training file: the class label first, then the
# classifier input variables, then the per-event weight.
target_column = ['target']
feature_column = training_variables + ['__weight__']

selected_columns = [*target_column, *feature_column]
df_train_sub = df_train[selected_columns].copy()
# Replace the shuffled index by a fresh 0..N-1 index.
df_train_sub = df_train_sub.reset_index(drop=True)

In [7]:
# Write the training frame to a ROOT file as tree 'B0'
# (uproot.recreate overwrites an existing file of the same name).
train_file_path = '../AutogluonModels/train.root'
with uproot.recreate(train_file_path) as root_out:
    root_out['B0'] = df_train_sub

# 2. Set up AutoGluon and perform the training

In [1]:
from autogluon.tabular import TabularPredictor
import uproot
import pandas as pd

# Read the training tree back from disk as a dict of numpy arrays.
train_sub = uproot.concatenate(['../AutogluonModels/train.root:B0'], library="np")

# Build the DataFrame from every branch except the one named 'index'.
branch_arrays = {}
for branch_name, values in train_sub.items():
    if branch_name == 'index':
        continue
    branch_arrays[branch_name] = values
df_train_sub = pd.DataFrame(branch_arrays)

# Split the sample: a random 80% for training, the remaining rows for validation.
train_data = df_train_sub.sample(frac=0.8, random_state=0)
validation_data = df_train_sub.drop(index=train_data.index)

In [14]:
# Define and fit the AutoGluon classifier.

# Model families to train and their configurations.
hyperpar = {
    "GBM": [
        'GBMLarge',
    ],
#     "NN_TORCH": {
#         "num_epochs": 50,
#         'num_layers': 3,
#         'hidden_size': 128,
#         "dropout_prob": 0.5,       # Add dropout to reduce overfitting
#         "batch_size": 32,
#         "weight_decay": 1e-4,      # Apply L2 regularization
#         "learning_rate": 0.01,     # Adjust learning rate
#     },
}

# Hyperparameter-search settings: number of trials and search strategy.
hyperpar_tuner = {
        'num_trials': 20,
        'searcher': 'auto',
    }

# The training file written in section 1 stores the class label in the 'target'
# column, so that is the label to fit (a 'mode' column is not created there).
# '__weight__' holds one constant value per class; it must be declared as the
# sample-weight column, otherwise it is treated as an input feature and leaks
# the label to the classifier.
ag = TabularPredictor(label='target',
                      sample_weight='__weight__',
                      eval_metric='f1_macro') # many more configuration
# Fit for at most 10 minutes; the held-out validation_data is not passed as
# tuning data and stays available for the evaluation cells below.
predictor = ag.fit(train_data=train_data, #tuning_data=validation_data,
                   presets='good_quality', save_bag_folds=True,
                   excluded_model_types=['FASTAI'],time_limit=10*60,
                   hyperparameters=hyperpar,
                   hyperparameter_tune_kwargs=hyperpar_tuner)

No path specified. Models will be saved in: "AutogluonModels/ag-20241127_093018"
Presets specified: ['good_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20241127_093018/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.
Spend 287 seconds

# 3. Load and inspect model

In [6]:
# Load a previously trained predictor from its output directory.
saved_model_dir = "../AutogluonModels/ag-20241203_203215"
predictor = TabularPredictor.load(saved_model_dir)

In [7]:
# Show all trained models scored on the validation data, with an additional
# 'roc_auc_ovo_macro' column next to the default metric.
additional_metrics = ['roc_auc_ovo_macro']
predictor.leaderboard(validation_data, extra_metrics=additional_metrics)

Unnamed: 0,model,score_test,roc_auc_ovo_macro,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM,0.751681,0.956966,0.636566,f1_macro,3.089117,3.00395,3609.481638,3.089117,3.00395,3609.481638,1,True,2
1,WeightedEnsemble_L2,0.710542,0.93004,0.660752,f1_macro,5.678794,3.772579,9999.670002,0.037952,0.042799,0.262064,2,True,5
2,NeuralNetTorch,0.657478,0.891127,0.650468,f1_macro,1.963876,0.488263,6114.567853,1.963876,0.488263,6114.567853,1,True,3
3,LightGBMXT,0.631751,0.900392,0.615657,f1_macro,0.481176,0.035729,61.705243,0.481176,0.035729,61.705243,1,True,1
4,LightGBMLarge,0.590424,0.873907,0.568789,f1_macro,0.106673,0.201838,213.653203,0.106673,0.201838,213.653203,1,True,4


In [8]:
# Name of the model the predictor uses by default.
predictor.model_best

'WeightedEnsemble_L2'

In [10]:
# Feature importance of the 'NeuralNetTorch' model, evaluated on the validation data.
# NOTE(review): the importance is computed for NeuralNetTorch, not for predictor.model_best — confirm this is the intended model.
predictor.feature_importance(validation_data, model='NeuralNetTorch')

These features in provided data are not utilized by the predictor and will be ignored: ['__weight__']


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
B0_roeDeltae_my_mask,0.098912,0.005242,9.427916e-07,5,0.109704,0.088119
B0_KSFWV3,0.091661,0.003121,1.611075e-07,5,0.098088,0.085234
B0_KSFWV8,0.075848,0.010736,4.690689e-05,5,0.097953,0.053743
B0_KSFWV14,0.071542,0.004668,2.163491e-06,5,0.081154,0.061929
B0_R2,0.067793,0.004615,2.561987e-06,5,0.077296,0.05829
B0_roeMbc_my_mask,0.057597,0.00921,7.584488e-05,5,0.076561,0.038634
B0_vtxDDSig,0.046862,0.008282,0.0001123283,5,0.063914,0.02981
D_A1FflightDistanceSig_IP,0.036054,0.005973,8.71666e-05,5,0.048353,0.023756
D_daughterInvM_1_2,0.027804,0.008988,0.001145921,5,0.04631,0.009298
D_vtxReChi2,0.02569,0.006908,0.0005711935,5,0.039913,0.011466


# 4. Apply model

In [51]:
# Per-class probabilities for the validation data. The predictor labels its
# output columns with integer class codes; give them readable names.
class_prob_names = {
    0: 'fakeTracks_prob',
    1: 'fakeD_prob',
    2: 'fakeL_prob',
    3: 'continuum_prob',
    4: 'combinatorial_prob',
    5: 'singleBbkg_prob',
    8: 'sig_prob',
}
pred = predictor.predict_proba(validation_data).rename(columns=class_prob_names)

In [56]:
# Display the table of per-class probabilities (one row per validation event).
pred

Unnamed: 0,fakeTracks_prob,fakeD_prob,fakeL_prob,continuum_prob,combinatorial_prob,singleBbkg_prob,sig_prob
1,0.006789,0.777897,0.024496,0.010685,0.114710,0.030317,0.035106
4,0.006194,0.145342,0.035548,0.035458,0.249365,0.071790,0.456304
15,0.012899,0.885186,0.016199,0.047766,0.017444,0.003973,0.016532
22,0.012091,0.316573,0.067048,0.049029,0.398908,0.061314,0.095037
26,0.004274,0.388828,0.011429,0.521257,0.043394,0.003799,0.027018
...,...,...,...,...,...,...,...
729913,0.018933,0.520465,0.029479,0.418423,0.006764,0.002547,0.003390
729920,0.012418,0.867310,0.023997,0.008962,0.039717,0.043069,0.004527
729924,0.013535,0.171850,0.045695,0.112353,0.564511,0.053809,0.038247
729925,0.013976,0.809941,0.024437,0.007302,0.080575,0.060410,0.003358


In [59]:
# True class composition of the validation sample.
# Read the 'mode' column from validation_data: df_pred is only built in the
# next cell (validation_data with the probability columns appended), so using
# it here fails on a fresh kernel, while the 'mode' values are identical.
validation_data['mode'].value_counts()

mode
1    84008
4    18590
3    13926
8    11672
5     7330
2     5431
0     5030
Name: count, dtype: int64

In [60]:
# Attach the predicted probabilities to the validation events, then show the
# true class composition of the events with a signal probability above 0.4.
df_pred = pd.concat([validation_data, pred], axis=1)
signal_like = df_pred[df_pred['sig_prob'] > 0.4]
signal_like['mode'].value_counts()

mode
8    4690
4     905
1     799
5     455
3     339
2     300
0     101
Name: count, dtype: int64