# 1. Import Ntuple

In [1]:
%load_ext autoreload
%autoreload 2

import uproot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from tqdm.auto import tqdm
import sys
sys.path.append('/home/belle/zhangboy/inclusive_R_D/')
import utilities as util
# Use matplotlib's "tab20" palette as the default colour cycle for every axes.
plt.rc("axes", prop_cycle=plt.cycler(color=plt.cm.tab20.colors))

# Variable lists are maintained in the shared `utilities` module:
#   columns            -> used below as the branch filter when reading the data ntuple
#   training_variables -> passed to util.apply_mva_bcs
columns = util.all_relevant_variables
training_variables = util.training_variables

In [2]:
# 4S data vs. MC.
# Regions: p_l > 2.4; q^2 < 3; 4.5 < roeMbc < 5.05; 2 < roeDeltaE < 5; deltaE

# --- Generic MC: every ROOT file in the control directory, tree "B0" ---
# No cut and no branch filter are applied here (all branches are read).
# Disabled options kept for reference:
#   cut = '(D_M>1.855) & (D_M<1.885)'
#   filter_branch=lambda branch: branch.name in columns
MC_4S = uproot.concatenate(
    ['../../Samples/Generic_MC15ri/e_channel/MC15ri_local_200fb_control/*.root:B0'],
    library="np",
)

# --- Data: single ROOT file, tree "B0" ---
# Only branches whose name appears in `columns` are read.
# Disabled option kept for reference:
#   cut = '(D_M>1.855) & (D_M<1.885)'
data_4S = uproot.concatenate(
    ['../../Samples/Data/e_channel/proc13_4S_control_quaxo_1.root:B0'],
    library="np",
    filter_branch=lambda br: br.name in columns,
)

# Wrap the uproot results in DataFrames for the later eval / selection steps.
df_data_4S = pd.DataFrame(data_4S)
df_mc_4S = pd.DataFrame(MC_4S)

# Selection strings on the BDT output columns; passed to util.apply_mva_bcs below.
BDT_no_sig = 'signal_prob<0.3'
BDT_sig = 'signal_prob==largest_prob and signal_prob>0.8 and continuum_prob<0.04 and fakeD_prob<0.05'
BDT_continuum = 'signal_prob<0.3 and continuum_prob>0.8 and fakeD_prob<0.05'

In [12]:
# Add derived columns to both the MC and the data frame.
# The frames are modified in place, so later cells see the new columns
# directly on df_mc_4S / df_data_4S. Re-running this cell is harmless: the
# columns are simply recomputed from the same inputs.
for df in [df_mc_4S, df_data_4S]:
    # df.eval(f'cos_D_l = (D_px*ell_px + D_py*ell_py + D_pz*ell_pz)/(D_p*ell_p)', inplace=True)
    # Sum of the B0_vtxReChi2 and D_vtxReChi2 columns.
    df.eval('B_D_ReChi2 = B0_vtxReChi2 + D_vtxReChi2', inplace=True)
    # Sum of the D_CMS_p and ell_CMS_p columns; plotted later as 'p_D_l'.
    df.eval('p_D_l = D_CMS_p + ell_CMS_p', inplace=True)

In [25]:
# Load the MVA and apply it, once per BDT selection string, to MC and data.
# `lgb` is not referenced in the visible cells; the import is kept as in the
# original cell (NOTE(review): presumably needed by the model loading inside
# `utilities` -- confirm, then move it to the import cell or drop it).
import lightgbm as lgb

# Each pair is evaluated MC first, then data, exactly as in the explicit
# one-call-per-line form.

# Continuum-enriched selection.
df_mc_4S_BDT_conti, df_data_4S_BDT_conti = (
    util.apply_mva_bcs(frame, training_variables, BDT_continuum)
    for frame in (df_mc_4S, df_data_4S)
)

# Signal-like selection.
df_mc_4S_BDT_sig, df_data_4S_BDT_sig = (
    util.apply_mva_bcs(frame, training_variables, BDT_sig)
    for frame in (df_mc_4S, df_data_4S)
)

# Signal-depleted selection (signal_prob < 0.3).
df_mc_4S_BDT_no_sig, df_data_4S_BDT_no_sig = (
    util.apply_mva_bcs(frame, training_variables, BDT_no_sig)
    for frame in (df_mc_4S, df_data_4S)
)

bkg_FakeD 78329
bkg_TDFl 2629
bkg_fakeTracks 1814
bkg_continuum 3110
bkg_combinatorial 8185
bkg_singleBbkg 4584
bkg_other_TDTl 0
$D\tau\nu$ 1574
$D^\ast\tau\nu$ 1072
$D\ell\nu$ 22958
$D^\ast\ell\nu$ 18430
$D^{\ast\ast}\tau\nu$ 663
$D^{\ast\ast}\ell\nu$ 11162
$D\ell\nu$_gap 2830
bkg_other_signal 0


# Data vs. MC BKG shape

In [None]:
# Work with the signal-depleted (BDT_no_sig) frames built by util.apply_mva_bcs.
# The frames are called df_*_4S_BDT_no_sig in this notebook; the shorter
# df_*_BDT_no_sig names are never defined and raise NameError on a fresh kernel.

# Names of the data columns that contain no NaN values.
columns_without_nan = df_data_4S_BDT_no_sig.columns[~df_data_4S_BDT_no_sig.isna().any()].tolist()

# Split the MC into named component samples ('e' channel) and build the
# plotting helper from those samples plus the data frame.
samples = util.get_dataframe_samples_new(df_mc_4S_BDT_no_sig, 'e', template=False)
mpl = util.mpl(samples, df_data_4S_BDT_no_sig)
# df_conti_conti = samples['bkg_continuum']

# Print the number of candidates in each MC component.
for name, df in samples.items():
    print(name, len(df))

In [None]:
# Stacked data/MC comparison of fakeD_prob, with no additional cut.
# 100 evenly spaced bin edges on [0, 1].
b1 = np.linspace(start=0, stop=1, num=100)
data_hist_all, mc_hist_all = mpl.plot_data_mc_stacked(
    variable='fakeD_prob',
    bins=b1,
    cut=None,
    scale=[1, 187/200],  # NOTE(review): presumably per-sample scale factors -- confirm in utilities
    correction=False,
    mask=[],
    ratio=False,
    figsize=(10, 6),
)

In [None]:
# Stacked data/MC comparison of D_M for candidates with fakeD_prob > 0.1.
# 50 evenly spaced bin edges on [1.79, 1.95].
b1 = np.linspace(start=1.79, stop=1.95, num=50)
data_hist_all, mc_hist_all = mpl.plot_data_mc_stacked(
    variable='D_M',
    bins=b1,
    cut='fakeD_prob>0.1',
    scale=[1, 187/200],  # NOTE(review): presumably per-sample scale factors -- confirm in utilities
    correction=False,
    mask=[],
    ratio=False,
    figsize=(10, 6),
)

## Consistent discrepancy: sidebands vs. signal region

In [None]:
# Check the shapes of the fitting variables in different regions and look for shifts

In [None]:
# D_M sidebands (D_M < 1.83 or D_M > 1.91) with fakeD_prob > 0.1:
# stacked data/MC comparison of B0_CMS3_weMissM2.
# 50 evenly spaced bin edges on [-10, 10].
b1 = np.linspace(start=-10, stop=10, num=50)
a, b = mpl.plot_data_mc_stacked(
    variable='B0_CMS3_weMissM2',
    bins=b1,
    cut='fakeD_prob>0.1 and (D_M<1.83 or D_M>1.91)',
    scale=[1, 187/200],
    correction=False,
    mask=[],
    ratio=False,
    figsize=(10, 6),
)

In [None]:
# D_M signal window (1.83 < D_M < 1.91) with fakeD_prob > 0.1:
# stacked data/MC comparison of B0_CMS3_weMissM2.
# 50 evenly spaced bin edges on [-10, 10].
b1 = np.linspace(start=-10, stop=10, num=50)
a, b = mpl.plot_data_mc_stacked(
    variable='B0_CMS3_weMissM2',
    bins=b1,
    cut='fakeD_prob>0.1 and D_M>1.83 and D_M<1.91',
    scale=[1, 187/200],
    correction=False,
    mask=[],
    ratio=False,
    figsize=(10, 6),
)

In [None]:
# D_M sidebands (D_M < 1.83 or D_M > 1.91) with fakeD_prob > 0.1:
# stacked data/MC comparison of p_D_l (= D_CMS_p + ell_CMS_p).
# 50 evenly spaced bin edges on [0.2, 5].
b2 = np.linspace(start=0.2, stop=5, num=50)
a, b = mpl.plot_data_mc_stacked(
    variable='p_D_l',
    bins=b2,
    cut='fakeD_prob>0.1 and (D_M<1.83 or D_M>1.91)',
    scale=[1, 187/200],
    correction=False,
    mask=[],
    ratio=False,
    figsize=(10, 6),
)

In [None]:
# D_M signal window (1.83 < D_M < 1.91) with fakeD_prob > 0.1:
# stacked data/MC comparison of p_D_l (= D_CMS_p + ell_CMS_p).
# 50 evenly spaced bin edges on [0.2, 5].
b2 = np.linspace(start=0.2, stop=5, num=50)
a, b = mpl.plot_data_mc_stacked(
    variable='p_D_l',
    bins=b2,
    cut='fakeD_prob>0.1 and D_M>1.83 and D_M<1.91',
    scale=[1, 187/200],
    correction=False,
    mask=[],
    ratio=False,
    figsize=(10, 6),
)