In [None]:
import numpy as np
from scipy import stats
import tensorflow as tf
import tensorflow_probability as tfp
from matplotlib import pyplot as plt
tfd = tfp.distributions

## Model the variables with similar, but different, models for data and MC
Instead of reading the data from a simulation, let's generate them.

In [None]:
# Generating model for the "data" sample.
# var1: mixture of two zero-centred Gaussians (40% narrow, 60% wide).
data_model_var1 = tfd.Mixture(
    cat=tfd.Categorical(probs=[0.4, 0.6]),
    components=[
        tfd.Normal(loc=0., scale=0.1),
        tfd.Normal(loc=0., scale=0.8),
    ],
)
# var2: exponential with rate 0.2.
data_model_var2 = tfd.Exponential(rate=0.2)

# Generating model for the "MC" sample: same functional forms as the data,
# but with different fractions, a shifted narrow component, a wider broad
# component and a larger exponential rate.
mc_model_var1 = tfd.Mixture(
    cat=tfd.Categorical(probs=[0.3, 0.7]),
    components=[
        tfd.Normal(loc=0.1, scale=0.1),
        tfd.Normal(loc=0., scale=1),
    ],
)
mc_model_var2 = tfd.Exponential(rate=0.4)

## Generate the random data from the model
The generated sample has 3 columns: the first two are the observed variables, the third is a flag (0 for data, 1 for MC).

In [None]:
# Number of events generated for each of the two samples (data and MC).
NSAMPLE = 100000

# Fix the random seeds so that a fresh "Restart & Run All" gives repeatable
# results: the TensorFlow seed drives the sampling below, the NumPy seed
# drives the np.random.shuffle used later when data and MC are mixed.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)

data_var1 = data_model_var1.sample(NSAMPLE).numpy()
data_var2 = data_model_var2.sample(NSAMPLE).numpy()
# linearly mix the two variables to create correlations
# (tuple assignment: both right-hand sides use the un-mixed values)
data_var1, data_var2 = 0.7 * data_var1 + 0.3 * data_var2, 0.3 * data_var1 + 0.7 * data_var2

mc_var1 = mc_model_var1.sample(NSAMPLE).numpy()
mc_var2 = mc_model_var2.sample(NSAMPLE).numpy()
# apply a smaller mixing to the MC, so that its correlation differs from the data
mc_var1, mc_var2 = 0.9 * mc_var1 + 0.1 * mc_var2, 0.1 * mc_var1 + 0.9 * mc_var2

# merge the variables in a single (NSAMPLE, 3) array: one row per event,
# columns are var1, var2 and a flag as last element (0/1 for data/mc)
data = np.vstack([data_var1, data_var2, np.zeros(NSAMPLE)]).T
mc = np.vstack([mc_var1, mc_var2, np.ones(NSAMPLE)]).T

In [None]:
# Display the data array: one row per event, columns are var1, var2 and the data/MC flag.
data

In [None]:
# create a sample mixing and shuffling data and mc
sample = np.vstack([data, mc])
np.random.shuffle(sample)  # shuffles the rows in place

# divide in train and test 50/50
# `sample` stacks data and mc, so the split point must be half of its own
# length (using NSAMPLE / 2 would put only a quarter of the rows in the test set)
split_index = len(sample) // 2
training_sample = sample[split_index:]
testing_sample = sample[:split_index]

# features are all the columns but the last one, which is the data/mc flag
X_train = training_sample[:, :-1]
Y_train = training_sample[:, -1]

## Train the classifier by minimizing the cross entropy

In [None]:
# Small fully connected network: two hidden ReLU layers and a single sigmoid
# output, trained to predict the flag stored in Y_train (0 for data, 1 for MC).
classifier = tf.keras.Sequential([
    # the input shape is given as a tuple (number of features,), which is the
    # documented form of the `shape` argument
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# No optimizer is specified, so the Keras default is used.
# `metrics` is passed as a list, which is the documented form of the argument.
classifier.compile(loss='binary_crossentropy', metrics=['accuracy'])
classifier.fit(X_train, Y_train, epochs=5)

## Plot the variables before the correction

In [None]:
# Split the test sample by origin using the flag in the last column
# (1 for MC, 0 for data).
origin_flag = testing_sample[:, -1]
testing_sample_mc = testing_sample[origin_flag == 1]
testing_sample_data = testing_sample[origin_flag == 0]

fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(13, 6))
step_style = dict(histtype='step', lw=2)

# Left panel: var1, MC drawn first and data second.
bins = np.linspace(-2, 4, 100)
axs[0].hist(testing_sample_mc[:, 0], bins=bins, label='MC', **step_style)
axs[0].hist(testing_sample_data[:, 0], bins=bins, label='data', **step_style)
axs[0].legend(loc=0, fontsize=14)
axs[0].set_title('var1', fontsize=14)

# Right panel: var2, same drawing order (no legend on this panel).
bins = np.linspace(-1, 30, 100)
axs[1].hist(testing_sample_mc[:, 1], bins=bins, **step_style)
axs[1].hist(testing_sample_data[:, 1], bins=bins, **step_style)
axs[1].set_title('var2', fontsize=14)


plt.show()

## Compute the weight on the test sample (the value is close to 1)

The weight is just the ratio of the two posterior probabilities:

$$w = \frac{P[data|x]}{P[MC|x]}$$

Usually this is estimated with a ND-histogram, or just ignoring correlation using a set of 1D histograms.

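A perfect classifier trained with target 1 for data returns $y=P[data|x]$. Of course $P[MC|x] = 1 - P[data|x]$. So:

$$w = \frac{y}{1 - y}$$

In this notebook the targets are the other way round (0 for data, 1 for MC), so the classifier output is $y=P[MC|x]$ and $y / (1 - y) = 1 / w$. This is why the inverse of the quantity computed below is used when the weights are applied to the MC.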

Note that if we have data very similar to MC we will get a very poor classifier, which will output 0.5, and so the weight will be 1.

An additional thing to do (not done here), is to calibrate the classifier.

In [None]:
# Classifier output for the MC events of the test sample (features only, the
# flag column is dropped). The network has a single output unit, so the
# prediction is flattened to a 1D array with one value per event; this keeps
# it aligned with the 1D feature columns it is later used with.
weights = classifier.predict(testing_sample_mc[:, :-1]).ravel()
# The classifier was trained with target 0 for data and 1 for MC, so this
# ratio is output / (1 - output), i.e. the MC-over-data odds. The cells below
# apply its inverse (1 / weights) to the MC.
weights = weights / (1 - weights)

## Weight value distribution

In [None]:
# Histogram of the per-event ratio stored in `weights`, in 99 equal-width
# bins between 0 and 20; values outside this range are not drawn.
fig, ax = plt.subplots()
weight_bins = np.linspace(0, 20, 100)
ax.hist(weights, bins=weight_bins)
plt.show()

## Apply the weights and plot the distributions

In [None]:
# For each variable overlay, in this order: the MC with the per-event weight
# 1 / weights applied, the data, and the unweighted MC (dashed) as reference.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(13, 6))
thick_step = dict(histtype='step', lw=2)
dashed_step = dict(histtype='step', ls='--')

# Left panel: var1 (with legend).
bins = np.linspace(-2, 4, 100)
axs[0].hist(testing_sample_mc[:, 0], weights=1/weights, bins=bins, label='MC reweighted', **thick_step)
axs[0].hist(testing_sample_data[:, 0], bins=bins, label='data', **thick_step)
axs[0].hist(testing_sample_mc[:, 0], bins=bins, label='MC', **dashed_step)
axs[0].legend(loc=0, fontsize=14)
axs[0].set_title('var1', fontsize=14)

# Right panel: var2 (same curves, no legend drawn).
bins = np.linspace(-1, 30, 100)
axs[1].hist(testing_sample_mc[:, 1], weights=1/weights, bins=bins, **thick_step)
axs[1].hist(testing_sample_data[:, 1], bins=bins, **thick_step)
axs[1].hist(testing_sample_mc[:, 1], bins=bins, label='MC', **dashed_step)
axs[1].set_title('var2', fontsize=14)

plt.show()

## Look at the correlation
The reweighted MC reproduces the data correlation!

In [None]:
# 2D histograms of (var1, var2) with a common binning: data, unweighted MC
# and MC with the per-event weight 1 / weights applied.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(16, 5))
bins = [np.linspace(-2, 2, 50), np.linspace(0., 10, 50)]

# (title, events, per-event weights) for each panel, from left to right
panels = [
    ('data', testing_sample_data, None),
    ('MC', testing_sample_mc, None),
    ('MC reweighted', testing_sample_mc, 1/np.squeeze(weights)),
]
for panel_ax, (panel_title, events, event_weights) in zip(axs, panels):
    panel_ax.hist2d(events[:, 0], events[:, 1], bins=bins, weights=event_weights)
    panel_ax.set_title(panel_title, fontsize=14)
plt.show()

## Compare effective statistics with size of the sample
We lose a lot of effective statistics!

In [None]:
# Per-event weights actually applied to the MC (inverse of `weights`).
w = 1 / np.squeeze(weights)
# Effective number of events of the weighted sample: (sum of w)^2 / (sum of w^2).
sum_w = np.sum(w)
sum_w2 = np.sum(w * w)
sum_w ** 2 / sum_w2

In [None]:
# Number of MC test events, to be compared with the effective statistics above.
len(w)