In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

In [None]:
# Read the training and test samples from CSV files in the working directory.
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Name of the regression target column and of the input feature columns.
column_y = 'el_erawOverEtrue'
columns_X = [
    'el_rawcl_Es0',
    'el_rawcl_Es1',
    'el_rawcl_Es2',
    'el_rawcl_Es3',
    'el_rawcl_E',
    'el_cl_aeta',
    'el_f0',
]

# Normalization is a non-trainable layer: adapt() reads all the training
# features to compute their mean and variance, so it must run before any
# training and may take a while on a large sample.
normalizer = tf.keras.layers.Normalization()
normalizer.adapt(df_train[columns_X].to_numpy())

# Show the statistics computed by adapt(): mean first, then variance.
for statistic in (normalizer.mean, normalizer.variance):
    print(statistic.numpy())

In [None]:
# Sanity check of the adapted normalizer: run the training features through it
# once and print the per-feature mean and variance of the result (expected to
# be close to 0 and 1 respectively if adapt() did its job).
# The forward pass is computed a single time and reused for both statistics.
features_normalized = normalizer(df_train[columns_X].values)
print(tf.math.reduce_mean(features_normalized, axis=0))
print(tf.math.reduce_variance(features_normalized, axis=0))


In [None]:
# Linear-regression baseline: the shared normalizer followed by one Dense unit.
model_linear_regression = tf.keras.Sequential()
model_linear_regression.add(normalizer)
model_linear_regression.add(tf.keras.layers.Dense(units=1))

model_linear_regression.compile(
    loss='mean_absolute_error',
    optimizer=tf.optimizers.Adam(learning_rate=0.1),
)

# validation_split=0.2: validation results are calculated on 20% of the
# training data.
history = model_linear_regression.fit(
    x=df_train[columns_X].values,
    y=df_train[column_y].values,
    validation_split=0.2,
    epochs=5,
)

In [None]:
def plot_loss(history, ylabel='Loss'):
  """Plot the training and validation loss against the epoch index.

  Args:
    history: object returned by ``fit``; its ``history`` dict must contain
      the 'loss' and 'val_loss' series.
    ylabel: text for the y axis. Defaults to a generic 'Loss' so the helper
      can be reused for models compiled with different losses.
  """
  plt.plot(history.history['loss'], label='loss')
  plt.plot(history.history['val_loss'], label='val_loss')
  plt.xlabel('Epoch')
  plt.ylabel(ylabel)
  plt.legend()
  plt.grid(True)

# The linear model was compiled with loss='mean_absolute_error'.
plot_loss(history, ylabel='Mean absolute error')

In [None]:
# Print the fitted coefficient attached to each input feature, then the bias.
# get_weights() returns the Dense layer parameters as [kernel, bias] arrays.
dense_kernel, dense_bias = model_linear_regression.layers[1].get_weights()
for col, coefficient in zip(columns_X, dense_kernel[:, 0]):
    print("{:<20s} : {:+.3f}".format(col, coefficient))
print("bias                 : {:+.3f}".format(dense_bias[0]))

In [None]:
# The model has a single output unit, so keep only that column of the
# prediction matrix as a flat array.
predictions_linear = model_linear_regression.predict(df_test[columns_X])
yhat_linear_regression = predictions_linear[:, 0]

In [None]:
# Replace the Dense layer parameters of the linear model with a constant
# predictor: all seven feature coefficients are zero and the bias is 0.89.
# NOTE(review): this overwrites the fitted weights of model_linear_regression;
# 0.89 is presumably close to the median of the target - confirm.
constant_kernel = np.zeros((7, 1), dtype=np.float32)
constant_bias = np.array([0.89], dtype=np.float32)
xx = model_linear_regression.layers[1]
xx.set_weights([constant_kernel, constant_bias])

In [None]:
# Median of the regression target (column_y == 'el_erawOverEtrue') on the test sample.
df_test[column_y].median()

In [None]:
# Compare el_rawcl_E / el_truth_E before and after dividing the raw energy by
# the linear-model prediction.
fig, ax = plt.subplots()
xspace = np.linspace(0.5, 1.2, 100)
ratio_raw = df_test['el_rawcl_E'] / df_test['el_truth_E']
ratio_linear = df_test['el_rawcl_E'] / yhat_linear_regression / df_test['el_truth_E']
# histtype='step' draws outlines only, so the second histogram does not hide
# the first one (same style as the later model comparison plot).
ax.hist(ratio_raw, bins=xspace, label='raw', histtype='step')
ax.hist(ratio_linear, bins=xspace, label='linear model', histtype='step')
ax.set_xlabel('E / E_true')
ax.set_ylabel('Entries')
ax.legend(loc=0)
plt.show()

In [None]:
# Small fully connected network: the shared normalizer, two hidden layers of
# 64 ReLU units and a single linear output unit.
model = tf.keras.Sequential([
    # shape is a tuple with one entry per non-batch dimension; a bare int is
    # not accepted by every Keras version.
    tf.keras.Input(shape=(len(columns_X),)),
    normalizer,
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1),
])

tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Stop training once val_loss has gone 3 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss')

model.compile(
    loss='mean_absolute_percentage_error',
    optimizer=tf.optimizers.Adam(learning_rate=0.001),
)

# The network is fit directly on el_truth_E, with 20% of the training data
# held out for validation.
history = model.fit(
    x=df_train[columns_X].values,
    y=df_train['el_truth_E'].values,
    batch_size=1024,
    epochs=20,
    validation_split=0.2,
    callbacks=[callback],
    verbose=True,
)

In [None]:
# Training and validation loss per epoch for the most recent fit stored in
# `history` (compiled above with loss='mean_absolute_percentage_error').
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
# Label the axes so the figure can be read on its own.
plt.xlabel('Epoch')
plt.ylabel('Mean absolute percentage error')
plt.legend()

In [None]:
# The network has a single output unit, so keep only that column of the
# prediction matrix as a flat array.
predictions_model = model.predict(df_test[columns_X])
yhat = predictions_model[:, 0]

In [None]:
# Overlay, as outline histograms on shared bins, the ratio to el_truth_E of
# the raw cluster energy and of the network prediction.
xspace = np.linspace(0.5, 1.2, 100)
fig, ax = plt.subplots()
ratio_by_label = {
    'raw': df_test['el_rawcl_E'] / df_test['el_truth_E'],
    'model': yhat / df_test['el_truth_E'],
}
for label, ratio in ratio_by_label.items():
    ax.hist(ratio, bins=xspace, label=label, histtype='step')
ax.legend(loc=0)
plt.show()