## Predicting uncertainty with (deep) neural networks

### Selected Distribution - Standard Exponential

Hypothesis: 
    - Do we obtain better results if we learn lambda (the rate parameter of the exponential distribution) instead of the variance (of a Gaussian)?

https://math.stackexchange.com/questions/101481/calculating-maximum-likelihood-estimation-of-the-exponential-distribution-and-pr

**NOTE**: This notebook has been adapted from the original version: https://github.com/sthorn/deep-learning-explorations/blob/master/predicting-uncertainty.ipynb

### Imports 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import statistics as stat

%matplotlib inline

### Generate some random data

In [None]:
# Synthetic regression data: a noise-free line y_1 = 2x + 1 plus noise whose
# scale oscillates with x.
n_samples = 10000
np.random.seed(2019)  # fixed seed so every run draws the same samples

# Inputs are drawn first, then the noise, both from the global NumPy RNG.
x = np.random.standard_exponential(size=n_samples)

# Noise: a standard-exponential draw scaled by 0.4 * sin(4 * pi * x), so its
# magnitude and sign change along x.
sin_ = np.sin(x * 4 * np.pi) * 0.4
eps = sin_ * np.random.standard_exponential(size=n_samples)

y_1 = 1 + 2 * x  # noise-free linear response
y = eps + y_1    # observed response = line + noise

### Describe the generated data

In [None]:
# Plot the synthetic data: the noise-free line and the noisy observations.
plt.figure(figsize=(14,6))
plt.title('Generated synthetic data')

# Two series only: the underlying line (black) and the noisy samples (green).
plt.plot(x, y_1, '.', alpha=0.2, color='black')
plt.plot(x, y, '.', alpha=0.2, color='green')

# One legend label per plotted series. The former extra mu and +/- sigma
# labels had no matching artist in this figure, so they are dropped.
plt.legend([r'data without noise', r'data with noise'])
plt.xlabel(r'$x$');
plt.ylabel(r'$y$');

### Prepare data for training

In [None]:
# Give y a second, all-zero column that stands in for sigma, so the target
# array has two columns: [y, sigma].
sigma = np.zeros(y.size, dtype=np.float64)
y = np.stack((y, sigma), axis=1)

In [None]:
# PyTorch expects a 2-D input, so turn x into a single-column array with
# one row per sample.
x = x.reshape(-1, 1)

In [None]:
# Hold out the first n_validation rows for validation; train on the rest.
n_validation = 500
x_val = x[:n_validation]
y_val = y[:n_validation]
x = x[n_validation:]
y = y[n_validation:]

In [None]:
# Plot the training and validation targets.
plt.figure(figsize=(14,6))
plt.title('Training and validation sets')
# y and y_val carry two columns at this point (the response and the dummy
# sigma column), so select column 0; plotting the full arrays would also draw
# the all-zero sigma column as a second series.
plt.plot(x, y[:, 0], '.', color='green')
plt.plot(x_val, y_val[:, 0], '.', color='purple')
plt.xlabel(r'$x$');
plt.ylabel(r'$y$');

In [None]:
# Convert x and y to float32 PyTorch tensors, on the GPU when one is
# available and on the CPU otherwise.
# `dtype` keeps its original name because a later cell reuses it for the
# validation inputs.
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor
# Tensors built from NumPy arrays do not track gradients by default, so the
# deprecated Variable wrapper (and requires_grad=False) is not needed.
x_t = torch.from_numpy(x).type(dtype)
y_t = torch.from_numpy(y).type(dtype)

In [None]:
# Sanity check, shown as the cell output: x_t should have shape (n, 1) and
# y_t shape (n, 2), where n is the number of training samples.
x_t.shape, y_t.shape

## Experiment 

### Create a simple network 

In [None]:
# Simple fully connected network: one input (x), one hidden ReLU layer, and
# two outputs (predicted y and the predicted sigma/variance term).
n_inputs = 1
n_outputs = 2
n_hidden = 1000

# Use the GPU when present and fall back to the CPU otherwise, instead of
# failing on an unconditional .cuda() call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_1 = torch.nn.Sequential(torch.nn.Linear(n_inputs, n_hidden),
                            torch.nn.ReLU(),
                            torch.nn.Linear(n_hidden, n_outputs)
                           ).to(device)

In [None]:
# Optimise every parameter of model_1 with Adam at a small fixed step size.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model_1.parameters(), lr=learning_rate)

### Custom loss function
This is where the magic happens.

In [None]:
def loss_variance_error(input, target):
    """Return the MSE loss for a joint (mean, variance) prediction.

    The true variance is not known in advance, so it is estimated on every
    call as the squared error of the current mean prediction,
    ``(input[:, 0] - target[:, 0]) ** 2``, and used as the regression target
    for the variance output.

    Column layout:
        input[:, 0]   predicted y
        input[:, 1]   predicted variance
        target[:, 0]  actual y
        target[:, 1]  placeholder; its incoming values are ignored

    Args:
        input: Network output tensor, indexed as described above.
        target: Target tensor with the same shape as ``input``. It is left
            unmodified; the estimated variance is written into a copy.

    Returns:
        The value of ``F.mse_loss`` between ``input`` and the completed
        target (default mean reduction over all elements).
    """
    # Work on a detached copy so the caller's tensor is never mutated and no
    # gradient can flow into the target.
    estimated = target.detach().clone()

    # The variance target is a constant for this step: detach the prediction
    # so the squared error is not differentiated through.
    estimated[:, 1] = (input[:, 0].detach() - estimated[:, 0]) ** 2

    return F.mse_loss(input, estimated)

### Training loop 
We train this simple model with batch size equal to the whole dataset.

In [None]:
# Full-batch training: each epoch runs one forward/backward pass over the
# whole training set followed by a single optimizer step.
n_epochs = 10000
for i in range(n_epochs):

    # Forward pass, then the custom loss on (prediction, target)
    y_pred = model_1(x_t)
    loss = loss_variance_error(y_pred, y_t)

    # Report progress every 500 epochs
    if i % 500 == 0:
        print(f'epoch: {i:4} loss: {loss.data.item():.3}')

    # Clear stale gradients, backpropagate, and apply the update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

### Get predictions for validation set 

In [None]:
# Predict y and the variance term for the validation set.
x_val_t = torch.from_numpy(x_val).type(dtype)

# Inference only: skip autograd bookkeeping instead of building a graph and
# stripping it afterwards with .data.
with torch.no_grad():
    pred_1 = model_1(x_val_t)

# Column 0 is the predicted response, column 1 the predicted variance term;
# both are moved to the CPU for the plotting and statistics cells.
y_pred_1 = pred_1[:,0].cpu()
sigma_pred_1 = pred_1[:,1].cpu()

### Plot results

In [None]:
# Plot training data, validation data, and validation predictions with
# +/- one predicted standard deviation as error bars.
fig, ax1 = plt.subplots(figsize=(14,6))

ax1.plot(x, y[:,0], '.', alpha=0.4, color='pink')
ax1.plot(x_val, y_val[:,0], '.', color='blue')
ax1.plot(x_val, y_pred_1, '.', color='purple')
# sigma_pred_1 is the network output trained against the squared error, i.e.
# a variance; take the square root so the bars match the +/- sigma label.
# Negative network outputs become NaN under sqrt.
ax1.errorbar(x_val, y_pred_1, yerr=sigma_pred_1.sqrt(), fmt='.k');

ax1.set_xlabel(r'$x$');
ax1.set_ylabel(r'$y$');
ax1.set_title('Validation set predictions');
# Raw string for the last label: '\p' and '\s' are invalid escape sequences
# in a normal string literal.
ax1.legend([r'Training', r'Validation', r'Prediction of y', r'Prediction of $\pm \sigma$'], loc='upper left', );

In [None]:
# Validation targets and predictions, each drawn with error bars equal to
# the square root of the predicted variance term.
fig, ax1 = plt.subplots(figsize=(14,6))
pred_std = sigma_pred_1.sqrt()

ax1.plot(x_val, y_val[:,0], '.', color='purple')
ax1.errorbar(x_val, y_val[:,0], yerr=pred_std, color='pink', fmt='.');
ax1.errorbar(x_val, y_pred_1, yerr=pred_std, color='green', fmt='.');

# Axis labels, title and legend go on the same (current) axes
ax1.set_xlabel(r'$x$');
ax1.set_ylabel(r'$y$');
ax1.set_title('Validation set predictions');
ax1.legend([r'Original data','Validation set predictions'], loc='upper left', );

### Is the predicted variance error distribution equal to the real variance error distribution?

#### Predicted variance error distribution

In [None]:
import seaborn as sns
# Histogram/density plot of the predicted variance term on the validation set.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
sns.distplot(sigma_pred_1)

#### Generated error distribution

In [None]:
# Reference sample: n_samples fresh draws from a standard exponential
# distribution. This is a new draw from the global RNG, not the eps array
# generated earlier.
sns.distplot(np.random.standard_exponential(size=n_samples))

#### Predicted y response variable distribution

In [None]:
# Distribution of the predicted responses on the validation set.
sns.distplot(y_pred_1)

#### Real y distribution (validation)

In [None]:
# Distribution of the true validation responses (column 0 of y_val).
sns.distplot(y_val[:,0])

#### Real y distribution (training)

In [None]:
# Distribution of the true training responses (column 0 of y).
sns.distplot(y[:,0])

#### Real variance error distribution (validation)

In [None]:
# Squared residual of each validation prediction, used as the empirical
# ("real") variance to compare against the predicted one.
residual_val = y_pred_1.numpy() - y_val[:, 0]
real_sigma = residual_val ** 2

In [None]:
# Distribution of the squared validation residuals.
sns.distplot(real_sigma)

#### Real variance error distribution (training)

In [None]:
# Squared residuals on the first 500 training samples.
# Predictions and targets must come from the same rows, so run the model on
# the matching training inputs rather than reusing y_pred_1, which holds the
# predictions for the validation inputs.
with torch.no_grad():
    y_pred_train = model_1(x_t[:500])[:, 0].cpu().numpy()
real_sigma_train = (y_pred_train - y[:500, 0])**2

In [None]:
# Distribution of the squared residuals computed for the training subset.
sns.distplot(real_sigma_train)

#### Plot the real variance error 

In [None]:
import math
# Validation targets and predictions, each drawn with error bars equal to
# the square root of the squared validation residual.
fig, ax1 = plt.subplots(figsize=(14,6))
real_std = np.sqrt(real_sigma)

ax1.plot(x_val, y_val[:,0], '.', color='purple')
ax1.errorbar(x_val, y_val[:,0], yerr=real_std, color='pink', fmt='.');
ax1.errorbar(x_val, y_pred_1, yerr=real_std, color='green', fmt='.');

# Title and legend go on the same (current) axes
ax1.set_title('Validation set predictions');
ax1.legend([r'Original data $\pm \sigma$ envelope','Validation set predictions'], loc='upper left');

### Are the two (real and predicted error variance) distributions the same?

#### Statistical tests to confirm the assumption extracted from the plots

In [None]:
from scipy import stats

#### Normality check with a probability plot and the Shapiro-Wilk test

In [None]:
# Probability plot of the squared validation residuals, drawn with the
# fitted line and its R value (probplot compares against a normal
# distribution unless told otherwise).
stats.probplot(real_sigma, fit=True, rvalue=True, plot=plt);

In [None]:
# Same probability plot for the predicted variance term.
stats.probplot(sigma_pred_1, fit=True, rvalue=True, plot=plt);

In [None]:
# Shapiro-Wilk normality test on the predicted variance term; the result is
# shown as the cell output.
stats.shapiro(sigma_pred_1)

Comparing the two distributions with Kolmogorov-Smirnov

In [None]:
# Two-sample Kolmogorov-Smirnov test between the predicted variance term and
# the squared validation residuals.
stats.ks_2samp(sigma_pred_1, real_sigma)

We can reject the null hypothesis: the two distributions are not identical.

In [None]:
import pickle

# Persist the validation inputs, targets and predictions to the working
# directory. Each file is opened in a with-block so its handle is flushed and
# closed right after the dump; bare open() calls would leave the handles open.
for artifact_path, artifact in (("x_val.p", x_val),
                                ("y_val.p", y_val),
                                ("y_pred_1.p", y_pred_1),
                                ("sigma_pred_1.p", sigma_pred_1)):
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

In [None]:
from utilities import validation_metrics

In [None]:
# Run the project's overall performance report on the validation set:
# inputs, true responses, predicted responses and predicted variance term.
# NOTE(review): the meaning of std_factor and the two flags is defined in
# utilities.validation_metrics, which is not visible here.
validation_metrics.overall_model_performance(
    x_val[:,0],
    y_val[:,0],
    y_pred_1,
    sigma_pred_1,
    std_factor=1/4,
    extreme_values_performance=True,
    display_plots=True,
)

#### Evaluating the results with the validation_metrics library

In [None]:
# Compare the validation results against an expected distribution, supplied
# as n_samples fresh draws from a standard exponential.
# NOTE(review): what tests_prior_beliefs computes is defined in
# utilities.validation_metrics, which is not visible here.
validation_metrics.tests_prior_beliefs(
    x_val[:,0],
    y_val[:,0],
    y_pred_1,
    sigma_pred_1,
    data_pdf_expected=np.random.standard_exponential(size=n_samples),
    name_pdf_expected='Exponential Standard',
)

In [None]:
# NOTE(review): `a` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — confirm where it should come from.
a

In [None]:
# Chi-square test with sigma_pred_1 as the observed values and `a` as the
# expected values.
# NOTE(review): `a` is not defined in the visible notebook — confirm its
# source before relying on this result.
stats.chisquare(sigma_pred_1, a)