# Model comparison / model selection
## using the Akaike Information Criterion and Bayesian Information Criterion

In [None]:
# import all packages we will need in this notebook
import pandas
import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.optimize

In [None]:
# Read in the data created for this example (".dat" is a generic extension; it is just a text file).
data_filename='https://raw.githubusercontent.com/uofscphysics/STEM_Python_Course/Summer2020/02_Week2/Data/1D_intro_examples.dat'
example_data_1D = pandas.read_csv(data_filename,sep=',',header=0)  # parsed as comma-separated (sep=','); the first line supplies the column names (header=0)
print(example_data_1D.head())

In [None]:
# Plot the data we read from file, with error bars (see Day 2).
plt.errorbar(example_data_1D['x'],  # 'x', 'y' and 'error' are the column names
             example_data_1D['y'], 
             yerr=example_data_1D['error'],  # yerr: size of the error bar in the y-direction
             fmt='.')  # fmt is the "format" string; '.' marks each datum with a point
plt.xlabel('Days since I left the honey jar out')  # x-axis label
plt.ylabel('Number of ants')  # y-axis label
plt.show()

In [None]:
#The data were generated with a simple quadratic equation:
#ax^2+bx+c. 
def modelA_quadratic(x, alpha, beta, gamma):
    """Evaluate the quadratic alpha*x**2 + beta*x + gamma at x.

    This is the model family that was used to generate the example data.
    x may be a scalar or anything that supports elementwise arithmetic.
    """
    quadratic_term = alpha * x**2
    linear_term = beta * x
    return quadratic_term + linear_term + gamma

def modelB_cubic(x, p0, p1, p2, p3):
    """Evaluate the cubic p0 + p1*x + p2*x**2 + p3*x**3 at x.

    x may be a scalar or anything that supports elementwise arithmetic.
    """
    linear_term = p1 * x
    quadratic_term = p2 * x**2
    cubic_term = p3 * x**3
    return p0 + linear_term + quadratic_term + cubic_term
    
def modelC_exponential(x, floor, scale, expfactor):
    """Evaluate the exponential model floor + scale * exp(expfactor * x) at x.

    floor is the additive offset, scale multiplies the exponential, and
    expfactor is the rate inside the exponent.
    """
    exponential_part = np.exp(expfactor * x)
    return floor + scale * exponential_part


In [None]:
def neg_ln_likelihoodA_quadratic(theta, args):
    """Negative log-likelihood of model A (the quadratic) given the data.

    theta holds the three model parameters [alpha, beta, gamma].
    args holds the observations as (x, y, yerr).

    The uncertainties are treated as gaussian and uncorrelated, so
    ln(likelihood) = -0.5 * chi2 up to a constant that does not depend
    on theta. The function returns -ln(likelihood), i.e. 0.5 * chi2.
    """
    x, y, yerr = args
    alpha, beta, gamma = theta

    residuals = y - modelA_quadratic(x, alpha, beta, gamma)
    weights = 1. / yerr**2
    chi2 = np.sum(residuals**2 * weights)
    return 0.5 * chi2

In [None]:
def neg_ln_likelihoodB_cubic(theta, args):
    """Negative log-likelihood of model B (the cubic) given the data.

    theta holds the four model parameters [p0, p1, p2, p3].
    args holds the observations as (x, y, yerr).

    The uncertainties are treated as gaussian, so the value returned
    is 0.5 * chi2, where chi2 compares the observations to model B.
    """
    x, y, yerr = args

    # Short-hand: *theta passes the entries of the list as separate
    # arguments. It is equivalent to writing
    #     p0, p1, p2, p3 = theta
    #     modelB_cubic(x, p0, p1, p2, p3)
    residuals = y - modelB_cubic(x, *theta)

    weights = 1. / yerr**2
    chi2 = np.sum(residuals**2 * weights)
    return 0.5 * chi2

In [None]:
def neg_ln_likelihoodC_exponential(theta, args):
    """Negative log-likelihood of model C (the exponential) given the data.

    theta holds the three model parameters [floor, scale, expfactor].
    args holds the observations as (x, y, yerr).

    The uncertainties are treated as gaussian, so the value returned
    is 0.5 * chi2, where chi2 compares the observations to model C.
    """
    x, y, yerr = args

    residuals = y - modelC_exponential(x, *theta)
    weights = 1. / yerr**2
    chi2 = np.sum(residuals**2 * weights)
    return 0.5 * chi2

Let's have a look at what these models look like, using some parameters that are in the right ballpark.

In [None]:
def plot_three_models(thetaA, thetaB, thetaC):
    """Draw the data with error bars and overlay models A, B and C.

    thetaA, thetaB and thetaC are the parameter sequences handed to
    modelA_quadratic, modelB_cubic and modelC_exponential respectively.
    The data come from the notebook-level example_data_1D table.
    """
    # The observations go on the axes first.
    plt.errorbar(example_data_1D['x'], example_data_1D['y'],
                 yerr=example_data_1D['error'], fmt='.')
    plt.xlabel('Days since I left the honey jar out')
    plt.ylabel('Number of ants')

    # All three curves are evaluated on the same x grid.
    x_grid = np.arange(-0.02, 10.1, 0.1)
    curves = (
        (modelA_quadratic, thetaA, 'b', 'A (quadratic)'),
        (modelB_cubic, thetaB, 'g', 'B (cubic)'),
        (modelC_exponential, thetaC, 'm', 'C (exponential)'),
    )
    for model, theta, line_color, legend_label in curves:
        plt.plot(x_grid, model(x_grid, *theta),
                 color=line_color, label=legend_label)

    plt.legend()
    plt.show()

In [None]:
# Hand-picked starting guesses for each model, just to see the curve shapes.
plot_three_models(
    thetaA=[50, -5, 1],
    thetaB=[20, 10, 0.7, 4],
    thetaC=[-100, 100, 0.3],
)

In [None]:
# Maximum likelihood estimation (equivalent to chi2 minimization): minimize
# each model's negative log-likelihood to find its best-fit parameters.
#
# NOTE: `args` is deliberately a list rather than a tuple. The
# neg_ln_likelihood* functions take the data as ONE argument and unpack
# (x, y, yerr) themselves; the calls rely on minimize() handing a
# non-tuple `args` over as that single extra argument.

maxlike_resultA = scipy.optimize.minimize(
    fun=neg_ln_likelihoodA_quadratic,
    x0=[20, -1, -1],
    args=[example_data_1D['x'],
          example_data_1D['y'],
          example_data_1D['error']],
    bounds=None,  # unbounded; a bounded alternative: [(-100,100),(-100,100),(0,100)]
)

maxlike_resultB = scipy.optimize.minimize(
    fun=neg_ln_likelihoodB_cubic,
    x0=[20, 10, 0.7, 0.3],
    args=[example_data_1D['x'],
          example_data_1D['y'],
          example_data_1D['error']],
    bounds=None,
)

maxlike_resultC = scipy.optimize.minimize(
    fun=neg_ln_likelihoodC_exponential,
    x0=[-100, 200, 1],
    args=[example_data_1D['x'],
          example_data_1D['y'],
          example_data_1D['error']],
    bounds=None,  # unbounded; same as [(-np.inf,np.inf)] * 3
)

In [None]:
# Best-fit parameter vectors for models A, B and C, one per line.
print(maxlike_resultA['x'],
      maxlike_resultB['x'],
      maxlike_resultC['x'],
      sep='\n')

In [None]:
# Overlay each model, evaluated at its best-fit parameters, on the data.
plot_three_models(
    thetaA=maxlike_resultA['x'],
    thetaB=maxlike_resultB['x'],
    thetaC=maxlike_resultC['x'],
)

# Model comparison :  the AIC and BIC 

The Akaike information criterion is defined as:

### AIC = 2 k - 2 ln(L)

It balances a model's ability to fit the data (measured by the maximum likelihood value L) against the number of parameters k that the model requires.  A smaller value of the AIC indicates a better model (i.e., one that matches the data well, without being unnecessarily complex).

The Bayesian information criterion is very similar. It replaces the 2 in the first term with ln(n), where n is the number of data points.  Because ln(n) exceeds 2 once n ≥ 8, this puts more weight on the first term (which penalizes complexity) when the size of the sample is large.  As with the AIC, smaller is better.

### BIC = k ln( n ) - 2 ln( L )

These two metrics are the most commonly used, but many others exist, with subtle differences in their properties.  One should take care to apply the appropriate criteria based on the data, the models, and the problem.

In [None]:
# Let us define two functions that compute the AIC and the BIC of a model; we will apply them to each of our three models

def aic(numparams, lnmaxlikelihood):
    """Return the Akaike information criterion, AIC = 2 k - 2 ln(L).

    numparams is k, the number of model parameters; lnmaxlikelihood is
    ln(L), the natural log of the maximum likelihood. Smaller is better.
    """
    complexity_penalty = 2 * numparams
    goodness_of_fit = 2 * lnmaxlikelihood
    return complexity_penalty - goodness_of_fit


def bic(numparams, numdatapoints, lnmaxlikelihood):
    """Return the Bayesian information criterion, BIC = k ln(n) - 2 ln(L).

    numparams is k, the number of model parameters; numdatapoints is n,
    the number of data points; lnmaxlikelihood is ln(L), the natural log
    of the maximum likelihood. Smaller is better.
    """
    complexity_penalty = numparams * np.log(numdatapoints)
    goodness_of_fit = 2 * lnmaxlikelihood
    return complexity_penalty - goodness_of_fit

In [None]:
# Compute the maximum log-likelihood value of each model and report it.

# scipy.optimize.minimize() found the minimum of the *negative*
# log-likelihood for each model, and stores that minimum under the
# 'fun' entry of each result.

# The negative of that minimum is therefore our maximum log-likelihood.

maxlikelihoodvalueA = -maxlike_resultA['fun']
maxlikelihoodvalueB = -maxlike_resultB['fun']
maxlikelihoodvalueC = -maxlike_resultC['fun']

print(maxlikelihoodvalueA, maxlikelihoodvalueB, maxlikelihoodvalueC)

### Compute the AIC and BIC for each model

In [None]:
# Each pair is (number of free parameters, maximum log-likelihood),
# in the order A (quadratic), B (cubic), C (exponential).
aic_list = [
    aic(numparams, lnmaxlike)
    for numparams, lnmaxlike in (
        (3, maxlikelihoodvalueA),
        (4, maxlikelihoodvalueB),
        (3, maxlikelihoodvalueC),
    )
]

# The BIC also needs the number of data points, i.e. rows in the table.
n = len(example_data_1D)
bic_list = [
    bic(numparams, n, lnmaxlike)
    for numparams, lnmaxlike in (
        (3, maxlikelihoodvalueA),
        (4, maxlikelihoodvalueB),
        (3, maxlikelihoodvalueC),
    )
]

print(aic_list)
print(bic_list)

In [None]:
# Collect the results in a pandas DataFrame, one row per model.
# The Delta columns give each criterion relative to the best (smallest) value.

modelnames = ['A(quadratic)', 'B(cubic)', 'C(exp)']
df = pandas.DataFrame(dict(
    name=modelnames,
    AIC=aic_list,
    BIC=bic_list,
    DeltaAIC=np.subtract(aic_list, np.min(aic_list)),
    DeltaBIC=np.subtract(bic_list, np.min(bic_list)),
))

# Display the table
df

In [None]:
# Interpreting the AIC / BIC as statistical weights:
# the relative likelihood of each model is exp(-Delta / 2) ...
wAIC = np.exp(-0.5 * df['DeltaAIC'])
wBIC = np.exp(-0.5 * df['DeltaBIC'])

# ... and the weights are those values normalized to sum to one.
df['wgtAIC'] = wAIC / wAIC.sum()
df['wgtBIC'] = wBIC / wBIC.sum()

# Interpreting the AIC / BIC as an odds ratio:
# the weight of the best model divided by the weight of each model.
df['oddsAIC'] = df['wgtAIC'].max() / df['wgtAIC']
df['oddsBIC'] = df['wgtBIC'].max() / df['wgtBIC']


In [None]:
# Tell pandas to display floats with two decimal places, then show the table.
pandas.set_option('display.float_format', '{:.2f}'.format)
df

### next topic :  using the Bayesian evidence (Bayes factors) to compare models considering the entire parameter space

#### Reading list

A good broad book on Bayesian data analysis
* Sivia, D. and Skilling, J. "Data Analysis: A Bayesian Tutorial"
https://books.google.com/books/about/Data_Analysis.html?id=lYMSDAAAQBAJ

Some summary papers: 

* Wagenmakers and Farrell 2004
https://link.springer.com/content/pdf/10.3758/BF03206482.pdf

* Symonds and Moussalli 2010
http://byrneslab.net/classes/biol607/readings/Symonds_and_Moussalli_2010_behav_ecol.pdf