In [None]:
import random
import numpy as np
from scipy.stats import binom
import matplotlib.pyplot as plt
from scipy.special import erfinv
from scipy.integrate import quad
from scipy.stats import poisson, chi2
from iminuit import Minuit
%matplotlib inline

## Utility functions

In [None]:
def find_max(function, bound_low, bound_high, grid=100000,):
    """
    Return the maximum value of a function on an evenly spaced grid.

    The function is sampled at ``grid`` points between ``bound_low`` and
    ``bound_high`` (both end points included) and the largest sampled value is
    returned, so the result is only as accurate as the grid is fine.

    Parameters:
        function:   callable accepting a numpy array of x values
        bound_low:  lower edge of the scanned interval
        bound_high: upper edge of the scanned interval
        grid:       number of sample points, must be at least 1

    Raises:
        ValueError: if grid is smaller than 1
    """
    # A grid of zero points would leave nothing to take the maximum of
    if (not grid >= 1):
        raise ValueError("Grid must be a positive integer")
    # Generate grid of x values
    x = np.linspace(bound_low, bound_high, num=grid, endpoint=True,)
    # asarray keeps .max() available when the function returns a plain scalar
    # (e.g. a constant function that ignores its argument)
    y = np.asarray(function(x))
    return y.max()

## PDF Classes

In [None]:
class ProbabilityDensityFunction(object):
    """
    Base class holding the members and methods shared by all pdf classes.

    It stores the interval the pdf is defined on and provides numerical
    integration. ``integrate`` calls ``self._evaluate``, which this class does
    not define, so subclasses have to supply it.
    """

    @staticmethod
    def _checkOrderedPair(pair, form):
        """
        Validate a (low, high) tuple and hand back its two elements.
        ``form`` is the textual layout quoted in the error messages.
        """
        if not isinstance(pair, tuple):
            raise TypeError(f"Variable bound must be a tuple with the form {form}")
        if len(pair) != 2:
            raise ValueError(f"Variable bound must have form {form}")
        low, high = pair
        if not (low < high):
            raise ValueError("First element in tuple must be smaller than second")
        return low, high

    def __init__(self, bounds):
        # Store the validated edges of the interval the pdf lives on
        self.boundMin, self.boundMax = self._checkOrderedPair(bounds, "(boundMin, boundMax)")

    def integrate(self, limits):
        """
        Numerically integrate the pdf between the two given limits.
        The limits must be an ordered tuple lying inside the bounds of the pdf.
        ##### NOTE: The result is the raw integral, it is not normalised to the bounds of the class #####
        """
        lowerLimit, upperLimit = self._checkOrderedPair(limits, "(limitMin, limitMax)")
        if not (lowerLimit >= self.boundMin):
            raise ValueError("Lower integral limit must be larger than lower bound of pdf")
        if not (upperLimit <= self.boundMax):
            raise ValueError("Higher integral limit must be smaller than upper bound of pdf")

        # quad returns (value, error estimate); only the value is passed on
        return quad(self._evaluate, lowerLimit, upperLimit)[0]

class Gaussian(ProbabilityDensityFunction):
    """
    Gaussian function with a given mean and sigma, defined between the bounds.

    Members:
        mean:       centre of the gaussian
        sigma:      width of the gaussian
        maxValue:   largest value of the function found by find_max on a grid
                    between the bounds, kept in step with the parameters
    """

    def __init__(self, mean, sigma, bounds):

        # Initialise parent class (validates and stores the bounds)
        super().__init__(bounds)
        
        # Initialise class variables
        self.mean = mean
        self.sigma = sigma
        # Find maximum value of the distribution within the bounds
        self.maxValue = find_max(self._evaluate, self.boundMin, self.boundMax)

    def _evaluate(self, x,):
        """
        Evaluate the gaussian function of the distribution at x
        ##### NOTE: The expression carries the usual 1/(sigma*sqrt(2*pi)) factor but is
        ##### not re-normalised to the bounds of the class
        """

        return 1/(self.sigma * np.sqrt(2.0*np.pi)) * np.exp( -(x-self.mean)**2 / (2.0 * self.sigma**2) )

    def setParameters(self, mean=None, sigma=None):
        """
        Set passed variables as parameters for pdf.
        Parameters left as None keep their current value.
        """

        # Identity comparison: None is a singleton and `== None` misbehaves for array-like values
        if mean is not None:
            self.mean = mean
        if sigma is not None:
            self.sigma = sigma

        # The stored maximum depends on the parameters, so refresh it whenever one changed
        if mean is not None or sigma is not None:
            self.maxValue = find_max(self._evaluate, self.boundMin, self.boundMax)

class Linear(ProbabilityDensityFunction):
    """
    Straight line function, intercept + slope * x, defined between the bounds.

    Members:
        slope:      gradient of the line
        intercept:  value of the line at x = 0
        maxValue:   largest value of the function found by find_max on a grid
                    between the bounds, kept in step with the parameters
    """

    def __init__(self, slope, intercept, bounds):

        # Initialise parent class (validates and stores the bounds)
        super().__init__(bounds)

        # Initialise class variables
        self.intercept = intercept
        self.slope = slope
        # Find maximum value of the distribution within the bounds
        self.maxValue = find_max(self._evaluate, self.boundMin, self.boundMax)

    def _evaluate(self, x,):
        """
        Evaluate the linear function of the distribution at x
        NOTE: Returns un-normalised values
        """
        
        return self.intercept + self.slope * x

    def setParameters(self, slope=None, intercept=None):
        """
        Set passed variables as parameters for pdf.
        Parameters left as None keep their current value.
        """

        # Identity comparison: None is a singleton and `== None` misbehaves for array-like values
        if slope is not None:
            self.slope = slope
        if intercept is not None:
            self.intercept = intercept

        # The stored maximum depends on the parameters, so refresh it whenever one changed
        if slope is not None or intercept is not None:
            self.maxValue = find_max(self._evaluate, self.boundMin, self.boundMax)

class SecondOrderPolynomial(ProbabilityDensityFunction):
    """
    Second order polynomial, alpha*x**2 + beta*x + gamma, defined between the bounds.

    Members:
        alpha:      coefficient of the x**2 term
        beta:       coefficient of the x term
        gamma:      constant term
        maxValue:   largest value of the function found by find_max on a grid
                    between the bounds, kept in step with the parameters
    """

    def __init__(self, alpha, beta, gamma, bounds):

        # Initialise parent class (validates and stores the bounds)
        super().__init__(bounds)

        # Initialise class variables
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        # Find maximum value of the distribution within the bounds
        self.maxValue = find_max(self._evaluate, self.boundMin, self.boundMax)

    def _evaluate(self, x,):
        """
        Evaluate the second order polynomial of the distribution at x
        NOTE: Returns un-normalised values
        """
        
        return self.alpha*x**2 + self.beta*x + self.gamma

    def setParameters(self, alpha=None, beta=None, gamma=None):
        """
        Set passed variables as parameters for pdf.
        Parameters left as None keep their current value.
        """

        # Identity comparison: None is a singleton and `== None` misbehaves for array-like values
        if alpha is not None:
            self.alpha = alpha
        if beta is not None:
            self.beta = beta
        if gamma is not None:
            self.gamma = gamma

        # The stored maximum depends on the parameters, so refresh it whenever one changed
        if alpha is not None or beta is not None or gamma is not None:
            self.maxValue = find_max(self._evaluate, self.boundMin, self.boundMax)

class Exponential(ProbabilityDensityFunction):
    """
    Decaying exponential, exp(-x / decayConstant), defined between the bounds.

    Members:
        decayConstant:  scale of the decay, in the same units as x
        maxValue:       largest value of the function found by find_max on a grid
                        between the bounds, kept in step with the parameters
    """

    def __init__(self, decayConstant, bounds):

        # Initialise parent class (validates and stores the bounds)
        super().__init__(bounds)

        # Initialise class variables
        self.decayConstant = decayConstant
        # Find maximum value of the distribution within the bounds
        self.maxValue = find_max(self._evaluate, self.boundMin, self.boundMax)

    def _evaluate(self, x,):
        """
        Evaluate the exponential function of the distribution at x
        NOTE: Returns un-normalised values
        """
        
        return np.exp(-x/self.decayConstant)

    def setParameters(self, decayConstant=None):
        """
        Set passed variables as parameters for pdf.
        Parameters left as None keep their current value.
        """

        # Identity comparison: None is a singleton and `== None` misbehaves for array-like values
        if decayConstant is not None:
            self.decayConstant = decayConstant
            # The stored maximum depends on the parameter, so refresh it
            self.maxValue = find_max(self._evaluate, self.boundMin, self.boundMax)

class GaussianWithLinear(ProbabilityDensityFunction):
    """
    Class that will generate a variable function consisting of a gaussian signal with a linear background,
    combined as (1 - backgroundFraction) * signal + backgroundFraction * background.

    NOTE(review): the two components are added using their raw _evaluate values, which are
    not individually normalised over the bounds, so backgroundFraction acts as a mixing
    weight rather than the fraction of background events -- confirm this is intended.
    """

    def __init__(self, backgroundFraction, mean, sigma, slope, intercept, bounds,):
        
        # Initialise parent class (validates and stores the bounds)
        super().__init__(bounds)

        self.backgroundFraction = backgroundFraction
    
        # Initialise pdf objects, both defined on the same bounds as this class
        self.signal = Gaussian(mean, sigma, bounds)
        self.background = Linear(slope, intercept, bounds)

    def _evaluate(self, x,):
        """
        Evaluate the weighted sum of signal and background at x
        NOTE: Returns un-normalised values between the bounds
        """

        return (1-self.backgroundFraction)*self.signal._evaluate(x,) + \
                self.backgroundFraction*self.background._evaluate(x,) 

    def setParameters(self, backgroundFraction=None, mean=None, sigma=None, slope=None, intercept=None):
        """
        Set passed variables as parameters for pdf.
        Parameters left as None keep their current value.
        """

        # Update only the parameters that were passed. Identity comparison is used because
        # None is a singleton and `== None` misbehaves for array-like values.
        if backgroundFraction is not None:
            self.backgroundFraction = backgroundFraction
        # Component parameters are written straight onto the signal/background objects
        if mean is not None:
            self.signal.mean = mean
        if sigma is not None:
            self.signal.sigma = sigma
        if slope is not None:
            self.background.slope = slope
        if intercept is not None:
            self.background.intercept = intercept

class GaussianWithPolinomial(ProbabilityDensityFunction):
    """
    Class that will generate a variable function consisting of a gaussian signal with a second order
    polynomial background, combined as (1 - backgroundFraction) * signal + backgroundFraction * background.

    NOTE(review): the two components are added using their raw _evaluate values, which are
    not individually normalised over the bounds, so backgroundFraction acts as a mixing
    weight rather than the fraction of background events -- confirm this is intended.
    """

    def __init__(self, backgroundFraction, mean, sigma, alpha, beta, gamma, bounds):
        
        # Initialise parent class (validates and stores the bounds)
        super().__init__(bounds)

        self.backgroundFraction = backgroundFraction
    
        # Initialise pdf objects, both defined on the same bounds as this class
        self.signal = Gaussian(mean, sigma, bounds)
        self.background = SecondOrderPolynomial(alpha, beta, gamma, bounds)

    def _evaluate(self, x,):
        """
        Evaluate the weighted sum of signal and background at x
        NOTE: Returns un-normalised values between the bounds
        """

        return (1-self.backgroundFraction)*self.signal._evaluate(x,) + \
                self.backgroundFraction*self.background._evaluate(x,) 

    def setParameters(self, backgroundFraction=None, mean=None, sigma=None, alpha=None, beta=None, gamma=None):
        """
        Set passed variables as parameters for pdf.
        Parameters left as None keep their current value.
        """

        # Update only the parameters that were passed. Identity comparison is used because
        # None is a singleton and `== None` misbehaves for array-like values.
        if backgroundFraction is not None:
            self.backgroundFraction = backgroundFraction
        # Component parameters are written straight onto the signal/background objects
        if mean is not None:
            self.signal.mean = mean
        if sigma is not None:
            self.signal.sigma = sigma
        if alpha is not None:
            self.background.alpha = alpha
        if beta is not None:
            self.background.beta = beta
        if gamma is not None:
            self.background.gamma = gamma

class GaussianWithExponential(ProbabilityDensityFunction):
    """
    Class that will generate a variable function consisting of a gaussian signal with an exponential
    background, combined as (1 - backgroundFraction) * signal + backgroundFraction * background.

    NOTE(review): the two components are added using their raw _evaluate values, which are
    not individually normalised over the bounds, so backgroundFraction acts as a mixing
    weight rather than the fraction of background events -- confirm this is intended.
    """

    def __init__(self, backgroundFraction, mean, sigma, decayConstant, bounds):
        
        # Initialise parent class (validates and stores the bounds)
        super().__init__(bounds)

        self.backgroundFraction = backgroundFraction
    
        # Initialise pdf objects, both defined on the same bounds as this class
        self.signal = Gaussian(mean, sigma, bounds)
        self.background = Exponential(decayConstant, bounds)

    def _evaluate(self, x,):
        """
        Evaluate the weighted sum of signal and background at x
        NOTE: Returns un-normalised values between the bounds
        """

        return (1-self.backgroundFraction)*self.signal._evaluate(x,) + \
                self.backgroundFraction*self.background._evaluate(x,) 

    def setParameters(self,backgroundFraction=None, mean=None, sigma=None, decayConstant=None):
        """
        Set passed variables as parameters for pdf.
        Parameters left as None keep their current value.
        """

        # Update only the parameters that were passed. Identity comparison is used because
        # None is a singleton and `== None` misbehaves for array-like values.
        if backgroundFraction is not None:
            self.backgroundFraction = backgroundFraction
        # Component parameters are written straight onto the signal/background objects
        if mean is not None:
            self.signal.mean = mean
        if sigma is not None:
            self.signal.sigma = sigma
        if decayConstant is not None:
            self.background.decayConstant = decayConstant

## Minimisation classes

In [None]:
class NegativeLogLikelihood(object):
    """
    Class containing the minimisation statistic to be used for pdf fitting. The class takes in events as the input.

    The pdf object must provide boundMin, boundMax, setParameters, integrate and _evaluate.
    Each evaluate* method has an explicit parameter list matching one composite pdf; the
    body of the calculation is shared through _negativeLogLikelihood.
    """

    # Value substituted for non-positive likelihoods so that the logarithm stays finite
    _LIKELIHOOD_FLOOR = 1e-10

    def __init__(self, pdf, data):

        self.pdf = pdf
        self.data = data

    def setData(self, data):
        """
        Assign data class member to new dataset for the reuse of this class
        """
        
        self.data = data

    def findNormalisationFactor(self,):
        """
        Find integral of pdf over its full bounds
        """
        
        # Define integration limits
        normalisationLimits = (self.pdf.boundMin, self.pdf.boundMax)

        return self.pdf.integrate(normalisationLimits)

    def _negativeLogLikelihood(self,):
        """
        Compute the negative log likelihood of the data for the parameters currently set on the pdf
        """

        # Normalise the pdf over its bounds so the likelihood is a proper density
        normalisation = self.findNormalisationFactor()
        likelihood = self.pdf._evaluate(self.data,) / normalisation
        # Set any non-positive likelihoods to negligible positive values.
        # np.where also copes with a single (scalar) event, unlike in-place masking.
        likelihood = np.where(likelihood <= 0, self._LIKELIHOOD_FLOOR, likelihood)
        return -np.log(likelihood).sum()

    def evaluateSignalWithExponential(self, backgroundFraction, mean, sigma, decayConstant):
        """
        Evaluate negative log likelihood statistic for passed parameters
        (gaussian signal with exponential background)
        """

        # set new parameters
        self.pdf.setParameters(backgroundFraction=backgroundFraction, mean=mean, sigma=sigma, decayConstant=decayConstant,)

        return self._negativeLogLikelihood()

    def evaluateSignalWithPolynomial(self, backgroundFraction, mean, sigma, alpha, beta, gamma,):
        """
        Evaluate negative log likelihood statistic for passed parameters
        (gaussian signal with second order polynomial background)
        """

        # set new parameters
        self.pdf.setParameters(
            backgroundFraction=backgroundFraction, mean=mean, sigma=sigma, alpha=alpha, beta=beta, gamma=gamma,
        )

        return self._negativeLogLikelihood()

    def evaluateSignalWithLinear(self, backgroundFraction, mean, sigma, slope, intercept):
        """
        Evaluate negative log likelihood statistic for passed parameters
        (gaussian signal with linear background)
        """

        # set new parameters
        self.pdf.setParameters(
            backgroundFraction=backgroundFraction, mean=mean, sigma=sigma, slope=slope, intercept=intercept,
        )

        return self._negativeLogLikelihood()

## Question 1

In [None]:
# Read the higgs dataset from its text file and report how many entries it holds
filename = "datafile-higgs.txt"
events = np.loadtxt(fname=filename)
n_events = np.size(events)
print(f"The number of events in dataset is: {n_events}")

In [None]:
# Constants shared by every question in the analysis
N_BINS = 70                     # number of histogram bins, also the number of x sample points
X_BOUNDS = (0.0, 10.0)          # (lower, upper) edge of the mass range
# Evenly spaced x values spanning the full range, both edges included
X_VALUES = np.linspace(X_BOUNDS[0], X_BOUNDS[1], num=N_BINS, endpoint=True)

We can plot the dataset to visualise the distribution

In [None]:
# Draw the raw event counts as a step histogram on an explicit figure/axes pair
fig, ax = plt.subplots()
ax.hist(events, bins=N_BINS, histtype="step", color="blue",)
ax.set_xlabel("Mass, m (a.u)")
ax.set_ylabel("Number of events (counts)")
ax.set_title(r"Histogram of measured events in range $\in$ [0, 10]")

We will now attempt to fit the data with a gaussian signal + exponential background.

We expect the parameter values to be around:
* Background fraction = 0.98
* Mean = 2.5
* Sigma = 0.2
* Decay constant = 5 

In [None]:
# Define initial parameters of fit in a dictionary
initial_parameter_dict = {
    "backgroundFraction":       0.9,
    "mean":                     2.0,
    "sigma":                    0.5,
    "decayConstant":            4.5,   
}

# Physical limits, keyed by parameter name: the fraction must stay within [0, 1] and the
# pdf is singular at sigma = 0 and decayConstant = 0 (both appear as divisors), so the
# minimiser is kept on the positive side. None leaves that side unbounded.
parameter_limits = {
    "backgroundFraction":       (0.0, 1.0),
    "sigma":                    (0.0, None),
    "decayConstant":            (0.0, None),
}

# Initialise objects for pdf (object with function we want to fit) and the fitting statistic (nll)
# The parameter values given here are placeholders: they are overwritten by the minimiser on its
# first call through setParameters
fit_function = GaussianWithExponential(backgroundFraction=1.0, mean=1.0, sigma=1.0, decayConstant=1.0, bounds=X_BOUNDS,)
fit_statistic = NegativeLogLikelihood(fit_function, events)

# Initialise iminuit minimiser object
minimiser = Minuit(
    fit_statistic.evaluateSignalWithExponential,
    **initial_parameter_dict,
)
# A negative log likelihood changes by 0.5 for a one sigma shift; Minuit.LIKELIHOOD is that value
minimiser.errordef = Minuit.LIKELIHOOD

# Apply the limits by name so they cannot get out of step with the parameter order
for name, limits in parameter_limits.items():
    minimiser.limits[name] = limits

# Minimise the fit_statistic
result = minimiser.migrad()
# Output minimisation results
result

In [None]:
# Define list containing all parameter names
parameter_names = list(initial_parameter_dict)
# Define list containing all optimised parameter values
optimised_parameter_vals = [result.values[key] for key in parameter_names]
# Define list containing all optimised parameter errors
optimised_parameter_errors = [result.errors[key] for key in parameter_names]
# Define list containing units of each parameter
# (decayConstant divides the mass in exp(-x/decayConstant), so it carries the units of mass)
parameter_units = ["", "a.u", "a.u", "a.u"]

# Print out results. Significant-figure formatting is used because the errors are much
# smaller than the values and would all be rounded to 0.00 with two fixed decimals.
for name, value, error, unit in zip(
    parameter_names, optimised_parameter_vals, optimised_parameter_errors, parameter_units,
):
    print(f"The optimised value +/- statistical error of {name} is:  {value:.4g} +/- {error:.2g} {unit}\n")

Plot the distribution along with the fit

In [None]:
# Compute the predicted y values of the fit (values are normalised over X_BOUNDS)
fit_line = fit_function._evaluate(X_VALUES) / fit_function.integrate(X_BOUNDS)

# Plot histogram with fit. The histogram is drawn as a density over the same range the
# fit is normalised on, so the two are directly comparable.
plt.hist(events, bins=N_BINS, range=X_BOUNDS, histtype="step", color="blue", label="Data", density=True) 
plt.plot(X_VALUES, fit_line, ls="--", c="purple", label="Fit")
plt.xlabel("Mass, m (a.u)")
# density=True rescales the bin contents to unit area, so the axis no longer shows counts
plt.ylabel("Probability density")
plt.title(r"Histogram of measured events in range $\in$ [0, 10]")
plt.legend()
plt.show()

## Question 2

We will now attempt to fit the data with a gaussian signal + second-order polynomial background.

We expect the parameter values to be around:
* Background fraction = 0.98
* Mean = 2.5
* alpha/gamma = -0.16
* beta/gamma = 0.007
* We fix gamma = 1  

In [None]:
# Define initial parameters of fit in a dictionary.
# SecondOrderPolynomial evaluates alpha*x**2 + beta*x + gamma, so a falling, slightly
# convex background that stays positive up to x = 10 needs a small positive alpha
# (curvature) and a negative beta (linear term). With gamma = 1 the expected values
# quoted for this fit therefore correspond to beta ~ -0.16 and alpha ~ 0.007; with the
# two swapped the background would be negative over most of the range.
# mean and sigma start strictly inside their limits, since a start value on or outside
# a limit leaves the minimiser beginning at a boundary.
initial_parameter_dict = {
    "backgroundFraction":       0.9,
    "mean":                     2.2,
    "sigma":                    0.3,
    "alpha":                    0.005,
    "beta":                     -0.1,
    "gamma":                    1.0,
}

# Reasonable limits for each parameter, keyed by name so they cannot get out of step
# with the parameter order
parameter_limits = {
    "backgroundFraction":       (0.7, 1.0),
    "mean":                     (2.0, 3.0),
    "sigma":                    (0.0, 0.4),
    "alpha":                    (0.0, 0.01),
    "beta":                     (-0.2, 0.0),
    "gamma":                    (0.0, 1.0),
}

# Initialise objects for pdf (object with function we want to fit) and the fitting statistic (nll)
# The parameter values given here are placeholders: they are overwritten by the minimiser on its
# first call through setParameters
fit_function = GaussianWithPolinomial(
    backgroundFraction=1.0, mean=1.0, sigma=1.0, alpha=1.0, beta=1.0, gamma=1.0, bounds=X_BOUNDS,
)
fit_statistic = NegativeLogLikelihood(fit_function, events)

# Initialise iminuit minimiser object
minimiser = Minuit(
    fit_statistic.evaluateSignalWithPolynomial,
    **initial_parameter_dict,
)
# A negative log likelihood changes by 0.5 for a one sigma shift; Minuit.LIKELIHOOD is that value
minimiser.errordef = Minuit.LIKELIHOOD

# Fix the value of gamma to 1.0: only the shape of the background matters because the
# pdf is re-normalised inside the fit statistic
minimiser.fixed["gamma"] = True

# Apply reasonable limits to parameters
for name, limits in parameter_limits.items():
    minimiser.limits[name] = limits

# Minimise the fit_statistic
result = minimiser.migrad()

# Output minimisation results
result

In [None]:
# Define list containing all parameter names
parameter_names = list(initial_parameter_dict)
# Define list containing all optimised parameter values
optimised_parameter_vals = [result.values[key] for key in parameter_names]
# Define list containing all optimised parameter errors
optimised_parameter_errors = [result.errors[key] for key in parameter_names]
# Define list containing units of each parameter
# (alpha multiplies x**2 and beta multiplies x, so they carry inverse powers of the mass unit)
parameter_units = ["", "a.u", "a.u", "a.u^-2", "a.u^-1", "",]

# Print out results. Significant-figure formatting is used because the polynomial
# coefficients and the errors are too small to survive rounding to two fixed decimals.
for name, value, error, unit in zip(
    parameter_names, optimised_parameter_vals, optimised_parameter_errors, parameter_units,
):
    if result.fixed[name]:
        # A fixed parameter is not fitted, so it has no statistical error to report
        print(f"The value of {name} was held fixed at:  {value:.4g} {unit}\n")
    else:
        print(f"The optimised value +/- statistical error of {name} is:  {value:.4g} +/- {error:.2g} {unit}\n")

Plot the distribution along with the fit

In [None]:
# Compute the predicted y values of the fit (values are normalised over X_BOUNDS)
fit_line = fit_function._evaluate(X_VALUES) / fit_function.integrate(X_BOUNDS)

# Plot histogram with fit. The histogram is drawn as a density over the same range the
# fit is normalised on, so the two are directly comparable.
plt.hist(events, bins=N_BINS, range=X_BOUNDS, histtype="step", color="blue", label="Data", density=True) 
plt.plot(X_VALUES, fit_line, ls="--", c="purple", label="Fit")
plt.xlabel("Mass, m (a.u)")
# density=True rescales the bin contents to unit area, so the axis no longer shows counts
plt.ylabel("Probability density")
plt.title(r"Histogram of measured events in range $\in$ [0, 10]")
plt.legend()
plt.show()

## Question 3

We will now attempt to fit the data with a gaussian signal + linear background.

We expect the parameter values to be around:
* Background fraction = 0.98
* Mean = 2.5
* Slope/intercept = -0.09
* We fix intercept = 1.0 

In [None]:
# Define initial parameters of fit in a dictionary.
# mean and sigma start strictly inside their limits, since a start value on or outside
# a limit leaves the minimiser beginning at a boundary.
initial_parameter_dict = {
    "backgroundFraction":       0.9,
    "mean":                     2.2,
    "sigma":                    0.3,
    "slope":                    -0.05,
    "intercept":                1.0,
}

# Reasonable limits for each parameter, keyed by name so they cannot get out of step
# with the parameter order
parameter_limits = {
    "backgroundFraction":       (0.7, 1.0),
    "mean":                     (2.0, 3.0),
    "sigma":                    (0.0, 0.4),
    "slope":                    (-0.2, 0.0),
    "intercept":                (0.0, 1.0),
}

# Initialise objects for pdf (object with function we want to fit) and the fitting statistic (nll)
# The parameter values given here are placeholders: they are overwritten by the minimiser on its
# first call through setParameters
fit_function = GaussianWithLinear(
    backgroundFraction=1.0, mean=1.0, sigma=1.0, slope=1.0, intercept=1.0, bounds=X_BOUNDS,
)
fit_statistic = NegativeLogLikelihood(fit_function, events)

# Initialise iminuit minimiser object
minimiser = Minuit(
    fit_statistic.evaluateSignalWithLinear,
    **initial_parameter_dict,
)
# A negative log likelihood changes by 0.5 for a one sigma shift; Minuit.LIKELIHOOD is that value
minimiser.errordef = Minuit.LIKELIHOOD

# Fix the value of intercept to 1.0: only the shape of the background matters because the
# pdf is re-normalised inside the fit statistic
minimiser.fixed["intercept"] = True

# Apply reasonable limits to parameters
for name, limits in parameter_limits.items():
    minimiser.limits[name] = limits

# Minimise the fit_statistic
result = minimiser.migrad()

# Output minimisation results
result

In [None]:
# Define list containing all parameter names
parameter_names = list(initial_parameter_dict)
# Define list containing all optimised parameter values
optimised_parameter_vals = [result.values[key] for key in parameter_names]
# Define list containing all optimised parameter errors
optimised_parameter_errors = [result.errors[key] for key in parameter_names]
# Define list containing units of each parameter
# (slope multiplies x, so it carries the inverse of the mass unit)
parameter_units = ["", "a.u", "a.u", "a.u^-1", "",]

# Print out results. Significant-figure formatting is used because the errors are much
# smaller than the values and would be rounded away with two fixed decimals.
for name, value, error, unit in zip(
    parameter_names, optimised_parameter_vals, optimised_parameter_errors, parameter_units,
):
    if result.fixed[name]:
        # A fixed parameter is not fitted, so it has no statistical error to report
        print(f"The value of {name} was held fixed at:  {value:.4g} {unit}\n")
    else:
        print(f"The optimised value +/- statistical error of {name} is:  {value:.4g} +/- {error:.2g} {unit}\n")


Plot the distribution along with the fit

In [None]:
# Compute the predicted y values of the fit (values are normalised over X_BOUNDS)
fit_line = fit_function._evaluate(X_VALUES) / fit_function.integrate(X_BOUNDS)

# Plot histogram with fit. The histogram is drawn as a density over the same range the
# fit is normalised on, so the two are directly comparable.
plt.hist(events, bins=N_BINS, range=X_BOUNDS, histtype="step", color="blue", label="Data", density=True) 
plt.plot(X_VALUES, fit_line, ls="--", c="purple", label="Fit")
plt.xlabel("Mass, m (a.u)")
# density=True rescales the bin contents to unit area, so the axis no longer shows counts
plt.ylabel("Probability density")
plt.title(r"Histogram of measured events in range $\in$ [0, 10]")
plt.legend()
plt.show()