# Understanding Experimental Data
* Modelling
    * Least squares objective function: $\sum_{i=0}^{len(observed)-1}(observed[i]-predicted[i])^2$
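        * closely related to the variance: it is the variance multiplied by the number of observations (i.e. the variance without the final division by len(observed))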
    * Linear regression: find the polynomial curve that minimises the least squares objective function - polyfit
        * Assume we want to find a polynomial
        * model = pylab.polyfit(xVals, yVals, degree)
* Prediction: polyval
    * estyVals = pylab.polyval(model, xVals)
* Which fit provides more accurate estimate?
    * compare two different models for the same data: mean squared error $\frac{\sum_{i=0}^{len(observed)-1}(observed[i]-predicted[i])^2}{len(observed)}$
    * absolute goodness of fit: coefficient of determination $R^2 = 1 - \frac{\sum(y_i-p_i)^2}{\sum(y_i-\mu)^2}$, where $y_i$ are measured values, $p_i$ are predicted values, $\mu$ mean of measured values
        * captures the proportion of the variability in the data that is accounted for by the model
        * $R^2=1$: the variability is all accounted for by the model; $R^2=0$: the model does not capture any of the variability
    * problem of overfitting: not only fit the underlying process, but also the noise
        * cross validation: generate a model using one dataset and test it on another dataset
            * small data-set: leave-one-out cross validation
            * larger data-set
                * k-fold cross validation: partition into k equal size sets, model trained on k-1 sets, test on the remaining set
                * repeated random sampling: randomly select n elements to train model, test on the remaining elements
        * visualise as search process
        

In [None]:
import random, pylab, numpy
def plotData(fileName):
    """Plot the data loaded from fileName as blue circles.

    The x and y values come from getData(fileName); the x values are
    multiplied by 9.81 before being plotted against the y values.
    Nothing is returned; the points are drawn on the current pylab figure.
    """
    rawX, rawY = getData(fileName)
    xArray = pylab.array(rawX)
    yArray = pylab.array(rawY)
    scaledX = xArray*9.81  #acc. due to gravity
    pylab.plot(scaledX, yArray, 'bo',
               label = 'Measured displacements')

In [None]:
def aveMeanSquareError(data, predicted):
    """Return the mean of the squared differences data[i] - predicted[i].

    Elements are paired by index over the length of data, so predicted
    must be at least as long as data. An empty data sequence raises
    ZeroDivisionError.
    """
    count = len(data)
    # Indexing (rather than zip) keeps the IndexError when predicted is
    # shorter than data; the 0.0 start keeps the result a float.
    squaredTotal = sum(((data[k] - predicted[k])**2 for k in range(count)),
                       0.0)
    return squaredTotal/count

def rSquared(observed, predicted):
    """Return the coefficient of determination of predicted w.r.t. observed.

    Computed as 1 - (mean squared error / variance of observed). Both
    arguments must support element-wise arithmetic and .sum() (e.g. numpy
    arrays).
    """
    residuals = predicted - observed
    meanSquaredError = (residuals**2).sum()/len(observed)
    return 1 - meanSquaredError/numpy.var(observed)

In [None]:
def genFits(xVals, yVals, degrees):
    """Fit one polynomial per entry of degrees to (xVals, yVals).

    Returns a list with the pylab.polyfit result for each degree, in the
    same order as degrees.
    """
    return [pylab.polyfit(xVals, yVals, degree) for degree in degrees]

def testFits(models, degrees, xVals, yVals, title):
    """Plot the data together with the curve predicted by each model.

    models[i] is evaluated on xVals with pylab.polyval and labelled with
    degrees[i] and its R-squared value (rounded to 5 places) against
    yVals. A legend and the given title are added to the current figure.
    Nothing is returned.
    """
    pylab.plot(xVals, yVals, 'o', label = 'Data')
    for idx, coefficients in enumerate(models):
        fittedY = pylab.polyval(coefficients, xVals)
        r2 = rSquared(yVals, fittedY)
        legendText = ('Fit of degree ' + str(degrees[idx])
                      + ', R2 = ' + str(round(r2, 5)))
        pylab.plot(xVals, fittedY, label = legendText)
    pylab.legend(loc = 'best')
    pylab.title(title)

In [None]:
def LeaveOneOutCrossValidation(dataset):
    """Return the average test result of leave-one-out cross validation.

    For every index i, a model is built from all elements of dataset
    except dataset[i] and then tested on dataset[i] alone.

    dataset is expected to be a list (slices are joined with +).
    buildModel(training) and test(model, element) are helpers that must be
    defined elsewhere; test is assumed to return a number.

    Raises ZeroDivisionError if dataset is empty.
    """
    testResults = []
    for i in range(len(dataset)):
        # Everything except element i. The previous dataset[:].pop(i)
        # yielded the removed element itself, not the remaining data.
        training = dataset[:i] + dataset[i+1:]
        model = buildModel(training)
        testResults.append(test(model, dataset[i]))
    return sum(testResults)/len(testResults)

def RepeatedRandomSampling(dataset, num_trials, num_train):
    """Return the average test result over repeated random train/test splits.

    In each of num_trials trials, num_train elements of dataset are chosen
    at random (without replacement) to build a model, which is then tested
    on the remaining elements.

    buildModel(training) and test(model, testSet) are helpers that must be
    defined elsewhere; test is assumed to return a number.

    Raises ValueError (from random.sample) if num_train exceeds
    len(dataset) or is negative, and ZeroDivisionError if num_trials is 0.
    """
    testResults = []
    for _ in range(num_trials):
        # Sample positions rather than values so that duplicate elements
        # in dataset are still split correctly.
        trainIndices = set(random.sample(range(len(dataset)), num_train))
        training = [elem for idx, elem in enumerate(dataset)
                    if idx in trainIndices]
        testSet = [elem for idx, elem in enumerate(dataset)
                   if idx not in trainIndices]
        model = buildModel(training)
        testResults.append(test(model, testSet))
    return sum(testResults)/len(testResults)