# Replication

This notebook exactly replicates the analysis done for this project.

In [None]:
#imports
import pickle
import os
import glob
import numpy as np
from sklearn import linear_model
import pickle

In [None]:
#seasons to run the analysis on; each label is used as a directory name under data/
seasons = [
    '2005-06',
    '2006-07',
    '2007-08',
    '2008-09',
    '2009-10',
    '2010-11',
    '2011-12',
    '2012-13',
    '2013-14',
]

# The Regression

In [None]:
def train_linear(X, y, normalize = False):
    """Fit a scikit-learn ``LinearRegression`` on ``(X, y)`` and return it.

    Parameters:
        X: training features, passed unchanged to ``fit``.
        y: training targets, passed unchanged to ``fit``.
        normalize: forwarded to the ``LinearRegression`` constructor.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # NOTE(review): newer scikit-learn releases no longer accept the
    # ``normalize`` constructor argument - confirm the pinned version.
    # ``fit`` returns the estimator itself, so the call can be chained.
    return linear_model.LinearRegression(normalize=normalize).fit(X, y)

# The Error Functions

In [None]:
#compute the mean and the maximum absolute prediction error of a fitted model
def error(model, X, y):
    """Return ``(mean_abs_error, max_abs_error)`` of ``model`` on ``(X, y)``.

    Parameters:
        model: any object exposing ``predict(X)`` whose result can be
            enumerated and has a ``shape`` attribute.
        X: inputs handed to ``model.predict``.
        y: true targets, indexed positionally (``y[i]`` is compared with
            the i-th prediction).

    Returns:
        A 2-tuple: the sum of absolute errors divided by the number of
        predictions, and the largest absolute error seen (``0.`` when no
        error exceeds zero).

    Raises:
        ZeroDivisionError: if ``model.predict`` returns no predictions.
    """
    predictions = model.predict(X)
    total_error = 0.
    max_error = 0.
    for i, prediction in enumerate(predictions):
        # Named ``abs_error`` so the local does not shadow this function.
        abs_error = abs(prediction - y[i])
        total_error += abs_error
        if abs_error > max_error:
            max_error = abs_error

    return total_error/predictions.shape[0], max_error

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


#all but one fold error over seasons using the given average type (raw, sliding, ...)
#Pass verbose=True to see detail for each fold
def ABOF_error(seasons, average_type = "raw", weight = "", verbose = False):
    """Hold out each season in turn, train on the rest, and average the errors.

    For every season the features and targets are read from
    ``data/<season>/averages/<average_type>_X<weight>.pkl`` and
    ``..._y<weight>.pkl``. Each season is then used once as the test set
    while two linear models (without and with ``normalize``) are trained on
    the concatenation of all the other seasons.

    Parameters:
        seasons: sequence of season directory names.
        average_type: file-name prefix selecting which averages to load.
        weight: file-name suffix appended after ``_X`` / ``_y``.
        verbose: when true, also print the per-fold errors.

    Returns:
        ``(result, resultn)`` where each is a 2-tuple of the per-fold
        average error and the per-fold max error, both averaged over all
        seasons; ``resultn`` is for the model trained with
        ``normalize=True``.
    """
    Xs = []
    ys = []
    for season in seasons:
        prefix = os.sep.join(['data', season, 'averages', average_type])
        # ``with`` inside the helper closes each file instead of leaking it.
        Xs.append(_load_pickle(prefix + '_X' + weight + '.pkl'))
        ys.append(_load_pickle(prefix + '_y' + weight + '.pkl'))

    avg_error = 0.
    avg_max = 0.

    avg_errorn = 0.
    avg_maxn = 0.

    for i, season in enumerate(seasons):
        if verbose:
            print("Testing on season %s (Training on the rest)" % season)

        # Season i is the test fold; every other season forms the training set.
        testX, testy = Xs[i], ys[i]
        trainX = np.concatenate(Xs[:i] + Xs[i + 1:])
        trainy = np.concatenate(ys[:i] + ys[i + 1:])

        model = train_linear(trainX, trainy)
        modeln = train_linear(trainX, trainy, True)
        err = error(model, testX, testy)
        errn = error(modeln, testX, testy)

        avg_error += err[0]
        avg_errorn += errn[0]
        avg_max += err[1]
        avg_maxn += errn[1]

        if verbose:
            print("error for this season is %s" % (err,))
            print("error for this season with normalized features is %s" % (errn,))

    result = avg_error/len(seasons), avg_max/len(seasons)
    resultn = avg_errorn/len(seasons), avg_maxn/len(seasons)
    # Single-argument parenthesized print behaves the same on Python 2 and 3.
    print("Average error and Averaged max error over all seasons is %s" % (result,))
    print("Average error and Averaged max error over all seasons with normalized features is %s" % (resultn,))

    return result, resultn

# Results 

In [None]:
#Takes about a minute
#Each print takes one parenthesized argument so the output is the same on
#Python 2 and Python 3.
print("RAW")
ABOF_error(seasons)
print("\n")
print("SLIDING")
ABOF_error(seasons, "sliding")
print("\n")
print("SLIDING with weight 2")
ABOF_error(seasons, "sliding", "2")
print("\n")
print("DOUBLE INSTANCES")
#Left as the final bare expression so the notebook still displays its value
ABOF_error(seasons, "sliding_loc")