# The Bristol Project 
## Model fitting
The project is framed as a multivariate regression task (for the moment). Although this may look more like a distribution of distributions which can all regress to the same answer. 

In this notebook the data will be loaded and explored 

In [1]:
import pandas as pd
import os
import historic_data_processing_functions
import logging
import betfairlightweight
from betfairlightweight import StreamListener
import bz2
import random
import numpy as np
from matplotlib import pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import StandardScaler
from sklearn import svm
import seaborn as sns 
from scipy import stats
import copy

### We trial building models for different features of a horse and see if they combine to be better than one big model!

In [9]:
def rms(y,y_pred):
    rms=np.sqrt(np.mean((y-y_pred)**2))
    return rms

def create_train_test(low, high, std, train_df, test_df):
    train_df = train_df.drop(["Unnamed: 0", "selection_ids", "market_id"], axis=1)
    train_df = train_df.dropna()

    train_df = train_df[(train_df["mean_300"] > low) & (train_df["mean_300"] <= high)]
    train_df = train_df[train_df["mean_14400"] > 0]
    train_df = train_df.drop(train_df[train_df["std_2700"] > std].index)



    test_df = test_df.drop(["Unnamed: 0", "selection_ids", "market_id"], axis=1)
    test_df = test_df[(test_df["mean_300"] > low) & (test_df["mean_300"] <= high)]
    test_df = test_df[test_df["mean_14400"] > 0 ]
    test_df = test_df.dropna()

    # below is a slight hack ... 
    test_df = test_df.drop(test_df[test_df["std_2700"] > std].index)


    mean_test_900 = copy.deepcopy(test_df["mean_900"].values)
    mean_test_300 = copy.deepcopy(test_df["mean_300"].values)

    # sort out our targets
    y_train_df = train_df["bsps"]
    x_train_df = train_df.drop(["bsps"], axis=1)
    y_test_df = test_df["bsps"]
    print(y_test_df.size)
    x_test_df = test_df.drop(["bsps"], axis=1)

    # Some feature engineering: 

    x_train_df["near_mean_delta"] = x_train_df["mean_300"] - x_train_df["mean_900"]
    x_train_df["mid_mean_delta"] = x_train_df["mean_300"] - x_train_df["mean_1800"]
    x_train_df["far_mean_delta"] = x_train_df["mean_300"] - x_train_df["mean_7200"]

    x_train_df["near_std_delta"] = x_train_df["std_300"] - x_train_df["std_900"]
    x_train_df["mid_std_delta"] = x_train_df["std_300"] - x_train_df["std_1800"]
    x_train_df["far_std_delta"] = x_train_df["std_300"] - x_train_df["std_7200"]

    x_test_df["near_mean_delta"] = x_test_df["mean_300"] - x_test_df["mean_900"]
    x_test_df["mid_mean_delta"] = x_test_df["mean_300"] - x_test_df["mean_1800"]
    x_test_df["far_mean_delta"] = x_test_df["mean_300"] - x_test_df["mean_7200"]

    x_test_df["near_std_delta"] = x_test_df["std_300"] - x_test_df["std_900"]
    x_test_df["mid_std_delta"] = x_test_df["std_300"] - x_test_df["std_1800"]
    x_test_df["far_std_delta"] = x_test_df["std_300"] - x_test_df["std_7200"]



    # apply scaling
    scaler = StandardScaler()
    clm=x_train_df.columns
    x_train_df = pd.DataFrame(scaler.fit_transform(x_train_df), columns=clm)
    x_test_df = pd.DataFrame(scaler.transform(x_test_df), columns=clm)
    
    return x_train_df, x_test_df, y_train_df, y_test_df, mean_test_300

def how_good(y_pred_test, mean_test_300, y_test_df):
    back_correct = 0
    lay_correct = 0
    incorrect = 0
    
    stake = 10
    back_correct_gap = 0
    lay_correct_gap = 0
    incorrect_gap = 0
    for i in range(len(y_pred_test)):
        if (y_pred_test[i] > mean_test_300[i]) and (y_test_df.values[i] > mean_test_300[i]):
            back_correct += 1
            back_correct_gap += ((y_test_df.values[i] - mean_test_300[i]))/y_test_df.values[i]
        elif (y_pred_test[i] > mean_test_300[i]) and (y_test_df.values[i] < mean_test_300[i]):
            incorrect += 1
            incorrect_gap += abs((y_test_df.values[i] - mean_test_300[i]))/y_test_df.values[i]
        elif (mean_test_300[i] <= stake):
            if (y_pred_test[i] < mean_test_300[i]) and (y_test_df.values[i] < mean_test_300[i]):
                lay_correct += 1
                lay_correct_gap += (y_test_df.values[i] - mean_test_300[i])/y_test_df.values[i]
            else:
                incorrect += 1
                incorrect_gap += abs((y_test_df.values[i] - mean_test_300[i]))/y_test_df.values[i]
    
    net_gap = back_correct_gap-lay_correct_gap-incorrect_gap
#     print("back_correct : ", back_correct)
#     print("lay_correct : ", lay_correct)
#     print("incorrect : ", incorrect)

#     print("back_correct gap : ", back_correct_gap)
#     print("lay_correct gap : ", lay_correct_gap)
#     print("incorrect gap: ", incorrect_gap)
    print("net_gap is: ", net_gap)
    
    return net_gap

def pred_and_results(x_train_df, x_test_df, y_train_df, y_test_df, model):
    y_pred_train = model.predict(x_train_df)
    y_pred_test = model.predict(x_test_df)

    results_train_mae = mae(y_train_df, y_pred_train)
    results_train_rms = rms(y_train_df, y_pred_train)

    results_test_mae = mae(y_test_df, y_pred_test)
    results_test_rms = rms(y_test_df, y_pred_test)
#     print("MAE train : ", results_train_mae)
#     print("RMS train : ", results_train_rms)
#     print("MAE test : ", results_test_mae)
#     print("RMS test : ", results_test_rms)

    return y_pred_test

def ablations(price_bounds, std_bounds, train_df, test_df):
    for std in std_bounds:
        print("STD IS : ", std)
        gap_summer = 0
        for i in range(len(price_bounds[1:])):
            low = price_bounds[i]
            high = price_bounds[i+1]
            print("prices: low ", low, "high", high)
            x_train_df, x_test_df, y_train_df, y_test_df, mean_test_300 = create_train_test(low,
                                                                             high,
                                                                             std,
                                                                             train_df,
                                                                             test_df)
            model = Ridge()
            model.fit(x_train_df, y_train_df)
            y_pred_test = pred_and_results(x_train_df, x_test_df, y_train_df, y_test_df, model)

            gap_summer += how_good(y_pred_test, mean_test_300, y_test_df)

        print("TOTAL GAP SUMMER IS ", gap_summer)
        print("------------------")


In [22]:
## Now demonstrate the ridgemodel predict ability

# fit linear regression model (for feature exploration and initial results)

# load in our trains and tests - in this case it learns on jan and feb and mar is the test
train_df1 = pd.read_csv("jan20_analysis_direct_nr0_100_50.csv")
train_df2 = pd.read_csv("feb20_analysis_direct_nr0_100_50.csv")
train_df3 = pd.read_csv("mar20_analysis_direct_nr0_100_50.csv")
train_df4 = pd.read_csv("may22_analysis_direct_nr0_100_50.csv")
train_df5 = pd.read_csv("jun22_analysis_direct_nr0_100_50.csv")

print("_----------  ----------__")
print("__________JULY___________")
print("_----------  ----------__")

frames = [
        train_df1,
          train_df2,
          train_df3,
          train_df4,
          train_df5
]
train_df = pd.concat(frames)
test_df = pd.read_csv("jul22_analysis_direct_nr0_100_50.csv")
price_bounds = [0, 50]
std_bounds = [1, 3, 5, 10, 100]

ablations(price_bounds, std_bounds, train_df, test_df)

#############################

print("_----------  ----------__")
print("__________JUNE___________")
print("_----------  ----------__")


frames = [
        train_df1,
          train_df2,
          train_df3,
          train_df4,
          #train_df5
]
train_df = pd.concat(frames)
test_df = pd.read_csv("jun22_analysis_direct_nr0_100_50.csv")
price_bounds = [0, 50]
std_bounds = [1, 3, 5, 10, 100]

ablations(price_bounds, std_bounds, train_df, test_df)
#################################
print("_----------  ----------__")
print("__________MAY___________")
print("_----------  ----------__")


frames = [
        train_df1,
          train_df2,
          train_df3,
          #train_df4,
          #train_df5
]
train_df = pd.concat(frames)
test_df = pd.read_csv("may22_analysis_direct_nr0_100_50.csv")



price_bounds = [0, 50]
std_bounds = [1, 3, 5, 10, 100]

ablations(price_bounds, std_bounds, train_df, test_df)

#################################
print("_----------  ----------__")
print("__________MARCH___________")
print("_----------  ----------__")


frames = [
        train_df1,
          train_df2,
          #train_df3,
          #train_df4,
          #train_df5
]
train_df = pd.concat(frames)
test_df = pd.read_csv("mar20_analysis_direct_nr0_100_50.csv")



price_bounds = [0, 50]
std_bounds = [1, 3, 5, 10, 100]

ablations(price_bounds, std_bounds, train_df, test_df)

#################################
print("_----------  ----------__")
print("__________FEB___________")
print("_----------  ----------__")


frames = [
        train_df1,
          #train_df2,
          #train_df3,
          #train_df4,
          #train_df5
]
train_df = pd.concat(frames)
test_df = pd.read_csv("mar20_analysis_direct_nr0_100_50.csv")



price_bounds = [0, 50]
std_bounds = [1, 3, 5, 10, 100]

ablations(price_bounds, std_bounds, train_df, test_df)


_----------  ----------__
__________JULY___________
_----------  ----------__
STD IS :  1
prices: low  0 high 50
3603
net_gap is:  20.33096649365433
TOTAL GAP SUMMER IS  20.33096649365433
------------------
STD IS :  3
prices: low  0 high 50
3764
net_gap is:  19.567385288192156
TOTAL GAP SUMMER IS  19.567385288192156
------------------
STD IS :  5
prices: low  0 high 50
3781
net_gap is:  27.805568401769165
TOTAL GAP SUMMER IS  27.805568401769165
------------------
STD IS :  10
prices: low  0 high 50
3784
net_gap is:  24.334923632848017
TOTAL GAP SUMMER IS  24.334923632848017
------------------
STD IS :  100
prices: low  0 high 50
3784
net_gap is:  25.240918224975417
TOTAL GAP SUMMER IS  25.240918224975417
------------------
_----------  ----------__
__________JUNE___________
_----------  ----------__
STD IS :  1
prices: low  0 high 50
3151
net_gap is:  13.892697844473673
TOTAL GAP SUMMER IS  13.892697844473673
------------------
STD IS :  3
prices: low  0 high 50
3321
net_gap is:  7.15

In [4]:
# See if our predictions are good for ridge


# Problem 4 - See what happens at the BSP
We want to check what happens at the BSP - so we can compare it against all the prices within the std from the means and hopefully gain a better indicator of value

So when the BSP is the available price, we want to check the average available money, and the weight of money at both 1 and 2 ticks - this might need to be standardized from selection traded?

Whenever the price changes from BSP, plot the change in selection traded and the change in weight of moneys so look for trends - again maybe standardize by selection traded?

### Why?
If the above problem doesn't give a good indicator of value, then we will need other indicators. Because of the assumption that the majority of traders are aware of the BSP we would like to look at the *behaviour* of money around this price. For instance, it might not be enough to just look at the volumes of price traded - because it doesn't tell us which side those in the know are trading on. But it is fair to assume that those in the know are most likely to be those who are queing money (punters just place bets). Therefore there may be some trends in this behaviors which will give better indications of where the BSP is if we look at all prices within the std range from problem 2.