# Testing framework - Benchmark model

Woodruff Wanderers - 1920 - 2,061 points  
Big Weapon - 1920 - 2,212 points  
Kebab DeBiryane - 1920 - 2,330 points

# Setting up

In [1]:
import pandas as pd

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.metrics import median_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from footbot.data.utils import set_up_bigquery, run_query
from footbot.research.utils.simulator import simulate_events

In [2]:
import logging

# Lower the root logger's threshold so INFO records reach the cell output.
logger = logging.getLogger()
logger.setLevel(level=logging.INFO)
# Keep the module-level call: logging.info() attaches a default handler when the
# root logger has none yet, and doubles as a visible smoke test.
logging.info(msg="test")

INFO:root:test


In [3]:
# Notebook display limits. The fully-qualified option names are used on purpose:
# bare patterns such as 'max_rows' / 'max_columns' also match the
# 'styler.render.*' options on newer pandas releases, which makes set_option
# raise OptionError because the pattern matches more than one key.
pd.set_option('display.max_colwidth', 60)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [4]:
# BigQuery client shared by every query and by the simulation further down.
# The service-account key is read from a path relative to this notebook.
SERVICE_ACCOUNT_PATH = '../../secrets/service_account.json'
client = set_up_bigquery(SERVICE_ACCOUNT_PATH)

INFO:footbot.data.utils:setting up BigQuery client


# Getting data

## Training data SQL

In [5]:
# SQL template for the training set. `get_data` fills the `{season}` and `{event}`
# placeholders with str.format before the query is run.
#
# Result: historic player-fixture rows dated before the event of interest (same
# season with event < {event}, or any earlier season), holding the target
# `total_points`, fixture-context flags and per-player rolling averages.
# The rolling windows use RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING over
# `element_event_rank`, so a row's own outcome is never part of its own features.
#
# NOTE(review): the was_late / was_early hour thresholds switch on the hard-coded
# window 2019-10-27..2020-03-29 (presumably the UK winter-time period of 2019/20);
# kick-offs outside that window always take the "NOT BETWEEN" branch - confirm this
# is acceptable for multi-season training data.
# NOTE(review): seasons are compared as quoted strings (`eg.season < '{season}'`),
# which assumes the season codes sort chronologically as text - confirm.
train_sql = \
'''
  -- training data
WITH
  teams AS (
    -- lookup for team names
  SELECT
    DISTINCT team,
    safe_team_name,
    season
  FROM
    `footbot-001.fpl.elements_all` ),
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
  element_gameweeks AS (
    -- historic player-fixture data as of event of interest
  SELECT
    eg.* EXCEPT(opponent_team),
    ot.safe_team_name AS opponent_team,
    e.element_all,
    e.safe_web_name,
    e.element_type,
    e.safe_team_name AS team,
  IF
    (EXTRACT(DAYOFWEEK
      FROM
        kickoff_time) = 1,
      1,
      0) AS was_sunday,
  IF
    (EXTRACT(DAYOFWEEK
      FROM
        kickoff_time) NOT IN (1,
        7),
      1,
      0) AS was_weekday,
  IF
    ((kickoff_time BETWEEN '2019-10-27'
        AND '2020-03-29'
        AND EXTRACT(HOUR
        FROM
          kickoff_time) > 15)
      OR (kickoff_time NOT BETWEEN '2019-10-27'
        AND '2020-03-29'
        AND EXTRACT(HOUR
        FROM
          kickoff_time) > 14),
      1,
      0) AS was_late,
  IF
    ((kickoff_time BETWEEN '2019-10-27'
        AND '2020-03-29'
        AND EXTRACT(HOUR
        FROM
          kickoff_time) < 15)
      OR (kickoff_time NOT BETWEEN '2019-10-27'
        AND '2020-03-29'
        AND EXTRACT(HOUR
        FROM
          kickoff_time) < 14),
      1,
      0) AS was_early,
    DENSE_RANK() OVER(PARTITION BY e.element_all ORDER BY eg.season, eg.event, eg.kickoff_time) AS element_event_rank
  FROM
    `footbot-001.fpl.element_gameweeks_all` AS eg
  INNER JOIN
    `footbot-001.fpl.elements_all` AS e
  ON
    eg.element = e.element
    AND eg.season = e.season
  INNER JOIN
    teams AS ot
  ON
    eg.opponent_team = ot.team
    AND eg.season = ot.season
  WHERE
    (eg.season = '{season}'
      AND eg.event < {event})
    OR (eg.season < '{season}') -- before event of interest
    ),
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
  elements AS (
    -- element features as of event of interest
  SELECT
    DISTINCT element_all,
    element_event_rank,
    element_type,
    team,
    value,
    AVG(total_points) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_total_points_element_p20,
    AVG(assists) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_assists_element_p20,
    AVG(clean_sheets) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_clean_sheets_element_p20,
    AVG(goals_conceded) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_goals_conceded_element_p20,
    AVG(saves) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_saves_element_p20,
    AVG(minutes) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_minutes_element_p20,
  FROM
    element_gameweeks )
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
SELECT
  eg.element_all,
  season,
  event,
  total_points,
  opponent_team,
  was_home,
  was_sunday,
  was_weekday,
  was_late,
  was_early,
  e.element_type,
  e.team,
  e.value,
  rolling_avg_total_points_element_p20,
  rolling_avg_assists_element_p20,
  rolling_avg_clean_sheets_element_p20,
  rolling_avg_goals_conceded_element_p20,
  rolling_avg_saves_element_p20,
  rolling_avg_minutes_element_p20
FROM
  element_gameweeks AS eg
LEFT JOIN
  elements AS e
ON
  eg.element_all = e.element_all
  AND eg.element_event_rank = e.element_event_rank
ORDER BY
  element_all,
  season,
  event
'''

## Prediction data SQL

In [6]:
# SQL template for the rows to be scored; `get_data` fills `{season}` and `{event}`
# with str.format before the query is run.
#
# - `fixtures`: fixtures of the given season from the event of interest onwards
#   (event >= {event}), read from the season-specific element_gameweeks table, with
#   the same day-of-week / kick-off-hour flags as the training query.
# - `element_gameweeks` / `elements`: player history reduced to each player's most
#   recent row (is_current = 1); the final join is on element_all only, so every
#   upcoming fixture of a player receives the same value and rolling averages.
#
# NOTE(review): the history filter here is `eg.event <= {event}` although its inline
# SQL comment says "before event of interest" and the training query uses `<` -
# confirm that including the event's own row is intended (its rolling averages
# still stop at 1 PRECEDING).
# NOTE(review): the kick-off-hour thresholds use the same hard-coded
# 2019-10-27..2020-03-29 window as the training query - confirm for other seasons.
predict_features_sql = \
'''
-- prediction data
WITH
  teams AS (
    -- lookup for team names
  SELECT
    DISTINCT team,
    safe_team_name,
    season
  FROM
    `footbot-001.fpl.elements_all` ),
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
  fixtures AS (
    -- fixture data known ahead of time for fixtures from event of interest onwards
  SELECT
    f.* EXCEPT(opponent_team),
    t.safe_team_name AS opponent_team
  FROM (
    SELECT
      element,
      event,
      fixture,
      opponent_team,
      was_home,
    IF
      (EXTRACT(DAYOFWEEK
        FROM
          kickoff_time) = 1,
        1,
        0) AS was_sunday,
    IF
      (EXTRACT(DAYOFWEEK
        FROM
          kickoff_time) NOT IN (1,
          7),
        1,
        0) AS was_weekday,
    IF
      ((kickoff_time BETWEEN '2019-10-27'
          AND '2020-03-29'
          AND EXTRACT(HOUR
          FROM
            kickoff_time) > 15)
        OR (kickoff_time NOT BETWEEN '2019-10-27'
          AND '2020-03-29'
          AND EXTRACT(HOUR
          FROM
            kickoff_time) > 14),
        1,
        0) AS was_late,
    IF
      ((kickoff_time BETWEEN '2019-10-27'
          AND '2020-03-29'
          AND EXTRACT(HOUR
          FROM
            kickoff_time) < 15)
        OR (kickoff_time NOT BETWEEN '2019-10-27'
          AND '2020-03-29'
          AND EXTRACT(HOUR
          FROM
            kickoff_time) < 14),
        1,
        0) AS was_early
    FROM (
      SELECT
        element,
        event,
        fixture,
        kickoff_time,
        opponent_team,
        was_home
      FROM
        `footbot-001.fpl.element_gameweeks_{season}`
      WHERE
        event >= {event} -- fixtures from event of interest onwards
        ) ) AS f
  INNER JOIN
    teams AS t
  ON
    f.opponent_team = t.team
    AND t.season = '{season}' ),
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
  element_gameweeks AS (
    -- historic player-fixture data as of event of interest
  SELECT
    eg.* EXCEPT(opponent_team),
    ot.safe_team_name AS opponent_team,
    e.element_all,
    e.safe_web_name,
    e.element_type,
    e.safe_team_name AS team,
    DENSE_RANK() OVER(PARTITION BY e.element_all ORDER BY eg.season, eg.event, eg.kickoff_time) AS element_event_rank
  FROM
    `footbot-001.fpl.element_gameweeks_all` AS eg
  INNER JOIN
    `footbot-001.fpl.elements_all` AS e
  ON
    eg.element = e.element
    AND eg.season = e.season
  INNER JOIN
    teams AS ot
  ON
    eg.opponent_team = ot.team
    AND eg.season = ot.season
  WHERE
    (eg.season = '{season}'
      AND eg.event <= {event})
    OR (eg.season < '{season}') -- before event of interest
    ),
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
  elements AS (
    -- element features as of event of interest
  SELECT
    *
  FROM (
    SELECT
      DISTINCT element_all,
      element_type,
      team,
      value,
      AVG(total_points) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_total_points_element_p20,
      AVG(assists) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_assists_element_p20,
      AVG(clean_sheets) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_clean_sheets_element_p20,
      AVG(goals_conceded) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_goals_conceded_element_p20,
      AVG(saves) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_saves_element_p20,
      AVG(minutes) OVER(PARTITION BY element_all ORDER BY element_event_rank RANGE BETWEEN 20 PRECEDING AND 1 PRECEDING) AS rolling_avg_minutes_element_p20,
      DENSE_RANK() OVER(PARTITION BY element_all ORDER BY element_event_rank DESC) AS is_current
    FROM
      element_gameweeks )
  WHERE
    is_current = 1 )
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
  --------------------------------------------------------------------------------------------------------------------------------------------------------------
SELECT
  a.element_all,
  season,
  f.event,
  fixture,
  opponent_team,
  was_home,
  was_sunday,
  was_weekday,
  was_late,
  was_early,
  e.element_type,
  e.team,
  e.value,
  rolling_avg_total_points_element_p20,
  rolling_avg_assists_element_p20,
  rolling_avg_clean_sheets_element_p20,
  rolling_avg_goals_conceded_element_p20,
  rolling_avg_saves_element_p20,
  rolling_avg_minutes_element_p20
FROM
  fixtures AS f
INNER JOIN
  `footbot-001.fpl.elements_all` AS a
ON
  f.element = a.element
  AND a.season = '{season}'
INNER JOIN
  elements AS e
ON
  a.element_all = e.element_all
ORDER BY
  element_all,
  season,
  event
'''

## Helpers

In [7]:
def get_data(sql, season, event, client):
    """Fill a SQL template's ``{season}`` / ``{event}`` placeholders and run it.

    Args:
        sql: query template containing ``{season}`` and ``{event}`` fields.
        season: value substituted for ``{season}``.
        event: value substituted for ``{event}``.
        client: BigQuery client handed through to ``run_query``.

    Returns:
        Whatever ``run_query`` returns for the formatted statement (the callers
        in this notebook use it as a DataFrame).
    """
    query = sql.format(event=event, season=season)
    result = run_query(query, client)
    return result

# Modelling points

In [8]:
def get_predictions_df(
    season,
    event,
    client
):
    """Fit the benchmark Lasso model as of ``event`` and score the prediction rows.

    Training rows come from ``train_sql`` and the rows to score from
    ``predict_features_sql``, both formatted with ``season`` and ``event``.

    Args:
        season: season code substituted into both SQL templates.
        event: event of interest substituted into both SQL templates.
        client: BigQuery client used to run the queries.

    Returns:
        A copy of the prediction-features frame with an added
        ``predicted_total_points`` column.
    """
    # Identifier columns: kept in the returned frame, never fed to the model.
    meta_data = ['element_all', 'season', 'event', 'fixture']

    # Columns that are one-hot encoded; everything else (bar the target) is numeric.
    categorical_features = [
        'opponent_team',
        'was_home',
        'was_sunday',
        'was_weekday',
        'was_late',
        'was_early',
        'element_type',
        'team',
    ]

    # errors='ignore' because the training query has no 'fixture' column.
    train_df = get_data(train_sql, season, event, client).drop(
        columns=meta_data, errors='ignore'
    )
    predict_features_df = get_data(predict_features_sql, season, event, client)

    non_numerical = set(categorical_features) | {'total_points'}
    numerical_features = [
        column for column in train_df.columns if column not in non_numerical
    ]

    # Numeric columns: impute (SimpleImputer defaults) then standardise.
    numerical_steps = [
        ('impute missing values', SimpleImputer()),
        ('scale numerical features', StandardScaler()),
    ]
    column_steps = [
        (
            'preprocess numerical features',
            Pipeline(numerical_steps),
            numerical_features,
        ),
        (
            'preprocess categorical features',
            # categories unseen during fit are ignored rather than raising
            OneHotEncoder(handle_unknown='ignore'),
            categorical_features,
        ),
    ]
    model = Pipeline(
        [
            ('pre-process features', ColumnTransformer(column_steps)),
            ('predictive model', Lasso(alpha=0.0020)),
        ]
    )

    target = train_df['total_points']
    features = train_df.drop(columns='total_points')
    model.fit(features, target)

    # Score on a copy so the queried frame itself is left untouched.
    predictions_df = predict_features_df.copy()
    scoring_features = predictions_df.drop(columns=meta_data)
    predictions_df['predicted_total_points'] = model.predict(scoring_features)

    return predictions_df

In [9]:
# One-off run of the benchmark model for season '1920', event 1, to eyeball its output.
predictions_df = get_predictions_df(season='1920', event=1, client=client)

In [10]:
# Inspect the scored rows; the rendered table is truncated by the pandas display limits.
predictions_df

Unnamed: 0,element_all,season,event,fixture,opponent_team,was_home,was_sunday,was_weekday,was_late,was_early,element_type,team,value,rolling_avg_total_points_element_p20,rolling_avg_assists_element_p20,rolling_avg_clean_sheets_element_p20,rolling_avg_goals_conceded_element_p20,rolling_avg_saves_element_p20,rolling_avg_minutes_element_p20,predicted_total_points
0,2,1920,1,8,man city,True,0,0,0,1,2,west ham,50,1.25,0.05,0.05,0.95,0.0,55.65,1.454544
1,2,1920,2,13,brighton,False,0,0,0,0,2,west ham,50,1.25,0.05,0.05,0.95,0.0,55.65,1.773725
2,2,1920,3,29,watford,False,0,0,0,0,2,west ham,50,1.25,0.05,0.05,0.95,0.0,55.65,1.756702
3,2,1920,4,40,norwich,True,0,0,0,0,2,west ham,50,1.25,0.05,0.05,0.95,0.0,55.65,1.966061
4,2,1920,5,41,aston villa,False,0,1,1,0,2,west ham,50,1.25,0.05,0.05,0.95,0.0,55.65,1.753316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20665,1445,1920,43,332,man utd,True,0,1,1,0,1,aston villa,45,,,,,,,1.168136
20666,1445,1920,44,341,crystal palace,True,1,0,0,1,1,aston villa,45,,,,,,,1.384517
20667,1445,1920,45,355,everton,False,0,1,1,0,1,aston villa,45,,,,,,,1.102822
20668,1445,1920,46,361,arsenal,True,0,1,1,0,1,aston villa,45,,,,,,,1.139456


# Evaluate model

## Simulate season

In [11]:
# Season and events to replay in the simulation.
season = '1920'
# Short debug run. Commented-out alternative event list kept for reference:
# list(range(1, 30)) + list(range(39, 48))
events = [1, 2, 3]

In [12]:
# Replay the selected events with the benchmark model. Each element of the
# returned list carries an 'event_points' entry, which is totalled in the next cell.
predictions_table = f'benchmark_model_{season}_debug'

simulation_results_arr = simulate_events(
    # what to simulate and which model supplies the predictions
    season=season,
    events=events,
    get_predictions_df=get_predictions_df,
    client=client,
    # where the predictions are written / read
    dataset='research',
    table=predictions_table,
    save_new_predictions=True,
    # weighting and transfer settings passed to the simulator
    events_to_look_ahead=0,
    first_team_factor=0.9,
    bench_factor=0.1,
    captain_factor=0.9,
    vice_factor=0.1,
    transfer_penalty=0,
    transfer_limit=1,
    # all of these event lists are left empty for this run
    wildcard_events=[],
    free_hit_events=[],
    triple_captain_events=[],
    bench_boost_events=[],
)

ERROR:footbot.data.utils:404 Not found: Table footbot-001:research.benchmark_model_1920_debug was not found in location US

(job ID: 6e978737-fb17-4d63-91d2-8acbe97be797)

                       -----Query Job SQL Follows-----                       

    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
   1:DELETE FROM `footbot-001.research.benchmark_model_1920_debug` WHERE true
    |    .    |    .    |    .    |    .    |    .    |    .    |    .    |
INFO:footbot.research.utils.simulator:writing predictions as of event 1
INFO:footbot.research.utils.simulator:writing predictions as of event 2
INFO:footbot.research.utils.simulator:writing predictions as of event 3
INFO:footbot.research.utils.simulator:simulating event 1
INFO:footbot.research.utils.simulator:simulating event 2
INFO:footbot.research.utils.simulator:simulating event 3


In [13]:
# Total points across all simulated events.
sum(event_result['event_points'] for event_result in simulation_results_arr)

160.0