In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

import os

from utils import preprocess_observed_data, calculate_observed_variance, get_available_schedules, calculate_luck_variance_per_year, load_yaml
from simulation_utils import simulate_league, simulate_league_multiple_times, calculate_variance_of_simulated_leagues

# Project settings; later cells index this by "General" and by league key (e.g. "NBA").
config = load_yaml()

In [2]:
# Number of simulations, read from the "General" section of the config and
# passed unchanged to every league's calculation below.
NUMBER_OF_SIMULATIONS = config["General"]["number_of_simulations"]

In [3]:
def calculate_observed_and_luck_variance(prepared_data_path, schedule_directory, probabilities_win_loss_tie, points_for_win_loss_tie, number_of_simulations):
    """
    Build the per-league data frame and average its observed and luck variances.

    The observed data is preprocessed, the available schedules are looked up,
    and the luck variance is computed per year (without min-max scaling). The
    means of the ``Variance_observed`` and ``Luck_variance`` columns are then
    returned alongside the resulting frame.

    Parameters
    ----------
    prepared_data_path : str
        Path to the prepared (observed) data of the league.
    schedule_directory : str
        Directory in which the league's schedules are stored.
    probabilities_win_loss_tie : list
        Probabilities that a team wins, loses or ties a game.
    points_for_win_loss_tie : list
        Points awarded for a win, a loss and a tie.
    number_of_simulations : int
        Number of simulations to run.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``calculate_luck_variance_per_year``: the observed
        data extended by a ``Luck_variance`` column.
    float
        Mean of the ``Variance_observed`` column.
    float
        Mean of the ``Luck_variance`` column.

    Notes
    -----
    NOTE(review): pandas' ``mean`` skips missing values by default. If
    ``Luck_variance`` is missing for years without a schedule, the observed
    average covers more years than the luck average -- confirm against
    ``calculate_luck_variance_per_year`` that this is intended.
    """
    observed_data = preprocess_observed_data(prepared_data_path, points_for_win_loss_tie)
    schedules = get_available_schedules(schedule_directory)

    data_with_luck = calculate_luck_variance_per_year(
        observed_data,
        schedules,
        probabilities_win_loss_tie,
        points_for_win_loss_tie,
        number_of_simulations,
        min_max_scaling=False,
    )

    return (
        data_with_luck,
        data_with_luck["Variance_observed"].mean(),
        data_with_luck["Luck_variance"].mean(),
    )


### NBA

In [4]:
# NBA: run the observed/luck variance calculation with the league's config entries.
df_prepared_nba, observed_variance_nba, luck_variance_nba = calculate_observed_and_luck_variance(
    config["NBA"]["prepared_data_path"],
    config["NBA"]["schedule_directory"],
    config["NBA"]["probabilities_win_loss_tie"],
    config["NBA"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is not available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is not available.
The schedule for the year 2021 is available.


### NFL

In [5]:
# NFL: run the observed/luck variance calculation with the league's config entries.
df_prepared_nfl, observed_variance_nfl, luck_variance_nfl = calculate_observed_and_luck_variance(
    config["NFL"]["prepared_data_path"],
    config["NFL"]["schedule_directory"],
    config["NFL"]["probabilities_win_loss_tie"],
    config["NFL"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


### NHL

In [6]:
# NHL: run the observed/luck variance calculation with the league's config entries.
df_prepared_nhl, observed_variance_nhl, luck_variance_nhl = calculate_observed_and_luck_variance(
    config["NHL"]["prepared_data_path"],
    config["NHL"]["schedule_directory"],
    config["NHL"]["probabilities_win_loss_tie"],
    config["NHL"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is not available.
The schedule for the year 2020 is not available.
The schedule for the year 2021 is not available.


### MLB

In [7]:
# MLB: run the observed/luck variance calculation with the league's config entries.
df_prepared_mlb, observed_variance_mlb, luck_variance_mlb = calculate_observed_and_luck_variance(
    config["MLB"]["prepared_data_path"],
    config["MLB"]["schedule_directory"],
    config["MLB"]["probabilities_win_loss_tie"],
    config["MLB"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.
The schedule for the year 2022 is available.


### MLS

In [8]:
# MLS: run the observed/luck variance calculation with the league's config entries.
df_prepared_mls, observed_variance_mls, luck_variance_mls = calculate_observed_and_luck_variance(
    config["MLS"]["prepared_data_path"],
    config["MLS"]["schedule_directory"],
    config["MLS"]["probabilities_win_loss_tie"],
    config["MLS"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is not available.
The schedule for the year 2021 is available.


#### PML

In [9]:
# PML: run the observed/luck variance calculation with the league's config entries.
df_prepared_pml, observed_variance_pml, luck_variance_pml = calculate_observed_and_luck_variance(
    config["PML"]["prepared_data_path"],
    config["PML"]["schedule_directory"],
    config["PML"]["probabilities_win_loss_tie"],
    config["PML"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


#### Ligue 1

In [10]:
# Ligue 1: run the observed/luck variance calculation with the league's config entries.
df_prepared_ligue1, observed_variance_ligue1, luck_variance_ligue1 = calculate_observed_and_luck_variance(
    config["Ligue1"]["prepared_data_path"],
    config["Ligue1"]["schedule_directory"],
    config["Ligue1"]["probabilities_win_loss_tie"],
    config["Ligue1"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


#### SerieA

In [11]:
# Serie A: run the observed/luck variance calculation with the league's config entries.
df_prepared_serieA, observed_variance_serieA, luck_variance_serieA = calculate_observed_and_luck_variance(
    config["SerieA"]["prepared_data_path"],
    config["SerieA"]["schedule_directory"],
    config["SerieA"]["probabilities_win_loss_tie"],
    config["SerieA"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


#### LaLiga

In [12]:
# LaLiga: run the observed/luck variance calculation with the league's config entries.
df_prepared_laliga, observed_variance_laliga, luck_variance_laliga = calculate_observed_and_luck_variance(
    config["LaLiga"]["prepared_data_path"],
    config["LaLiga"]["schedule_directory"],
    config["LaLiga"]["probabilities_win_loss_tie"],
    config["LaLiga"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


#### Bundesliga

In [13]:
# Bundesliga: run the observed/luck variance calculation with the league's config entries.
df_prepared_bundesliga, observed_variance_bundesliga, luck_variance_bundesliga = calculate_observed_and_luck_variance(
    config["Bundesliga"]["prepared_data_path"],
    config["Bundesliga"]["schedule_directory"],
    config["Bundesliga"]["probabilities_win_loss_tie"],
    config["Bundesliga"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


# Calculating the luck contribution for each sport

### Calculation:

Based on [Classical Test Theory](https://en.wikipedia.org/wiki/Classical_test_theory) we assume that

$$ \text{Observed Score} = \text{True Score} + \text{Error Score} $$

In our case *True Score* reflects the skill component while the *Error Score* reflects the luck component.

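Since we assume that Luck and Skill are independent, their variances add up to the observed variance, $\mathrm{Var}(\text{Observed}) = \mathrm{Var}(\text{Skill}) + \mathrm{Var}(\text{Luck})$, so we can estimate:

$$ \mathrm{Var}(\text{Skill}) = \mathrm{Var}(\text{Observed}) - \mathrm{Var}(\text{Luck}) $$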

Writing $\sigma_O^2$, $\sigma_S^2$ and $\sigma_L^2$ for the observed, skill and luck variances, we can estimate the contribution of skill (reliability) with:

$$ \rho_{OS}^2 = \frac{\sigma_S^2}{\sigma_O^2} = \frac{\sigma_O^2 - \sigma_L^2}{\sigma_O^2} = 1 - \frac{\sigma_L^2}{\sigma_O^2} $$ 

and the contribution of luck with:

$$ \rho_{OL}^2 = \frac{\sigma_L^2}{\sigma_O^2} $$ 


In [14]:
# Luck contribution per league: mean luck variance divided by mean observed variance.
luck_contribution_nba = luck_variance_nba / observed_variance_nba
luck_contribution_nfl = luck_variance_nfl / observed_variance_nfl
luck_contribution_nhl = luck_variance_nhl / observed_variance_nhl
luck_contribution_mlb = luck_variance_mlb / observed_variance_mlb
luck_contribution_mls = luck_variance_mls / observed_variance_mls
luck_contribution_pml = luck_variance_pml / observed_variance_pml
luck_contribution_ligue1 = luck_variance_ligue1 / observed_variance_ligue1
luck_contribution_serieA = luck_variance_serieA / observed_variance_serieA
luck_contribution_laliga = luck_variance_laliga / observed_variance_laliga
luck_contribution_bundesliga = luck_variance_bundesliga / observed_variance_bundesliga

# Report one line per league, in the same order as the cells above.
for league_label, league_contribution in (
    ("NBA: ", luck_contribution_nba),
    ("NFL: ", luck_contribution_nfl),
    ("NHL: ", luck_contribution_nhl),
    ("MLB: ", luck_contribution_mlb),
    ("MLS: ", luck_contribution_mls),
    ("PML: ", luck_contribution_pml),
    ("Ligue1: ", luck_contribution_ligue1),
    ("SerieA: ", luck_contribution_serieA),
    ("LaLiga: ", luck_contribution_laliga),
    ("Bundesliga: ", luck_contribution_bundesliga),
):
    print(league_label, league_contribution)


NBA:  0.13855372775018832
NFL:  0.4175976746174232
NHL:  0.4026374640034991
MLB:  0.2964446117784229
MLS:  0.7946711836466658
PML:  0.28546403382703095
Ligue1:  0.4321493807477484
SerieA:  0.2945184928825171
LaLiga:  0.3309112316344553
Bundesliga:  0.3769332242207458
