In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

import os

from utils import (
    preprocess_observed_data,
    calculate_observed_variance,
    get_available_schedules,
    calculate_luck_variance_per_year,
    load_yaml,
)
from simulation_utils import (
    simulate_league,
    simulate_league_multiple_times,
    calculate_variance_of_simulated_leagues,
)

config = load_yaml()

results_df = pd.DataFrame(
    columns=["Sport", "#Teams (avg)", "#Seasons", "Variance Observed", "Variance Luck"]
)

In [2]:
NUMBER_OF_SIMULATIONS = config["General"]["number_of_simulations"]

In [3]:
def calculate_observed_and_luck_variance(
    prepared_data_path,
    schedule_directory,
    probabilities_win_loss_tie,
    points_for_win_loss_tie,
    number_of_simulations,
    type="teams",
):
    """
    Calculate the observed and luck variance.

    Parameters
    ----------
    prepared_data_path : str
        The path to the prepared data.
    schedule_directory : str
        The directory where the schedules are stored.
    probabilities_win_loss_tie : list
        The probabilities a team wins, loses or ties.
    points_for_win_loss_tie : list
        The points a team gets for a win, a loss and a tie.
    number_of_simulations : int
        The number of simulations.

    Returns
    -------
    pandas.DataFrame
        The dataframe containing the observed data with a new luck variance column that contains the luck variance for every year where a schedule is available.
    float
        The average observed variance.
    float
        The average variance of the luck simulation.
    """

    df_prepared = preprocess_observed_data(prepared_data_path, points_for_win_loss_tie)
    available_schedules = get_available_schedules(schedule_directory)
    df_prepared = calculate_luck_variance_per_year(
        df_prepared,
        available_schedules,
        probabilities_win_loss_tie,
        points_for_win_loss_tie,
        number_of_simulations,
        min_max_scaling=False,
        type=type,
    )

    # filter for values where simulation was possible
    df_prepared_filtered = df_prepared[~df_prepared["Luck_variance"].isnull()]
    n_years = len(df_prepared_filtered)
    n_teams = np.mean(df_prepared_filtered["#Teams"])

    observed_variance = df_prepared_filtered["Variance_observed"].mean()
    luck_variance = df_prepared_filtered["Luck_variance"].mean()

    return df_prepared_filtered, observed_variance, luck_variance, n_years, n_teams


def append_to_results_df(observed_variance, luck_variance, n_years, n_teams, sport):
    """
    Append the results to the results dataframe.

    Parameters
    ----------
    observed_variance : float
        The average observed variance.
    luck_variance : float
        The average variance of the luck simulation.
    n_years : int
        The number of years where a schedule was available.
    n_teams : int
        The number of teams.
    sport : str
        The name of the sport.

    Returns
    -------
    None.

    """
    global results_df
    # concat to results df
    tmp_df = pd.DataFrame(
        {
            "Sport": [sport],
            "#Teams (avg)": [n_teams],
            "#Seasons": [n_years],
            "Variance Observed": [observed_variance],
            "Variance Luck": [luck_variance],
        }
    )
    results_df = pd.concat([results_df, tmp_df], ignore_index=True)

### NBA

In [5]:
(
    df_prepared_nba,
    observed_variance_nba,
    luck_variance_nba,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["NBA"]["prepared_data_path"],
    config["NBA"]["schedule_directory"],
    config["NBA"]["probabilities_win_loss_tie"],
    config["NBA"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(observed_variance_nba, luck_variance_nba, n_years, n_teams, "NBA")

The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is not available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is not available.
The schedule for the year 2021 is available.


### NFL

In [6]:
(
    df_prepared_nfl,
    observed_variance_nfl,
    luck_variance_nfl,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["NFL"]["prepared_data_path"],
    config["NFL"]["schedule_directory"],
    config["NFL"]["probabilities_win_loss_tie"],
    config["NFL"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(observed_variance_nfl, luck_variance_nfl, n_years, n_teams, "NFL")

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


### NHL

In [7]:
(
    df_prepared_nhl,
    observed_variance_nhl,
    luck_variance_nhl,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["NHL"]["prepared_data_path"],
    config["NHL"]["schedule_directory"],
    config["NHL"]["probabilities_win_loss_tie"],
    config["NHL"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(observed_variance_nhl, luck_variance_nhl, n_years, n_teams, "NHL")

The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is not available.
The schedule for the year 2020 is not available.
The schedule for the year 2021 is not available.


### MLB

In [8]:
(
    df_prepared_mlb,
    observed_variance_mlb,
    luck_variance_mlb,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["MLB"]["prepared_data_path"],
    config["MLB"]["schedule_directory"],
    config["MLB"]["probabilities_win_loss_tie"],
    config["MLB"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(observed_variance_mlb, luck_variance_mlb, n_years, n_teams, "MLB")

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.
The schedule for the year 2022 is available.


### MLS

In [9]:
(
    df_prepared_mls,
    observed_variance_mls,
    luck_variance_mls,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["MLS"]["prepared_data_path"],
    config["MLS"]["schedule_directory"],
    config["MLS"]["probabilities_win_loss_tie"],
    config["MLS"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(observed_variance_mls, luck_variance_mls, n_years, n_teams, "MLS")

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is not available.
The schedule for the year 2021 is available.


#### PML

In [10]:
(
    df_prepared_pml,
    observed_variance_pml,
    luck_variance_pml,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["PML"]["prepared_data_path"],
    config["PML"]["schedule_directory"],
    config["PML"]["probabilities_win_loss_tie"],
    config["PML"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(observed_variance_pml, luck_variance_pml, n_years, n_teams, "PML")

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


#### Ligue 1

In [11]:
(
    df_prepared_ligue1,
    observed_variance_ligue1,
    luck_variance_ligue1,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["Ligue1"]["prepared_data_path"],
    config["Ligue1"]["schedule_directory"],
    config["Ligue1"]["probabilities_win_loss_tie"],
    config["Ligue1"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(
    observed_variance_ligue1, luck_variance_ligue1, n_years, n_teams, "Ligue1"
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


 #### SerieA

In [12]:
(
    df_prepared_serieA,
    observed_variance_serieA,
    luck_variance_serieA,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["SerieA"]["prepared_data_path"],
    config["SerieA"]["schedule_directory"],
    config["SerieA"]["probabilities_win_loss_tie"],
    config["SerieA"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(
    observed_variance_serieA, luck_variance_serieA, n_years, n_teams, "SerieA"
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


#### LaLiga

In [13]:
(
    df_prepared_laliga,
    observed_variance_laliga,
    luck_variance_laliga,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["LaLiga"]["prepared_data_path"],
    config["LaLiga"]["schedule_directory"],
    config["LaLiga"]["probabilities_win_loss_tie"],
    config["LaLiga"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(
    observed_variance_laliga, luck_variance_laliga, n_years, n_teams, "LaLiga"
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


#### Bundesliga

In [14]:
(
    df_prepared_bundesliga,
    observed_variance_bundesliga,
    luck_variance_bundesliga,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["Bundesliga"]["prepared_data_path"],
    config["Bundesliga"]["schedule_directory"],
    config["Bundesliga"]["probabilities_win_loss_tie"],
    config["Bundesliga"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
)
append_to_results_df(
    observed_variance_bundesliga,
    luck_variance_bundesliga,
    n_years,
    n_teams,
    "Bundesliga",
)

The schedule for the year 2003 is available.
The schedule for the year 2004 is available.
The schedule for the year 2005 is available.
The schedule for the year 2006 is available.
The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2020 is available.
The schedule for the year 2021 is available.


#### Climbing

In [15]:
(
    df_prepared_climbing,
    observed_variance_climbing,
    luck_variance_climbing,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["Climbing"]["prepared_data_path"],
    config["Climbing"]["schedule_directory"],
    config["Climbing"]["probabilities_win_loss_tie"],
    config["Climbing"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
    type="climbing",
)
append_to_results_df(
    observed_variance_climbing, luck_variance_climbing, n_years, n_teams, "Climbing"
)

The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2021 is available.
The schedule for the year 2022 is available.


#### Bouldering

In [16]:
(
    df_prepared_bouldering,
    observed_variance_bouldering,
    luck_variance_bouldering,
    n_years,
    n_teams,
) = calculate_observed_and_luck_variance(
    config["Bouldering"]["prepared_data_path"],
    config["Bouldering"]["schedule_directory"],
    config["Bouldering"]["probabilities_win_loss_tie"],
    config["Bouldering"]["points_for_win_loss_tie"],
    NUMBER_OF_SIMULATIONS,
    type="bouldering",
)
append_to_results_df(
    observed_variance_bouldering,
    luck_variance_bouldering,
    n_years,
    n_teams,
    "Bouldering",
)

The schedule for the year 2007 is available.
The schedule for the year 2008 is available.
The schedule for the year 2009 is available.
The schedule for the year 2010 is available.
The schedule for the year 2011 is available.
The schedule for the year 2012 is available.
The schedule for the year 2013 is available.
The schedule for the year 2014 is available.
The schedule for the year 2015 is available.
The schedule for the year 2016 is available.
The schedule for the year 2017 is available.
The schedule for the year 2018 is available.
The schedule for the year 2019 is available.
The schedule for the year 2021 is available.
The schedule for the year 2022 is available.


# Save Results

In [17]:
df_prepared_nba.to_parquet(config["NBA"]["results_path"])
df_prepared_nfl.to_parquet(config["NFL"]["results_path"])
df_prepared_nhl.to_parquet(config["NHL"]["results_path"])
df_prepared_mlb.to_parquet(config["MLB"]["results_path"])
df_prepared_mls.to_parquet(config["MLS"]["results_path"])
df_prepared_pml.to_parquet(config["PML"]["results_path"])
df_prepared_ligue1.to_parquet(config["Ligue1"]["results_path"])
df_prepared_serieA.to_parquet(config["SerieA"]["results_path"])
df_prepared_laliga.to_parquet(config["LaLiga"]["results_path"])
df_prepared_bundesliga.to_parquet(config["Bundesliga"]["results_path"])
df_prepared_climbing.to_parquet(config["Climbing"]["results_path"])
df_prepared_bouldering.to_parquet(config["Bouldering"]["results_path"])

# Calculating the luck contribution for each sport

### Calculation:

Based on [Classical Test Theory](https://en.wikipedia.org/wiki/Classical_test_theory) we assume that

$$ Observed Score = True Score + Error Score $$

In our case *True Score* reflects the skill component while the *Error Score* reflects the luck component.

Since we are assuming the two variables Luck and Skill to be independent we can estimate:

$$Var(Skill) =  Var(Observed) - Var(Luck) $$

We can estimate the contribution of skill (reliability) with:

$$ \rho_{OS}^2 = \frac{\sigma_S^2}{\sigma_O^2} = \frac{\sigma_O^2 - \sigma_L^2}{\sigma_O^2} = 1 - \frac{\sigma_L^2}{\sigma_O^2} $$ 

and the contribution of luck

$$ \rho_{OL}^2 = \frac{\sigma_L^2}{\sigma_O^2} $$ 


In [18]:
luck_contribution_nba = luck_variance_nba / observed_variance_nba
luck_contribution_nfl = luck_variance_nfl / observed_variance_nfl
luck_contribution_nhl = luck_variance_nhl / observed_variance_nhl
luck_contribution_mlb = luck_variance_mlb / observed_variance_mlb
luck_contribution_mls = luck_variance_mls / observed_variance_mls
luck_contribution_pml = luck_variance_pml / observed_variance_pml
luck_contribution_ligue1 = luck_variance_ligue1 / observed_variance_ligue1
luck_contribution_serieA = luck_variance_serieA / observed_variance_serieA
luck_contribution_laliga = luck_variance_laliga / observed_variance_laliga
luck_contribution_bundesliga = luck_variance_bundesliga / observed_variance_bundesliga
luck_contribution_climbing = luck_variance_climbing / observed_variance_climbing
luck_contribution_bouldering = luck_variance_bouldering / observed_variance_bouldering

print("NBA: ", luck_contribution_nba)
print("NFL: ", luck_contribution_nfl)
print("NHL: ", luck_contribution_nhl)
print("MLB: ", luck_contribution_mlb)
print("MLS: ", luck_contribution_mls)
print("PML: ", luck_contribution_pml)
print("Ligue1: ", luck_contribution_ligue1)
print("SerieA: ", luck_contribution_serieA)
print("LaLiga: ", luck_contribution_laliga)
print("Bundesliga: ", luck_contribution_bundesliga)
print("Climbing: ", luck_contribution_climbing)
print("Bouldering: ", luck_contribution_bouldering)

NBA:  0.13262363388657405
NFL:  0.41844917500213735
NHL:  0.4237682773880446
MLB:  0.29461530053317464
MLS:  0.7765685622157661
PML:  0.284220604142927
Ligue1:  0.4256139730337553
SerieA:  0.29857244076303663
LaLiga:  0.3307489008855544
Bundesliga:  0.3735740449101629
Climbing:  2.370963155795996
Bouldering:  0.795701693511433


In [19]:
results_df["Luck Contribution"] = (
    results_df["Variance Luck"] / results_df["Variance Observed"]
)
results_df.sort_values(by="Luck Contribution")

Unnamed: 0,Sport,#Teams (avg),#Seasons,Variance Observed,Variance Luck,Luck Contribution
1,NBA,30.0,16,614.527222,81.500833,0.132624
0,NBA,30.0,16,614.527222,82.276333,0.133886
6,PML,20.0,19,300.569342,85.428,0.284221
4,MLB,30.0,20,137.228333,40.429567,0.294615
8,SerieA,19.894737,19,288.607558,86.170263,0.298572
9,LaLiga,20.0,19,262.621711,86.861842,0.330749
10,Bundesliga,18.0,19,202.273717,75.564211,0.373574
2,NFL,32.0,19,9.619243,4.025164,0.418449
3,NHL,30.142857,14,187.313714,79.37761,0.423768
7,Ligue1,20.0,19,199.391579,84.863842,0.425614
