In [11]:
import pandas as pd
import numpy as np
from collections import Counter
import os

# Validate the standings for correctness

In [12]:
def get_table(df, year):
    """Return the standings table for a single year.

    The rows of ``df`` whose ``'Year'`` equals ``year`` are selected. When
    exactly one row matches, its ``'Teams'``, ``'Wins'``, ``'Losses'`` and
    ``'Ties'`` cells are each converted to a list and become the columns of a
    new DataFrame (one row per list element). For any other number of
    matching rows the filtered frame is returned as it is.
    """
    season_rows = df[df['Year'] == year]
    if len(season_rows) != 1:
        return season_rows

    standings_columns = ('Teams', 'Wins', 'Losses', 'Ties')
    return pd.DataFrame(
        {column: list(season_rows[column].values[0]) for column in standings_columns}
    )

def check_every_team_has_the_same_amount_of_games(df):
    """Print every year in which teams' game totals differ.

    For each distinct value in ``df['Year']`` the table returned by
    ``get_table`` is used to compute, per row, the sum of the ``Wins``,
    ``Losses`` and ``Ties`` columns. A message is printed for each year in
    which the variance of those sums is greater than zero. Nothing is
    returned.
    """
    for season in df['Year'].unique():
        standings = get_table(df, season)
        games_per_team = standings[["Wins", "Losses", "Ties"]].sum(axis=1)
        if np.var(games_per_team) > 0.0:
            print(f"For year {season} teams do not have the same amount of games")
    

## Read in the data

In [13]:
# Load the prepared per-league data. The names on the left line up one-to-one,
# in order, with the parquet paths listed below.
df_nba, df_mlb, df_nfl, df_nhl, df_mls = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "prepared_data/NBA_data.parquet",
        "prepared_data/MLB_data.parquet",
        "prepared_data/NFL_data.parquet",
        "prepared_data/NHL_data.parquet",
        "prepared_data/MLS_data.parquet",
    )
)



### Check if every team has the same number of games in the standings

For some years it is expected that teams have a different number of games, so a year reported below is not necessarily an error in the data.

In [14]:
# Run the games-per-team check on each league's data. The league name is
# printed first so the years reported underneath it can be attributed.
for league_name, league_standings in (
    ("NBA", df_nba),
    ("MLB", df_mlb),
    ("NFL", df_nfl),
    ("NHL", df_nhl),
    ("MLS", df_mls),
):
    print(league_name)
    check_every_team_has_the_same_amount_of_games(league_standings)

NBA
For year 1949 teams do not have the same amount of games
For year 1950 teams do not have the same amount of games
For year 1952 teams do not have the same amount of games
For year 2012 teams do not have the same amount of games
For year 2017 teams do not have the same amount of games
MLB
For year 2003 teams do not have the same amount of games
For year 2004 teams do not have the same amount of games
For year 2006 teams do not have the same amount of games
For year 2007 teams do not have the same amount of games
For year 2008 teams do not have the same amount of games
For year 2009 teams do not have the same amount of games
For year 2011 teams do not have the same amount of games
For year 2013 teams do not have the same amount of games
For year 2015 teams do not have the same amount of games
For year 2016 teams do not have the same amount of games
For year 2018 teams do not have the same amount of games
For year 2019 teams do not have the same amount of games
For year 2020 teams do 

### Check schedules for each league

In [62]:
# Path of a single MLB schedule file.
# NOTE(review): no visible cell reads this module-level variable (the functions
# below take their own `file_path` parameter) — confirm whether it is still needed.
file_path = "prepared_data/schedules/MLB/MLB_schedule_2022.csv"

def list_all_csv_files_in_directory(directory):
    """Return the paths of all ``.csv`` entries directly inside ``directory``.

    Each returned path is ``directory`` joined with the entry name; entries
    whose name does not end in ``".csv"`` are skipped and sub-directories are
    not descended into.

    The result is sorted. ``os.listdir`` yields entries in arbitrary order, so
    sorting keeps the order in which schedules are processed and reported
    stable between runs and machines.
    """
    return sorted(
        os.path.join(directory, file_name)
        for file_name in os.listdir(directory)
        if file_name.endswith(".csv")
    )

def check_every_team_has_the_same_amount_of_games_in_schedule(file_path):
    """Return True when every team appears in the same number of games.

    The CSV at ``file_path`` is read with pandas and must contain ``Away`` and
    ``Home`` columns. A team's game count is the number of rows in which it is
    listed in either column.

    Returns:
        bool: True if all teams share one game count, False otherwise. A
        schedule without any rows has no teams and yields False.
    """
    df_schedule = pd.read_csv(file_path)

    # A team takes part in a game whenever it is the away or the home side.
    appearances = Counter(df_schedule["Away"].tolist())
    appearances.update(df_schedule["Home"].tolist())

    # Compare the integer counts directly rather than testing a floating-point
    # variance for exact equality with 0.0; this also avoids numpy's warning
    # about taking the variance of an empty sequence.
    return len(set(appearances.values())) == 1

def check_all_schedules_in_directory(directory):
    """Check every CSV schedule in ``directory`` and print the outcome.

    One line is printed for each file for which
    ``check_every_team_has_the_same_amount_of_games_in_schedule`` returns a
    falsy value. If no file is reported (which includes the case where the
    directory contains no CSV files), "All schedules are correct" is printed
    instead. Nothing is returned.
    """
    every_schedule_ok = True
    for schedule_path in list_all_csv_files_in_directory(directory):
        if check_every_team_has_the_same_amount_of_games_in_schedule(schedule_path):
            continue
        every_schedule_ok = False
        print(f"Schedule {schedule_path} does not have the same amount of games for all teams")

    if every_schedule_ok:
        print("All schedules are correct")

#### NBA


In [63]:
# Check every CSV schedule in the NBA directory; the helper prints either the
# files it flags or a single success message.
nba_schedule_directory = 'prepared_data/schedules/NBA'
check_all_schedules_in_directory(nba_schedule_directory)

All schedules are correct


#### NHL

In [64]:
# Check every CSV schedule in the NHL directory; the helper prints either the
# files it flags or a single success message.
nhl_schedule_directory = 'prepared_data/schedules/NHL'
check_all_schedules_in_directory(nhl_schedule_directory)

All schedules are correct


#### NFL

In [65]:
# Check every CSV schedule in the NFL directory; the helper prints either the
# files it flags or a single success message.
nfl_schedule_directory = 'prepared_data/schedules/NFL'
check_all_schedules_in_directory(nfl_schedule_directory)

All schedules are correct


#### MLB

In [66]:
# Check every CSV schedule in the MLB directory; the helper prints either the
# files it flags or a single success message.
mlb_schedule_directory = 'prepared_data/schedules/MLB'
check_all_schedules_in_directory(mlb_schedule_directory)

All schedules are correct


#### MLS

In [67]:
# Check every CSV schedule in the MLS directory; the helper prints either the
# files it flags or a single success message.
mls_schedule_directory = 'prepared_data/schedules/MLS'
check_all_schedules_in_directory(mls_schedule_directory)

Schedule prepared_data/schedules/MLS/MLS_2001.csv does not have the same amount of games for all teams
