In [1]:
import pandas as pd
import numpy as np
import sys
import os
import bs4
from IPython.display import display_html,clear_output, HTML
import re
from datetime import datetime
import ast
import itertools
from tqdm import tqdm,trange
from file_tools import *
from request_tools import *
from parse_tools import *
from load_tools import *
import pyarrow as pa

# Notebook-wide pandas settings: never truncate columns, show 30 rows when a
# frame's repr is truncated, and do not use numexpr for computations.
pd.options.display.max_columns = None
pd.options.display.min_rows = 30
pd.options.compute.use_numexpr = False

# Optional warning suppression, left disabled:
# import warnings
# from pandas.errors import PerformanceWarning
# pd.options.mode.use_inf_as_na = True
# warnings.filterwarnings('ignore', category=PerformanceWarning)

# Initial source/target folders; later cells assign SRC_DIR and TGT_DIR again.
SRC_DIR, TGT_DIR = './data-indexed', './data-aggregated'

In [None]:
def __load_team_stats_df__(filename):
    return pd.read_csv(filename,index_col=[0,1],header=[0,1]).drop(columns='Source',level=0)

def __load_facts_df__(filename):
    df = pd.read_csv(filename,index_col=[0,1],header=[0])
    df.columns = pd.MultiIndex.from_product([['Match Info'],df.columns])
    return df

def __get_opp_stats_row_df__(STATS_DF,IDX_DF,TM_IDX):
    OPP_IDX         = IDX_DF.loc[TM_IDX]
    OPP_STATS_DF    = STATS_DF.loc[OPP_IDX].to_frame().T
    OPP_STATS_DF.index = pd.MultiIndex.from_tuples([TM_IDX],names=['index','Team_id'])
    return OPP_STATS_DF

def __get_opp_stats_df__(STATS_DF,IDX_DF):
    """Build the opponent-side counterpart of ``STATS_DF``.

    For every row key in ``STATS_DF.index`` the opponent's row is fetched via
    ``__get_opp_stats_row_df__`` and the one-row frames are stacked in the
    same order.

    Args:
        STATS_DF: Stats table whose index supplies the team row keys.
        IDX_DF: Team-key to opponent-key lookup passed through to
            ``__get_opp_stats_row_df__``.

    Returns:
        A DataFrame with one row per row of ``STATS_DF``, each holding the
        opponent's values under the team's own key.
    """
    opponent_rows = [
        __get_opp_stats_row_df__(STATS_DF, IDX_DF, team_key)
        for team_key in STATS_DF.index
    ]
    return pd.concat(opponent_rows, axis=0)

# Folders for the season inspected in the cells below.
# NOTE: SRC_DIR is reassigned here, replacing the value set in the first cell.
IDX_DIR = './05-data-indexed/'
SRC_DIR = './06-data-normalized-reindexed/'
LG_SS_DIR = '/leagues/NBA_2020/'
# Show what is available for this season. get_all_files is not defined in this
# notebook — presumably it comes from one of the star-imported tool modules;
# TODO confirm.
print(get_all_files(SRC_DIR+LG_SS_DIR))

# Index lookup for the season; the 'Opp_Curr_Gm' entry is what the opponent
# cells pass to __get_opp_stats_df__ as IDX_DF.
TM_OPP_MN_IDX = load_index_dict(IDX_DIR+LG_SS_DIR)['team_opp_main']
OPP_IDX_DF = TM_OPP_MN_IDX['Opp_Curr_Gm']

# Despite the *_DIR suffix, the names below are CSV base names: the loading
# cell appends '.csv' to each of them.

# Facts tables.
TGL_FACTS_GM_RESULTS_DIR = 'facts_gm_results_tgl_basic'
TGL_FACTS_VEN_REST_DIR = 'facts_venue_rest_days_tgl_basic'

# Basic stats tables.
TGL_BAS_NORM_CUMU_AVG_DIR = 'norm_minmax_tgl_basic_stats_cumu_avg'
TGL_BAS_NORM_VEN_CUMU_AVG_DIR = 'norm_minmax_tgl_basic_stats_ven_cumu_avg'
TGL_BAS_NORM_ROLL_08_AVG_DIR = 'norm_minmax_tgl_basic_stats_roll_08_avg'

# Advanced stats tables.
TGL_ADV_NORM_CUMU_AVG_DIR = 'norm_minmax_tgl_advanced_stats_cumu_avg'
TGL_ADV_NORM_VEN_CUMU_AVG_DIR = 'norm_minmax_tgl_advanced_stats_ven_cumu_avg'
TGL_ADV_NORM_ROLL_08_AVG_DIR = 'norm_minmax_tgl_advanced_stats_roll_08_avg'

Team stats and facts

In [None]:
# Facts tables (single header row). TGL_FACTS_GM_RESULTS_DF is our label dataframe.
TGL_FACTS_GM_RESULTS_DF, TGL_FACTS_VEN_REST_DF = [
    __load_facts_df__(f'{SRC_DIR}/{LG_SS_DIR}/{facts_name}.csv')
    for facts_name in (
        TGL_FACTS_GM_RESULTS_DIR,
        TGL_FACTS_VEN_REST_DIR,
    )
]

# Stats tables (two header rows), loaded in the order basic then advanced.
(
    TGL_BAS_NORM_CUMU_AVG_DF,
    TGL_BAS_NORM_VEN_CUMU_AVG_REIDX_DF,
    TGL_BAS_NORM_ROLL_08_AVG_REIDX_DF,
    TGL_ADV_NORM_CUMU_AVG_REIDX_DF,
    TGL_ADV_NORM_VEN_CUMU_AVG_REIDX_DF,
    TGL_ADV_NORM_ROLL_08_AVG_REIDX_DF,
) = [
    __load_team_stats_df__(f'{SRC_DIR}/{LG_SS_DIR}/{stats_name}.csv')
    for stats_name in (
        TGL_BAS_NORM_CUMU_AVG_DIR,
        TGL_BAS_NORM_VEN_CUMU_AVG_DIR,
        TGL_BAS_NORM_ROLL_08_AVG_DIR,
        TGL_ADV_NORM_CUMU_AVG_DIR,
        TGL_ADV_NORM_VEN_CUMU_AVG_DIR,
        TGL_ADV_NORM_ROLL_08_AVG_DIR,
    )
]

Opponent stats

In [None]:
# Opponent-side view of the venue/rest-days facts table.
TGL_OPP_FACTS_VEN_REST_DF = __get_opp_stats_df__(
    TGL_FACTS_VEN_REST_DF,
    TM_OPP_MN_IDX['Opp_Curr_Gm'],
)

# Opponent-side view of every stats table, in the order basic then advanced.
(
    TGL_OPP_BAS_NORM_CUMU_AVG_DF,
    TGL_OPP_BAS_NORM_VEN_CUMU_AVG_REIDX_DF,
    TGL_OPP_BAS_NORM_ROLL_08_AVG_REIDX_DF,
    TGL_OPP_ADV_NORM_CUMU_AVG_REIDX_DF,
    TGL_OPP_ADV_NORM_VEN_CUMU_AVG_REIDX_DF,
    TGL_OPP_ADV_NORM_ROLL_08_AVG_REIDX_DF,
) = [
    __get_opp_stats_df__(team_stats_df, OPP_IDX_DF)
    for team_stats_df in (
        TGL_BAS_NORM_CUMU_AVG_DF,
        TGL_BAS_NORM_VEN_CUMU_AVG_REIDX_DF,
        TGL_BAS_NORM_ROLL_08_AVG_REIDX_DF,
        TGL_ADV_NORM_CUMU_AVG_REIDX_DF,
        TGL_ADV_NORM_VEN_CUMU_AVG_REIDX_DF,
        TGL_ADV_NORM_ROLL_08_AVG_REIDX_DF,
    )
]

Combined

In [7]:
import yaml

def load_yaml(file_path):
    """Read a YAML document from ``file_path`` and return the parsed object.

    The file is decoded as UTF-8 explicitly. Without an ``encoding`` argument
    ``open`` falls back to the platform's locale encoding, which can mis-decode
    non-ASCII characters in a YAML file (notably on Windows).

    Parsing uses ``yaml.safe_load``, so only plain YAML data types are
    constructed.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` (``None`` for an empty
        document).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)


def save_yaml(file_path, config):
    """Write ``config`` to ``file_path`` as block-style YAML.

    The target file is opened in write mode, so any existing content is
    replaced.

    Args:
        file_path: Path of the YAML file to write.
        config: Object to serialise with ``yaml.dump``.
    """
    with open(file_path, 'w') as handle:
        yaml.dump(config, stream=handle, default_flow_style=False)

        
# Example usage
# Load the settings from 'config.yaml' (resolved against the current working
# directory) into `config`, which the dataset-compilation cell reads, and
# echo the parsed content for inspection.
config = load_yaml('config.yaml')
print(config)


{'data_dir': {'data_facts_dir': './00-data-facts', 'data_html_dir': './01-data-html', 'data_parsed_dir': './02-data-parsed', 'data_aggregated_dir': './03-data-aggregated', 'data_normalized_dir': './04-data-normalized', 'data_indexed_dir': './05-data-indexed', 'data_reindexed_dir': './06-data-normalized-reindexed', 'data_compiled_dir': './07-data-compiled'}, 'src_data': {'src_dir': './06-data-normalized-reindexed', 'seasons': [None, None], 'features': ['facts_venue_rest_days_tgl_basic', 'norm_minmax_tgl_basic_stats_cumu_avg', 'norm_minmax_tgl_basic_stats_ven_cumu_avg', 'norm_minmax_tgl_basic_stats_roll_08_avg', 'norm_minmax_tgl_advanced_stats_cumu_avg', 'norm_minmax_tgl_advanced_stats_ven_cumu_avg', 'norm_minmax_tgl_advanced_stats_roll_08_avg'], 'label': 'facts_gm_results_tgl_basic', 'opponent': True}, 'tgt_data': {'tgt_dir': './07-data-compiled', 'dataset_name': 'compiled-dataset'}}


In [None]:
# Compile one season's feature and label tables and write them under TGT_DIR.
#
# Settings are read from the nested layout that the config cell prints:
# config['data_dir'], config['src_data'] and config['tgt_data']. The flat keys
# used previously ('src_dir', 'tgt_dir', 'idx_dir', ...) are not present in
# that layout and raised KeyError.
_src_cfg = config['src_data']
SRC_DIR = _src_cfg['src_dir']
TGT_DIR = config['tgt_data']['tgt_dir']
IDX_DIR = config['data_dir']['data_indexed_dir']
LG_SS_START, LG_SS_END = _src_cfg['seasons']  # read but not used below
opponent = _src_cfg['opponent']               # also add opponent-side tables?
SRC_DATATSETS_NAMES = _src_cfg['features']    # CSV base names of the feature tables
LABEL = _src_cfg['label']                     # CSV base name of the label table

# The season range from the config is not wired in, so the season folder
# stays hard-coded.
LG_SS_DIR = '/leagues/NBA_2023/'

# Team-key -> opponent-key lookup for the season.
OPP_IDX_DF = load_index_dict(IDX_DIR+LG_SS_DIR)['team_opp_main']['Opp_Curr_Gm']

DATASETS = {}
for SRC_DATASET_NAME in SRC_DATATSETS_NAMES:
    SRC_DATASET_DIR = f'{SRC_DIR}/{LG_SS_DIR}/{SRC_DATASET_NAME}'
    # Facts files and stats files need different loaders; everything else is
    # the same for both kinds of table.
    if 'facts' in SRC_DATASET_NAME:
        TM_DATASET_DF = __load_facts_df__(f'{SRC_DATASET_DIR}.csv')
    else:
        TM_DATASET_DF = __load_team_stats_df__(f'{SRC_DATASET_DIR}.csv')
    DATASETS[f'Team_{SRC_DATASET_NAME}'] = TM_DATASET_DF
    if opponent:
        OPP_DATASET_DF = __get_opp_stats_df__(TM_DATASET_DF, OPP_IDX_DF)
        DATASETS[f'Opp_{SRC_DATASET_NAME}'] = OPP_DATASET_DF

# Side-by-side concatenation; the DATASETS keys become the outermost column
# level, in insertion order.
FEATURES_DF = pd.concat(list(DATASETS.values()), keys=list(DATASETS), axis=1)
LABELS_DF = __load_facts_df__(f'{SRC_DIR}/{LG_SS_DIR}/{LABEL}.csv')

make_directory(f'{TGT_DIR}/{LG_SS_DIR}')
FEATURES_DF.to_csv(f'{TGT_DIR}/{LG_SS_DIR}/features.csv')
LABELS_DF.to_csv(f'{TGT_DIR}/{LG_SS_DIR}/labels.csv')

In [None]:
# Summary statistics over the feature rows that contain no missing values.
FEATURES_DF.dropna().describe()

In [None]:
# Feature rows that contain no missing values.
FEATURES_DF.dropna()

In [None]:
# Label table loaded by the compilation cell.
LABELS_DF

In [None]:
# Rebuild the combined feature frame from DATASETS (keys become the outermost
# column level) and reload the label CSV.
# NOTE(review): the labels are stored here as LABEL_DF, whereas the
# compilation cell stores them as LABELS_DF — confirm which name is intended.
FEATURES_DF = pd.concat(
    list(DATASETS.values()),
    keys=list(DATASETS.keys()),
    axis=1,
)
LABEL_DF = __load_facts_df__(f'{SRC_DIR}/{LG_SS_DIR}/{LABEL}.csv')

In [None]:
# Combine the venue/rest-days facts with three team+opponent stats frames
# side by side; `keys` labels each input as the outermost column level.
# NOTE(review): the three *_TM_OPP_DF_REIDX frames are only assigned in a cell
# further down the notebook, so this cell fails on a fresh top-to-bottom run.
# The same statement is repeated after that cell.
TGL_ALL_DF = pd.concat([TGL_FACTS_VEN_REST_DF,
                        TGL_BAS_ADV_STATS_CUMU_AVG_NORM_TM_OPP_DF_REIDX,
                        TGL_BAS_ADV_STATS_VEN_CUMU_AVG_NORM_TM_OPP_DF_REIDX,
                        TGL_BAS_ADV_STATS_ROLL_08_AVG_NORM_TM_OPP_DF_REIDX],axis=1,
                        keys = ['facts_venue_rest_days',
                                'stats_cumu_avg',
                                'stats_venue_cumu_avg',
                                'stats_roll_08_avg'])

In [None]:
# Opponent-side basic cumulative-average stats built in the opponents cell.
TGL_OPP_BAS_NORM_CUMU_AVG_DF

In [None]:

# TGL_BAS_ADV_STATS_VEN_CUMU_AVG_NORM_TM_OPP_DF_REIDX = get_team_opp_stats_df(TGL_BAS_ADV_STATS_VEN_CUMU_AVG_NORM_DF_REIDX,IDX_DICT['team_opp_games'])
# TGL_BAS_ADV_STATS_ROLL_08_AVG_NORM_TM_OPP_DF_REIDX = get_team_opp_stats_df(TGL_BAS_ADV_STATS_ROLL_08_AVG_NORM_DF_REIDX,IDX_DICT['team_opp_games'])

In [None]:
# Legacy pipeline: reindex each stats table, join basic with advanced stats,
# attach opponent stats, then load the facts tables.
# NOTE(review): IDX_DICT, __reindex_stats__, get_team_opp_stats_df,
# __load_season_facts_from_dir__ and the TGL_*_STATS_*_NORM_DF inputs are not
# defined in any cell of this notebook. They may come from the star-imported
# tool modules or from leftover kernel state — TODO confirm before relying on
# this cell.
TM_OPP_MN_IDX = IDX_DICT['team_opp_main']
# Reindex every stats table with a column of TM_OPP_MN_IDX; the venue tables
# use 'Team_Ven_Prev_Gm_01', the others 'Team_Prev_Gm_01'.
TGL_BAS_STATS_CUMU_AVG_NORM_DF_REIDX = __reindex_stats__(TGL_BAS_STATS_CUMU_AVG_NORM_DF,TM_OPP_MN_IDX['Team_Prev_Gm_01'])
TGL_BAS_STATS_VEN_CUMU_AVG_NORM_DF_REIDX = __reindex_stats__(TGL_BAS_STATS_VEN_CUMU_AVG_NORM_DF,TM_OPP_MN_IDX['Team_Ven_Prev_Gm_01'])
TGL_BAS_STATS_ROLL_08_AVG_NORM_DF_REIDX = __reindex_stats__(TGL_BAS_STATS_ROLL_08_AVG_NORM_DF,TM_OPP_MN_IDX['Team_Prev_Gm_01'])
TGL_ADV_STATS_CUMU_AVG_NORM_DF_REIDX = __reindex_stats__(TGL_ADV_STATS_CUMU_AVG_NORM_DF,TM_OPP_MN_IDX['Team_Prev_Gm_01'])
TGL_ADV_STATS_VEN_CUMU_AVG_NORM_DF_REIDX = __reindex_stats__(TGL_ADV_STATS_VEN_CUMU_AVG_NORM_DF,TM_OPP_MN_IDX['Team_Ven_Prev_Gm_01'])
TGL_ADV_STATS_ROLL_08_AVG_NORM_DF_REIDX = __reindex_stats__(TGL_ADV_STATS_ROLL_08_AVG_NORM_DF,TM_OPP_MN_IDX['Team_Prev_Gm_01'])

# Place basic and advanced stats side by side for each aggregation flavour.
TGL_BAS_ADV_STATS_CUMU_AVG_NORM_DF_REIDX = pd.concat([TGL_BAS_STATS_CUMU_AVG_NORM_DF_REIDX,TGL_ADV_STATS_CUMU_AVG_NORM_DF_REIDX],axis=1)
TGL_BAS_ADV_STATS_VEN_CUMU_AVG_NORM_DF_REIDX = pd.concat([TGL_BAS_STATS_VEN_CUMU_AVG_NORM_DF_REIDX,TGL_ADV_STATS_VEN_CUMU_AVG_NORM_DF_REIDX],axis=1)
TGL_BAS_ADV_STATS_ROLL_08_AVG_NORM_DF_REIDX = pd.concat([TGL_BAS_STATS_ROLL_08_AVG_NORM_DF_REIDX,TGL_ADV_STATS_ROLL_08_AVG_NORM_DF_REIDX],axis=1)

# Build the team+opponent frames from the combined stats and the
# 'team_opp_games' index.
TGL_BAS_ADV_STATS_CUMU_AVG_NORM_TM_OPP_DF_REIDX = get_team_opp_stats_df(TGL_BAS_ADV_STATS_CUMU_AVG_NORM_DF_REIDX,IDX_DICT['team_opp_games'])
TGL_BAS_ADV_STATS_VEN_CUMU_AVG_NORM_TM_OPP_DF_REIDX = get_team_opp_stats_df(TGL_BAS_ADV_STATS_VEN_CUMU_AVG_NORM_DF_REIDX,IDX_DICT['team_opp_games'])
TGL_BAS_ADV_STATS_ROLL_08_AVG_NORM_TM_OPP_DF_REIDX = get_team_opp_stats_df(TGL_BAS_ADV_STATS_ROLL_08_AVG_NORM_DF_REIDX,IDX_DICT['team_opp_games'])

# Reload both facts tables and nest their columns under ('Match', 'Results')
# and ('Match', 'Info'). This overwrites the frames of the same names loaded
# earlier with __load_facts_df__.
TGL_FACTS_GM_RESULTS_DF = __load_season_facts_from_dir__(os.path.join(SRC_DIR,LG_SS_DIR,TGL_FACTS_GM_RESULTS_DIR))
TGL_FACTS_VEN_REST_DF = __load_season_facts_from_dir__(os.path.join(SRC_DIR,LG_SS_DIR,TGL_FACTS_VEN_REST_DIR))
TGL_FACTS_VEN_REST_DF.columns = pd.MultiIndex.from_product([['Match'],['Info'], TGL_FACTS_VEN_REST_DF.columns]) # So that it can be concatenated with TGL_FACTS_GM_RESULTS_DF
TGL_FACTS_GM_RESULTS_DF.columns = pd.MultiIndex.from_product([['Match'],['Results'], TGL_FACTS_GM_RESULTS_DF.columns])

# TGL_BAS_ADV_STATS_CUMU_AVG_NORM_DF

In [None]:
# Combine the venue/rest-days facts with the three team+opponent stats frames
# side by side; `keys` labels each input as the outermost column level.
TGL_ALL_DF = pd.concat([TGL_FACTS_VEN_REST_DF,
                        TGL_BAS_ADV_STATS_CUMU_AVG_NORM_TM_OPP_DF_REIDX,
                        TGL_BAS_ADV_STATS_VEN_CUMU_AVG_NORM_TM_OPP_DF_REIDX,
                        TGL_BAS_ADV_STATS_ROLL_08_AVG_NORM_TM_OPP_DF_REIDX],axis=1,
                        keys = ['facts_venue_rest_days',
                                'stats_cumu_avg',
                                'stats_venue_cumu_avg',
                                'stats_roll_08_avg'])

# Display the combined frame.
TGL_ALL_DF

In [None]:
# Manual walk-through for one game: build a single team+opponent row by hand.
# NOTE(review): IDX_DICT is not defined in any cell of this notebook — TODO
# confirm where it comes from.
SS_STATS_DF = TGL_BAS_ADV_STATS_CUMU_AVG_NORM_DF_REIDX.copy()
# Row 1000 of the 'team_opp_games' index; its 'Team' and 'Opp' entries are
# used as row keys into SS_STATS_DF.
GM_IDX_I = IDX_DICT['team_opp_games'].iloc[1000]
TM_IDX_I = GM_IDX_I['Team']
OPP_IDX_I = GM_IDX_I['Opp']

TM_DF_I = SS_STATS_DF.loc[TM_IDX_I]
OPP_DF_I = SS_STATS_DF.loc[OPP_IDX_I]
# Team values first, opponent values second, each under its own key; the
# result is turned into a one-row frame indexed by the team's key.
TM_OPP_DF_I = pd.concat([TM_DF_I,OPP_DF_I],keys=['Team_Cumu_Stats','Opponent_Cumu_Stats']).to_frame().T
TM_OPP_DF_I.index = pd.MultiIndex.from_tuples([TM_IDX_I],names=['index','Team_id'])
TM_OPP_DF_I


In [None]:
# Same game seen from the other side: the inputs are swapped, so the
# opponent's values land under 'Team_Cumu_Stats' and the team's values under
# 'Opponent_Cumu_Stats'; the row is indexed by the opponent's key.
OPP_TM_DF_I = pd.concat([OPP_DF_I,TM_DF_I],keys=['Team_Cumu_Stats','Opponent_Cumu_Stats']).to_frame().T
OPP_TM_DF_I.index = pd.MultiIndex.from_tuples([OPP_IDX_I],names=['index','Team_id'])
OPP_TM_DF_I