In [3]:
import numpy as np
import pandas as pd
import glob

class DataLoader:
    def __init__(self, root_path: str, paths: list[str]) -> None:
        self.root: str = root_path
        self.paths: list[str] = []
        for path in paths:
            self.paths.append(self.root + path)
    
    def load_csvs(self) -> dict[str, pd.DataFrame]:
        files: dict[str, pd.DataFrame] = {}
        for path in self.paths:
            csvs: list[str] = glob.glob(path + "*.csv")
            for csv in csvs:
                print("Reading " + csv + "...", end = "")
                files[csv.split('/')[-1]] = pd.read_csv(csv)
                print(" done.")
        
        return files

In [4]:
data_path = "../data/"
stage_one_path = "MDataFiles_Stage1/"
stage_two_path = "MDataFiles_Stage2/"

data_loader = DataLoader(data_path, [stage_one_path])
files = data_loader.load_csvs()

Reading ../data/MDataFiles_Stage1/MNCAATourneyDetailedResults.csv... done.
Reading ../data/MDataFiles_Stage1/MNCAATourneyCompactResults.csv... done.
Reading ../data/MDataFiles_Stage1/MSeasons.csv... done.
Reading ../data/MDataFiles_Stage1/MRegularSeasonDetailedResults.csv... done.
Reading ../data/MDataFiles_Stage1/MNCAATourneySlots.csv... done.
Reading ../data/MDataFiles_Stage1/MGameCities.csv... done.
Reading ../data/MDataFiles_Stage1/MConferenceTourneyGames.csv... done.
Reading ../data/MDataFiles_Stage1/Cities.csv... done.
Reading ../data/MDataFiles_Stage1/MRegularSeasonCompactResults.csv... done.
Reading ../data/MDataFiles_Stage1/MNCAATourneySeedRoundSlots.csv... done.
Reading ../data/MDataFiles_Stage1/MSampleSubmissionStage1.csv... done.
Reading ../data/MDataFiles_Stage1/MTeamConferences.csv... done.
Reading ../data/MDataFiles_Stage1/MTeamCoaches.csv... done.
Reading ../data/MDataFiles_Stage1/MMasseyOrdinals.csv... done.
Reading ../data/MDataFiles_Stage1/Conferences.csv... done.
Re

In [5]:
ordinals = files['MMasseyOrdinals.csv']
print(ordinals.head(1))

   Season  RankingDayNum SystemName  TeamID  OrdinalRank
0    2003             35        SEL    1102          159


In [8]:
seasons = np.unique(ordinals["Season"])
systems = np.unique(ordinals["SystemName"])

NameError: name 'system_dfs' is not defined

In [9]:
first_ordinals = pd.DataFrame()
final_ordinals = pd.DataFrame()
all_firsts = []
all_finals = []
for season in seasons:
    season_frame = ordinals.loc[ordinals["Season"] == season]
    for system in systems:
        season_system_frame = season_frame.loc[season_frame["SystemName"] == system]
        if not (season_system_frame.empty):
            minimum_day = min(season_system_frame["RankingDayNum"])
            maximum_day = max(season_system_frame["RankingDayNum"])
            season_system_firsts = season_system_frame.loc[season_system_frame["RankingDayNum"] == minimum_day]
            season_system_finals = season_system_frame.loc[season_system_frame["RankingDayNum"] == maximum_day]
            all_firsts.append(season_system_firsts)
            all_finals.append(season_system_finals)

first_ordinals = pd.concat(all_firsts, axis = 0)
final_ordinals = pd.concat(all_finals, axis = 0)
print(first_ordinals.shape)
print(final_ordinals.shape)

(367285, 5)
(367886, 5)


In [10]:
first_last = final_ordinals.merge(first_ordinals, on=["Season", "TeamID", "SystemName"], how="inner", suffixes = ("_final", "_first"))

In [11]:
first_last.columns
first_last.head(5)
system_dfs = []

for system in systems:
    system_dfs.append(first_last.loc[first_last["SystemName"] == system].drop(["SystemName", "RankingDayNum_final", "RankingDayNum_first"], axis=1).rename(columns={"OrdinalRank_final": system + "_final", "OrdinalRank_first": system + "_first"}))

In [13]:
joint_ordinals = system_dfs[0]
for df in system_dfs[1:]:
    joint_ordinals = joint_ordinals.merge(df, how="outer", on=["Season", "TeamID"])
print(joint_ordinals.shape)
joint_ordinals.head(5)

(6891, 376)


Unnamed: 0,Season,TeamID,7OT_final,7OT_first,ACU_final,ACU_first,ADE_final,ADE_first,AP_final,AP_first,...,WOB_final,WOB_first,WOL_final,WOL_first,WTE_final,WTE_first,YAG_final,YAG_first,ZAM_final,ZAM_first
0,2012,1102,146.0,120.0,,,152.0,191.0,,,...,158.0,148.0,172.0,179.0,,,,,,
1,2012,1103,80.0,89.0,,,70.0,172.0,,,...,85.0,81.0,65.0,134.0,,,,,,
2,2012,1104,35.0,31.0,,,31.0,14.0,,,...,34.0,39.0,41.0,35.0,,,,,,
3,2012,1105,338.0,338.0,,,324.0,187.0,,,...,336.0,339.0,333.0,233.0,,,,,,
4,2012,1106,300.0,320.0,,,284.0,314.0,,,...,316.0,313.0,306.0,278.0,,,,,,


In [14]:
seeds = files["MNCAATourneySeeds.csv"]
confs = files["MTeamConferences.csv"]
with_seeds = joint_ordinals.merge(seeds, how="inner", on=["Season", "TeamID"])
with_conf = with_seeds.merge(confs, on=["Season", "TeamID"])
data = with_conf

In [15]:
results = files["MNCAATourneyCompactResults.csv"].drop(["DayNum", "WScore", "LScore", "WLoc", "NumOT"], axis = 1)

In [16]:
results

Unnamed: 0,Season,WTeamID,LTeamID
0,1985,1116,1234
1,1985,1120,1345
2,1985,1207,1250
3,1985,1229,1425
4,1985,1242,1325
...,...,...,...
2312,2021,1211,1425
2313,2021,1417,1276
2314,2021,1124,1222
2315,2021,1211,1417


In [23]:
X = []
y = []
for i in range(len(results)):
    result = results.iloc[i]
    team1 = min(result["WTeamID"], result["LTeamID"])
    team2 = max(result["WTeamID"], result["LTeamID"])
    season = result["Season"]
    season_data = data.loc[data["Season"] == season]
    x1 = season_data.loc[(season_data["TeamID"] == team1)]
    x2 = season_data.loc[(season_data["TeamID"] == team2)]
    if (x1.shape[0] == 1) and (x2.shape[0] == 1):
        x = x1.merge(x2, on=["Season"], suffixes=("_team1", "_team2"))
        X.append(x)
        if (team1 == result["WTeamID"]):
            y.append(1)
        else:
            y.append(0)
y = pd.Series(y, name="label")
X = pd.concat(X, axis=0)
print(np.mean(y))
X["label"] = y
print(np.mean(X["label"]))
X = X.reset_index()

0.48348856900931414
0.0


In [19]:
np.mean(X["label"])

0.0

In [207]:
from autogluon.tabular import TabularPredictor

In [None]:

predictor = TabularPredictor(label="label").fit(X)

No path specified. Models will be saved in: "AutogluonModels/ag-20220312_231953/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220312_231953/"
AutoGluon Version:  0.4.0
Python Version:     3.9.10
Operating System:   Darwin
Train Data Rows:    1181
Train Data Columns: 756
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).
	1 unique label values:  [0]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data Class Count: 1
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    6029.31 MB
	Train Data (Original)  Memory Usage: 7.39 MB (0.1% of available memory)
	Inferring data type of each featu

Fitting model: LightGBM ...
[LightGBM] [Fatal] Number of classes should be specified and greater than 1 for multiclass training
		Number of classes should be specified and greater than 1 for multiclass training
Detailed Traceback:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1074, in _train_and_save
    model = self._train_single(X, y, model, X_val, y_val, **model_fit_kwargs)
  File "/usr/local/lib/python3.9/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1032, in _train_single
    model = model.fit(X=X, y=y, X_val=X_val, y_val=y_val, **model_fit_kwargs)
  File "/usr/local/lib/python3.9/site-packages/autogluon/core/models/abstract/abstract_model.py", line 577, in fit
    out = self._fit(**kwargs)
  File "/usr/local/lib/python3.9/site-packages/autogluon/tabular/models/lgb/lgb_model.py", line 185, in _fit
    self.model = lgb.train(**train_params)
  File "/usr/local/lib/python3.9/site-p

Detailed Traceback:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/site-packages/autogluon/core/trainer/abstract_trainer.py", line 1092, in _train_and_save
    score = model.score(X=X_val, y=y_val, sample_weight=w_val)
  File "/usr/local/lib/python3.9/site-packages/autogluon/core/models/abstract/abstract_model.py", line 727, in score
    y_pred = self.predict(X=X, **kwargs)
  File "/usr/local/lib/python3.9/site-packages/autogluon/core/models/abstract/abstract_model.py", line 664, in predict
    y_pred_proba = self.predict_proba(X, **kwargs)
  File "/usr/local/lib/python3.9/site-packages/autogluon/core/models/abstract/abstract_model.py", line 677, in predict_proba
    y_pred_proba = self._predict_proba(X=X, **kwargs)
  File "/usr/local/lib/python3.9/site-packages/autogluon/tabular/models/rf/rf_model.py", line 232, in _predict_proba
    return self._convert_proba_to_unified_form(y_pred_proba)
  File "/usr/local/lib/python3.9/site-packages/autogluon/core/models/abstra