In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
from typing import List
from tqdm import tqdm
import multiprocessing

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost

import networkx
from networkx.algorithms.traversal.depth_first_search import dfs_edges

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load every file found under the input directory into a dict of DataFrames keyed by
# the file name without its extension (e.g. "MTeams.csv" -> data["MTeams"]).
# NOTE(review): the Kaggle template documents the read-only input directory as "../input/",
# but this cell walks '../kaggle/input' — confirm the path matches the environment in use.
data = dict()
for dirname, _, filenames in os.walk('../kaggle/input'):
    for filename in filenames:
        # Everything before the first '.' becomes the table name; files that share a
        # stem in different directories overwrite each other.
        table_name = filename.split('.')[0]
        table_path = os.path.join(dirname, filename)
        try:
            data[table_name] = pd.read_csv(table_path)
        except UnicodeDecodeError:
            # Retry with the Windows-1252 code page when the default decoding fails.
            data[table_name] = pd.read_csv(table_path, encoding='cp1252')
        except Exception as e:
            # Best effort: report the failure and continue with the remaining files.
            print(f"Error with {filename}: {e}")

# Split dict of dataframes by gender and other (supplemental) data.
# Tables whose name starts with "M" / "W" go to the men's / women's dict respectively;
# everything else is kept as supplemental data.
mens_data = dict()
womens_data = dict()
supplemental_data = dict()

for k, v in data.items():
    if k.startswith("M"):
        mens_data[k] = v
    elif k.startswith("W"):
        womens_data[k] = v
    else:
        supplemental_data[k] = v
        

In [85]:
def get_season_stats(dataset, detailed=False, post_season=False, year=None):
    """
    Select one results table from a gendered dataset, optionally for one season.

    parameters
    ----------
    dataset: dict[str, pd.DataFrame]
        dictionary of tables whose names all start with the same letter
        (the first character of the first key is used as the table prefix).
    detailed: bool
        if True use the "...DetailedResults" table, else "...CompactResults".
    post_season: bool
        if True use the "NCAATourney..." table, else "RegularSeason...".
    year: int, optional
        when given, only rows whose "Season" equals this value are kept.

    returns
    -------
    df: pd.DataFrame
        an independent copy of the selected rows. Callers add columns to this
        frame, so returning a copy keeps the table stored in `dataset` intact
        (previously `year=None` handed back the stored table itself).
    gender: str
        the one-letter table prefix that was inferred from `dataset`.
    """
    # Gets the first letter of the first table name in dataset
    gender = list(dataset.keys())[0][0]

    phase = "NCAATourney" if post_season else "RegularSeason"
    granularity = "Detailed" if detailed else "Compact"
    df = dataset[f"{gender}{phase}{granularity}Results"]

    if year is not None:
        df = df[df["Season"] == year]

    # Copy so downstream in-place edits never leak into the source table
    return df.copy(), gender

def compute_margins_of_victory(df):
    """
    Average scoring margin per team over the games in `df`.

    parameters
    ----------
    df: pd.DataFrame
        game results with "WTeamID", "WScore", "LTeamID" and "LScore" columns.
        The frame is not modified.

    returns
    -------
    pd.Series
        mean margin indexed by "TeamID"; each game contributes +margin to the
        winner and -margin to the loser.
    """
    # Keep the margin local instead of writing a new column onto the caller's frame
    margin = df["WScore"] - df["LScore"]

    per_team = pd.concat(
        [
            pd.DataFrame({"TeamID": df["WTeamID"], "margin": margin}),
            pd.DataFrame({"TeamID": df["LTeamID"], "margin": -margin}),
        ],
        axis=0,
    )
    return per_team.groupby("TeamID")["margin"].mean()

def join_team_names(df, data, gender="M"):
    """
    Attach the "TeamName" column to a frame keyed by "TeamID".

    df: pd.DataFrame
        dataframe the team names are added to; must have a "TeamID" column
    data: dict[str, pd.DataFrame]
        dictionary of all table names and data; the "<gender>Teams" table is used
    gender: str
        table prefix selecting which teams table to read

    Rows of `df` whose "TeamID" is absent from the teams table are dropped
    (default inner merge).
    """
    team_lookup = data[f"{gender}Teams"].loc[:, ["TeamID", "TeamName"]]
    return df.merge(team_lookup, on="TeamID")

def create_srs(df,gender):
    """
    Compute Simple Rating System (SRS) ratings for the games in `df`.

    Each team's rating satisfies
        rating = average margin + average rating of the opponents it played,
    where the opponent average is taken over games. The resulting linear
    system is solved with least squares.

    parameters
    ----------
    df: pd.DataFrame
        game results with "WTeamID", "WScore", "LTeamID" and "LScore" columns.
        The frame is not modified.
    gender: str
        table prefix ("M"/"W") used to look up team names.

    returns
    -------
    rankings: pd.DataFrame
        "TeamID", "rating" and "TeamName", ordered by rating descending.

    notes
    -----
    * Team names are read from the notebook-level `data` dictionary.
    * An opponent faced k times out of n games gets coefficient -k/n. The
      earlier loop used -1/n once per distinct opponent, which under-weighted
      repeat opponents.
    """
    margin = (df["WScore"] - df["LScore"]).to_numpy(dtype=float)
    winners = df["WTeamID"].to_numpy()
    losers = df["LTeamID"].to_numpy()

    # Long format: one row per (team, game) seen from that team's side
    games = pd.DataFrame(
        {
            "team_id": np.concatenate([winners, losers]),
            "opp_id": np.concatenate([losers, winners]),
            "margin": np.concatenate([margin, -margin]),
        }
    )

    # Average margin per team is the right-hand side of the system
    spreads = games.groupby("team_id")["margin"].mean()
    team_ids = spreads.index
    n_teams = len(team_ids)

    # Row/column position of every game's team and opponent in the matrix
    team_pos = team_ids.get_indexer(games["team_id"])
    opp_pos = team_ids.get_indexer(games["opp_id"])
    games_played = np.bincount(team_pos, minlength=n_teams)

    # coef for the team itself is 1; every game subtracts 1/n from the
    # opponent's column (ufunc.at accumulates over repeated index pairs);
    # teams never faced keep a 0 coef
    terms = np.eye(n_teams)
    np.subtract.at(terms, (team_pos, opp_pos), 1.0 / games_played[team_pos])

    solutions, _, _, _ = np.linalg.lstsq(terms, spreads.to_numpy(), rcond=None)

    srs = pd.DataFrame({"team": team_ids.to_numpy(), "rating": solutions})
    rankings = srs.sort_values('rating', ascending=False).reset_index()[['team', 'rating']]
    rankings = join_team_names(rankings.rename(columns={"team": "TeamID"}), data, gender=gender)
    return rankings

def _coach_results(games, coaches, team_col, win_flag):
    """Pair each game with the coach in charge of `team_col`'s team on game day and tag it with `win_flag`."""
    joined = games.merge(
        coaches,
        how="left",
        left_on=["Season", team_col],
        right_on=["Season", "TeamID"],
    )

    # A team can have several coaches in a season; keep the one whose tenure covers the game day
    started = joined["DayNum"] >= joined["FirstDayNum"]
    not_ended = joined["DayNum"] <= joined["LastDayNum"]
    joined = joined[started & not_ended]
    joined["win"] = win_flag

    # Make sure the join didn't create dupes (or lose games)
    assert len(joined) == len(games)
    return joined


def get_coach_win_perc(
    dataset: dict,
    regular_season: bool,
    year:int = 2024
) -> pd.DataFrame:
    """
    Career win statistics per coach.

    parameters
    ----------
    dataset: dict
        dictionary of datasets to use. it will be
        mens_data or womens_data.

    regular_season: bool
        if True, regular season games from seasons up to and including `year`
        are used; otherwise NCAA tourney games from seasons strictly before
        `year` are used.

    year: int
        season of interest. coaches stats cover everything up until this
        year (model can't have any look ahead bias).

    returns
    -------
    coaches_stats: pd.DataFrame
        indexed by "CoachName" with the number of games ("count"), win
        percentage ("mean") and std dev of wins ("std"), sorted by number of
        games descending. missing values are replaced by 0.
    """
    # Gets the first letter in dataset
    gender = list(dataset.keys())[0][0]

    # Filter seasons up until season of interest
    if regular_season:
        games = dataset[f"{gender}RegularSeasonCompactResults"]
        games = games[games["Season"] <= year]
    else:
        games = dataset[f"{gender}NCAATourneyCompactResults"]
        games = games[games["Season"] < year]

    coaches = dataset[f"{gender}TeamCoaches"]
    won = _coach_results(games, coaches, "WTeamID", 1)
    lost = _coach_results(games, coaches, "LTeamID", 0)

    outcomes = pd.concat(
        [lost[["CoachName", "win"]], won[["CoachName", "win"]]],
        axis=0,
    )

    summary = outcomes.groupby("CoachName")["win"].describe()
    summary = summary.sort_values("count", ascending=False)
    return summary[["count", "mean", "std"]].fillna(0)
def get_system_ratings(
    mens_dataset, #There are only ratings for men
    systems: List[str],
    year: int=2024,
    window: int=5,
):
    """
    gets system ratings for each team for specified systems for a specific year.

    parameters
    ---------
    mens_dataset: dict
        dictionary of datasets for men; the "MMasseyOrdinals" table is used
    systems: List[str]
        list of system names (values of "SystemName") we are interested in seeing
    year: int
        season to look for ratings
    window: int
        number of consecutive rankings averaged for the "_rolling" columns
        (simple moving average). defaults to 5.

    returns
    -------
    df: pd.DataFrame
        one row per "TeamID" with, for every requested system,
        "<system>_latest" (rank at the highest RankingDayNum) and
        "<system>_rolling" (most recent available `window`-ranking average;
        missing when the team has fewer than `window` rankings).
    """

    # Filter by season and by system
    df = mens_dataset["MMasseyOrdinals"]
    df = df[(df["Season"] == year) & (df["SystemName"].isin(systems))]

    # Sort chronologically once and renumber the rows. The rolling result is
    # later re-sorted by this row index (unstack sorts its index), so the index
    # has to follow RankingDayNum for "last" to really mean "most recent".
    df = df.sort_values("RankingDayNum", kind="stable").reset_index(drop=True)

    latest_rank = (
        df
        .groupby(["TeamID","SystemName"])
        ["OrdinalRank"]
        .last()
        .unstack("SystemName")
        .reset_index()
        .rename(columns = {i: i+"_latest" for i in systems})
    )

    transformed_df = (
        df
        .groupby(["TeamID", "SystemName"], group_keys=False)
        ["OrdinalRank"]
        .rolling(window)
        .mean()
        .unstack("SystemName")
        .reset_index()
        # the unnamed row-index level comes back as "level_1"
        .drop("level_1", axis=1)
        .groupby("TeamID")
        [systems]
        # last non-missing rolling value per system
        .last()
        .reset_index()
        .rename(columns = {i: i+"_rolling" for i in systems})
    )

    res = pd.merge(latest_rank, transformed_df, on="TeamID")

    return res

def get_post_season(data, year, seed=None):
    """
    Build the tournament games table with a randomly chosen perspective team.

    parameters
    ----------
    data: dict
        dictionary of datasets (mens_data or womens_data)
    year: int
        season whose NCAA tourney games are returned
    seed: int, optional
        when given, the team/opponent shuffle is drawn from a generator seeded
        with this value so the result is reproducible. when None, numpy's
        global random state is used, as before.

    returns
    -------
    df: pd.DataFrame
        one row per game with "TeamID", "team_score", "OppID" and "opp_score"
        in place of the winner/loser columns ("WLoc" and "NumOT" are dropped).
    """
    df, gender = get_season_stats(
            data,
            detailed=False,
            post_season=True,
            year=year
    )
    # Work on a private copy so the columns added below never land on the source table
    df = df.copy()

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Shuffle teams for positional encoding (model shouldn't have winning teams features as the same)
    df["TeamID"] = np.where(
        rng.uniform(0,1, size=len(df)) > .5,
        df["WTeamID"],
        df["LTeamID"]
    )

    # True where the perspective team is the game's winner
    is_winner = df["TeamID"] == df["WTeamID"]
    df["team_score"] = np.where(is_winner, df["WScore"], df["LScore"])
    df["OppID"] = np.where(is_winner, df["LTeamID"], df["WTeamID"])
    df["opp_score"] = np.where(is_winner, df["LScore"], df["WScore"])

    df = df.drop(
        ["WTeamID", "LTeamID", "WScore", "LScore", "WLoc", "NumOT"],
        axis=1
    )

    return df

def get_features(mens_data, year, systems):
    """
    Assemble one row of pre-tournament features per team for a season.

    parameters
    ----------
    mens_data: dict
        dictionary of datasets for men
    year: int
        season the features are built for
    systems: List[str]
        ranking systems (values of "SystemName" in "MMasseyOrdinals") to include

    returns
    -------
    feature_df: pd.DataFrame
        "TeamID", the SRS "rating", the "<system>_latest"/"<system>_rolling"
        columns, and the current coach's regular season ("*_reg") and
        post season ("*_post") win statistics.
    """
    # Season Stats
    df, gender = get_season_stats(
        mens_data,
        detailed=False,
        post_season=False,
        year=year
    )

    # Rating System
    srs = create_srs(df, gender)

    # System Ratings. `year` must be forwarded: without it the ratings silently
    # come from the default season instead of the season being modelled.
    system_ratings = get_system_ratings(
        mens_data,
        systems=systems,
        year=year
    )

    # Ratings df
    ratings_df = pd.merge(
        srs,
        system_ratings,
        on="TeamID"
    )

    # Coaches postseason win stats
    coaches_postseason_win_df = get_coach_win_perc(
        dataset=mens_data,
        regular_season=False,
        year=year
    ).rename(columns={"count": "count_post", "mean": "mean_post", "std": "std_post"})

    # Coaches regular season win stats
    coaches_regseason_win_df = get_coach_win_perc(
        dataset=mens_data,
        regular_season=True,
        year=year
    ).rename(columns={"count": "count_reg", "mean": "mean_reg", "std": "std_reg"})

    # Coaches without any post season game get zeros for the "*_post" columns
    coaches_df = pd.merge(
        coaches_regseason_win_df,
        coaches_postseason_win_df,
        on="CoachName",
        how="left"
    ).fillna(0)

    # Get coaches for the year and only grab the most recent coach for a certain team
    curr_coaches = (
        mens_data["MTeamCoaches"][
            mens_data["MTeamCoaches"]["Season"] == year
        ]
        .sort_values("FirstDayNum")
        .groupby("TeamID")["CoachName"]
        .last()
        .reset_index()
    )

    # Get coach stats for current coaches
    coaches_df = pd.merge(
        curr_coaches,
        coaches_df,
        on="CoachName",
        how="left"
    )

    # "TeamID" is the only column the two frames share; name it explicitly
    feature_df = (
        pd.merge(
            ratings_df,
            coaches_df,
            on="TeamID"
        )
        .drop(["TeamName", "CoachName"], axis=1)
    )

    return feature_df


def merge_features_to_games(feature_df, post_season_df, year, training=True):
    """
    Attach team and opponent features to every game.

    parameters
    ----------
    feature_df: pd.DataFrame
        one row of features per "TeamID"
    post_season_df: pd.DataFrame
        games with "TeamID", "OppID", "team_score", "opp_score" and "DayNum"
    year: int
        accepted for interface compatibility; not used by this function
    training: bool
        if True, add the boolean "win" target and drop the identifier, score
        and "DayNum" columns; if False, return the merged frame untouched

    returns
    -------
    pd.DataFrame
        games with feature columns suffixed "_team" and "_opp". games whose
        team or opponent has no features are dropped (inner merges).
    """
    team_side = feature_df.merge(post_season_df, on="TeamID")
    matchups = team_side.merge(
        feature_df,
        left_on="OppID",
        right_on="TeamID",
        suffixes=("_team", "_opp"),
    )

    if not training:
        return matchups

    matchups["win"] = matchups["team_score"] > matchups["opp_score"]
    non_feature_cols = ["TeamID_team", "team_score", "OppID", "TeamID_opp", "opp_score", "DayNum"]
    return matchups.drop(non_feature_cols, axis=1)

In [86]:
# Show which men's tables were loaded
mens_data.keys()

dict_keys(['MConferenceTourneyGames', 'MGameCities', 'MMasseyOrdinals', 'MNCAATourneyCompactResults', 'MNCAATourneyDetailedResults', 'MNCAATourneySeedRoundSlots', 'MNCAATourneySeeds', 'MNCAATourneySlots', 'MRegularSeasonCompactResults', 'MRegularSeasonDetailedResults', 'MSeasons', 'MSecondaryTourneyCompactResults', 'MSecondaryTourneyTeams', 'MTeamCoaches', 'MTeamConferences', 'MTeams', 'MTeamSpellings'])

In [87]:
# Peek at the columns of the detailed regular season results.
# NOTE: `df` is a scratch name that a later cell reassigns.
df = mens_data["MRegularSeasonDetailedResults"]
df.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF'],
      dtype='object')

In [88]:
# Detailed regular season results.
# NOTE(review): despite the name `df_2023`, year=None applies no season filter,
# so this frame holds every season in the table — confirm that is intended.
df_2023, _ = get_season_stats(
    mens_data,
    detailed=True,
    post_season=False,
    year=None
)

In [92]:
def get_team_stats(df):
    """
    Per-season, per-team box score summaries expressed as team-minus-opponent gaps.

    parameters
    ----------
    df: pd.DataFrame
        detailed game results ("W..."/"L..." box score columns plus "Season",
        "WTeamID", "LTeamID", "WScore", "LScore" and "WLoc"). The frame is not
        modified.

    returns
    -------
    pd.DataFrame
        one row per ("Season", "team_id") with
        * "margin_mean", "margin_quant25", "margin_quant75"
        * for every box score stat S and aggregate A in (mean, quant25,
          quant75): "S_A_diff" = A of the team's S minus A of its opponents' S.
    """
    box_stats = ['FGM', 'FGA', 'FGM3', 'FGA3', 'FTM', 'FTA', 'OR', 'DR', 'Ast',
                 'TO', 'Stl', 'Blk', 'PF']
    summary_cols = box_stats + [stat + "_opp" for stat in box_stats] + ["margin"]

    # assign() returns a new frame, so the caller's frame never gains a "margin" column
    games = df.assign(margin=df["WScore"] - df["LScore"])

    # Winner's point of view: W* -> own stats, L* -> opponent stats
    win_df = games.rename(
        columns={"WTeamID": "team_id", "LTeamID": "opp_id", "WLoc": "Loc"}
    )
    win_df = win_df.rename(columns={col: col[1:] + "_opp" for col in win_df.columns if col.startswith("L") and col != "Loc"})
    win_df = win_df.rename(columns={col: col[1:] for col in win_df.columns if col.startswith("W") and not col.endswith("_opp")})

    # Loser's point of view: L* -> own stats, W* -> opponent stats
    lose_df = games.rename(
        columns={"LTeamID": "team_id", "WTeamID": "opp_id", "WLoc": "Loc"}
    )
    lose_df = lose_df.rename(columns={col: col[1:] for col in lose_df.columns if col.startswith("L") and col != "Loc"})
    lose_df = lose_df.rename(columns={col: col[1:] + "_opp" for col in lose_df.columns if col.startswith("W")})
    # Location is recorded for the winner; flip home/away for the loser, anything else is neutral
    lose_df["Loc"] = lose_df["Loc"].map({"H": "A", "A": "H"}).fillna("N")
    lose_df["margin"] = -lose_df["margin"]

    teams = pd.concat([win_df, lose_df], axis=0)

    stats = teams.groupby(["Season", "team_id"])[summary_cols].agg([
            ("mean", "mean"),
            ("quant25" , lambda x: x.quantile(.25)),
            ("quant75", lambda x: x.quantile(.75))
        ]
    ).reset_index()
    # Flatten the (column, aggregate) pairs; the key columns have an empty aggregate level
    stats.columns = [
        (col + "_" + agg_func).strip("_")
        for col, agg_func in zip(stats.columns.get_level_values(0), stats.columns.get_level_values(1))
    ]

    # Replace every (own, opponent) pair of columns by a single own-minus-opponent column
    opp_cols = [
        col for col in stats.columns
        if "_opp" in col and col.replace("_opp", "") in stats.columns
    ]
    own_cols = [col.replace("_opp", "") for col in opp_cols]
    diffs = pd.DataFrame(
        {own + "_diff": stats[own] - stats[opp] for own, opp in zip(own_cols, opp_cols)},
        index=stats.index,
    )
    return pd.concat([stats.drop(columns=own_cols + opp_cols), diffs], axis=1)

In [93]:
# Aggregate the detailed results into per-season, per-team stat differences
# (reuses the scratch name `df` from an earlier cell)
df = get_team_stats(df_2023)

In [94]:
# Display the aggregated team stats
df

Unnamed: 0,Season,team_id,margin_mean,margin_quant25,margin_quant75,FGM_mean_diff,FGM_quant25_diff,FGM_quant75_diff,FGA_mean_diff,FGA_quant25_diff,...,TO_quant75_diff,Stl_mean_diff,Stl_quant25_diff,Stl_quant75_diff,Blk_mean_diff,Blk_quant25_diff,Blk_quant75_diff,PF_mean_diff,PF_quant25_diff,PF_quant75_diff
0,2003,1102,0.250000,-13.00,10.00,-0.142857,0.00,0.00,-2.642857,-2.75,...,0.00,0.535714,1.00,0.00,0.214286,1.00,-1.00,0.392857,0.75,0.75
1,2003,1103,0.629630,-6.50,6.00,-0.629630,-1.50,1.00,-1.148148,-1.50,...,-2.50,0.851852,0.00,1.00,-0.518519,-0.50,-0.50,-2.592593,-2.00,-4.00
2,2003,1104,4.285714,-4.25,11.25,0.785714,-1.00,2.00,1.678571,2.00,...,-2.75,1.071429,1.00,1.25,0.607143,1.00,0.00,-1.214286,-1.75,-1.00
3,2003,1105,-4.884615,-11.75,0.50,-2.615385,-1.25,-2.75,2.653846,4.25,...,-1.00,-0.076923,-0.75,1.00,-2.115385,-1.25,-2.00,1.153846,1.00,0.25
4,2003,1106,-0.142857,-6.75,6.75,1.714286,1.00,2.25,1.892857,2.00,...,3.50,-0.428571,-0.75,0.75,-0.035714,0.00,-0.75,2.035714,3.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7612,2024,1474,-5.928571,-16.00,3.75,-3.785714,-3.25,-4.25,-2.285714,-5.75,...,1.00,0.142857,1.00,1.00,-1.964286,-2.00,-2.25,-0.714286,-1.25,-2.00
7613,2024,1475,-8.115385,-12.75,-2.00,-1.769231,-2.00,-1.50,0.423077,2.00,...,2.00,-0.846154,0.00,-2.00,-1.000000,-1.00,-1.00,2.961538,3.00,1.50
7614,2024,1476,-13.571429,-17.25,-5.00,-3.750000,-2.00,-3.50,-3.107143,-1.25,...,3.25,-1.464286,-1.00,-2.00,-2.035714,-1.75,-3.00,1.464286,1.00,1.00
7615,2024,1477,-10.520000,-22.00,2.00,-3.800000,-2.00,-6.00,2.200000,2.00,...,-2.00,1.600000,-1.00,2.00,0.960000,0.00,2.00,3.320000,3.00,2.00


In [148]:
# import requests
# from bs4 import BeautifulSoup
# import pandas as pd

# def get_advanced_stats(year):
    
#     ################################## TEAM ##########################################


#     # URL of the page you want to scrape
#     url = f'https://www.sports-reference.com/cbb/seasons/men/{year}-advanced-school-stats.html'

#     # Send a GET request to the webpage
#     response = requests.get(url)

#     # Parse the HTML content of the page
#     soup = BeautifulSoup(response.text, 'html.parser')

#     # Find the table by ID or class
#     table = soup.find('table', {'id': 'adv_school_stats'})

#     # Extract the column headers from the <thead> part of the table
#     headers = [header.text for header in table.find('thead').find_all('th')]
#     headers = [
#         i for i in headers 
#         if (i != "") 
#         and (i != "\xa0") 
#         and (i not in ['School Advanced', 'Overall', 'Conf.', 'Home', 'Away', 'Points', 'Opponent Advanced', 'Rk'])
#     ]

#     # Extract rows from the <tbody> part of the table
#     rows = table.find('tbody').find_all('tr')

#     # Extract data from each row
#     data = []
#     for row in rows:
#         cols = row.find_all('td')
#         cols = [ele.text.strip() for ele in cols if ele.text.strip() != ""]
#         data.append(cols)  # Get rid of empty values

#     # Create the DataFrame
#     df = pd.DataFrame(data, columns=headers)  # headers were already filtered above (group headers, 'Rk' and blanks removed)


#     ################################## OPP ##########################################

#     # URL of the page you want to scrape
#     url = f'https://www.sports-reference.com/cbb/seasons/men/{year}-advanced-opponent-stats.html'

#     # Send a GET request to the webpage
#     response = requests.get(url)

#     # Parse the HTML content of the page
#     soup = BeautifulSoup(response.text, 'html.parser')

#     # Find the table by ID or class
#     table = soup.find('table', {'id': 'adv_opp_stats'})

#     # Extract the column headers from the <thead> part of the table
#     headers = [header.text for header in table.find('thead').find_all('th')]
#     headers = [
#         i for i in headers 
#         if (i != "") 
#         and (i != "\xa0") 
#         and (i not in ['Overall', 'Conf.', 'Home', 'Away', 'Points', 'Opponent Advanced', 'Rk'])
#     ]

#     # Extract rows from the <tbody> part of the table
#     rows = table.find('tbody').find_all('tr')

#     # Extract data from each row
#     data = []
#     for row in rows:
#         cols = row.find_all('td')
#         cols = [ele.text.strip() for ele in cols if ele.text.strip() != ""]
#         data.append(cols)  # Get rid of empty values

#     # Create the DataFrame
#     opp_df = pd.DataFrame(data, columns=headers)  # headers were already filtered above (group headers, 'Rk' and blanks removed)

#     df = df.dropna().reset_index(drop=True)
#     opp_df = (
#         opp_df
#         .dropna()
#         .reset_index(drop=True)
#         .drop(["G", "W", "L", "W-L%", "SRS", "SOS", "W", "L", "W", "L", "W", "L", "Tm.", "Opp."], axis=1)
#         .rename(columns={col: col + "_opp" for col in opp_df.columns if col != "School"})
#     )
#     res = pd.merge(df, opp_df, on="School")

#     res["School"] = res["School"].str.replace("NCAA","").str.strip()

#     return res


# # Works 2010 and on
# advanced_stats = dict()
# for year in range(2010, 2025):
#     print(year)
#     advanced_stats[year] = get_advanced_stats(year)