# Clean and organising the raw data scraped from https://stats.ncaa.org/

Author: Atharv Sonwane (Player & Team Data Cleaning) & Vedant Shah (Cleaning team_stats onwards)

In [1]:
import pandas as pd
import os
import sys
from pathlib import Path
import requests
from tqdm import tnrange
import shutil

# Clean Data

In [17]:
year = 2018
for root, dirs, files in os.walk(f'../../data/ncaa/raw/{year}/team_game_by_game/'):
    for i in tnrange(len(files), desc='Cleaning data'):
        print(files[i])
        f = files[i]
        df = pd.read_csv(Path(root).joinpath(f), header=1)
        if year >= 2018:
            df.drop(columns=["MP", "Attend", "BHE", "Unnamed: 20"], inplace=True)
        else:
            df.drop(labels=["MP", "BHE", "Unnamed: 19"], axis=1, inplace=True)
        df.replace({'/':''}, regex=True, inplace=True)
        df.fillna(0, inplace=True)
        df[["Kills", "Errors", "Total Attacks", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr"]] = df[["Kills", "Errors", "Total Attacks", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr"]].astype(int)
        outpath = Path(root).parent.parent.parent.joinpath(f"processed/{year}/game_by_game_cleaned/")
        outpath.mkdir(parents=True, exist_ok=True)
        f = f[:f.find('(') - 1] + ".csv"
        df.to_csv(outpath.joinpath(f), index=False)



HBox(children=(FloatProgress(value=0.0, description='Cleaning data', max=334.0, style=ProgressStyle(descriptio…

UMBC (America East).csv
Seton Hall (Big East).csv
Florida (SEC).csv
NJIT (ASUN).csv
Navy (Patriot).csv
Wisconsin (Big Ten).csv
Northern Ill. (MAC).csv
Mercer (SoCon).csv
UNLV (MWC).csv
La.-Monroe (Sun Belt).csv
Stony Brook (America East).csv
UConn (AAC).csv
South Fla. (AAC).csv
Winthrop (Big South).csv
Purdue Fort Wayne (Summit League).csv
Brown (Ivy League).csv
UC Riverside (Big West).csv
Charlotte (C-USA).csv
Northern Ariz. (Big Sky).csv
Cleveland St. (Horizon).csv
Samford (SoCon).csv
UNCW (CAA).csv
East Carolina (AAC).csv
Indiana St. (MVC).csv
Binghamton (America East).csv
UTRGV (WAC).csv
N.C. A&T (MEAC).csv
Northwestern (Big Ten).csv
Green Bay (Horizon).csv
Oklahoma (Big 12).csv
UMES (MEAC).csv
CSUN (Big West).csv
Fairleigh Dickinson (NEC).csv
Santa Clara (WCC).csv
DePaul (Big East).csv
Saint Peter's (MAAC).csv
Sacramento St. (Big Sky).csv
Saint Francis (PA) (NEC).csv
Cornell (Ivy League).csv
Sacred Heart (NEC).csv
UTSA (C-USA).csv
Central Conn. St. (NEC).csv
Southern Utah (Big Sky

# Computing Moving Averages

In [37]:
def transform_team_data(input_dir, output_dir, tf):
    for root, dirs, files in os.walk(input_dir):
        new_root = Path(output_dir)
        new_root.mkdir(parents=True, exist_ok=True)
        for f in files:
            df = pd.read_csv(Path(root).joinpath(f))
            tf(df)
            df.to_csv(new_root.joinpath(f), index=False)

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]

## Simple Moving Average

In [36]:
window = 10
year = 2016

def sma(df):
    df[features] = df[features].rolling(window, min_periods=1).mean()

transform_team_data(
    input_dir=f'../../data/ncaa/processed/{year}/game_by_game_cleaned',
    output_dir=f'../../data/ncaa/processed/{year}/game_by_game_{window}_sma',
    tf=sma,
)

## Cumulative Moving Average

In [23]:
year = 2016

def cma(df):
    df[features] = df[features].expanding().mean()

transform_team_data(
    input_dir=f'../../data/ncaa/processed/{year}/game_by_game_cleaned',
    output_dir=f'../../data/ncaa/processed/{year}/game_by_game_cma',
    tf=cma,
)

## Exponential Moving Average

In [24]:
year = 2016
alpha = 0.2

def ewm(df):
    df[features] = df[features].ewm(alpha=alpha).mean()

transform_team_data(
    input_dir=f'../../data/ncaa/processed/{year}/game_by_game_cleaned',
    output_dir=f'../../data/ncaa/processed/{year}/game_by_game_{alpha}_ewm',
    tf=ewm,
)

# Combine into single dataframe of matches

## Utility Function

In [2]:
def clean_name(name):
    name = name.replace('\"', '')
    if '@' in name:
        if name.index('@') == 0:
            return name[2:]
        else:
            return name[:name.index('@')-1]
    else:
        return name

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
combined_features = ["Date", "TeamA", "TeamB", "Result", "S", "Team A Kills", "Team A Errors", "Team A Total Attacks", "Team A Hit Pct", "Team A Assists", "Team A Aces", "Team A SErr", "Team A Digs", "Team A RErr", "Team A Block Solos", "Team A Block Assists", "Team A BErr", "Team A PTS", "Team B Kills", "Team B Errors", "Team B Total Attacks", "Team B Hit Pct", "Team B Assists", "Team B Aces", "Team B SErr", "Team B Digs", "Team B RErr", "Team B Block Solos", "Team B Block Assists", "Team B BErr", "Team B PTS"]


def combine(input_path, output_path):
    dfs = []
    team_names = []
    for root, dirs, files in os.walk(input_path):
        for f in files:
            team_names.append(f[:-4])
            dfs.append(pd.read_csv(Path(root).joinpath(f)))

    data = []

    err_a = 0
    err_b = 0

    for i, name in enumerate(team_names):
        df = dfs[i]
        for j, TeamA_row in df.iterrows(): 
            date = TeamA_row["Date"]
            TeamA = name
            TeamB = clean_name(TeamA_row["Opponent"])
            Result = 1 if TeamA_row["Result"][0] == 'W' else 0
            S = TeamA_row["S"]
            TeamA_stats = TeamA_row[features]
            try:
                TeamB_df = dfs[team_names.index(TeamB)]
            except:
                err_a += 1
                continue
            try:
                TeamB_row = TeamB_df[TeamB_df["Date"] == date][TeamB_df["Opponent"].str.contains(TeamA)].reset_index().loc[0]
            except:
                err_b += 1
                continue

            TeamB_stats = TeamB_row[features]
            data.append([date, TeamA, TeamB, Result, S, *TeamA_stats, *TeamB_stats])
        
    combined_df = pd.DataFrame(data, columns=combined_features)
    combined_df.to_csv(output_path, index=False)
    return dict(df_length=len(combined_df), err_a=err_a, err_b=err_b)

def prev_combine(input_path, output_path):
    dfs = []
    team_names = []
    for root, dirs, files in os.walk(input_path):
        for f in files:
            team_names.append(f[:-4])
            dfs.append(pd.read_csv(Path(root).joinpath(f)))

    data = []

    err_a = 0
    err_b = 0

    for i, name in enumerate(team_names):
        df = dfs[i]
        for j in range(len(df)):
            if j == 0:
                continue
            TeamA_row = df.loc[j-1]
            date = TeamA_row["Date"]
            TeamA = name
            TeamB = clean_name(TeamA_row["Opponent"])
            Result = 1 if TeamA_row["Result"][0] == 'W' else 0
            S = TeamA_row["S"]
            TeamA_stats = TeamA_row[features]
            try:
                TeamB_df = dfs[team_names.index(TeamB)]
            except:
                err_a += 1
                continue
            try:
                TeamB_row_index = TeamB_df[TeamB_df["Date"] == date][TeamB_df["Opponent"].str.contains(TeamA)].index[0]
                if TeamB_row_index == 0:
                    continue
                TeamB_row = TeamB_df.loc[TeamB_row_index-1]
            except:
                err_b += 1
                continue

            TeamB_stats = TeamB_row[features]
            data.append([date, TeamA, TeamB, Result, S, *TeamA_stats, *TeamB_stats])
        
    combined_df = pd.DataFrame(data, columns=combined_features)
    combined_df.to_csv(output_path, index=False)
    return dict(df_length=len(combined_df), err_a=err_a, err_b=err_b)



## Combine dataframe of math by match result without any averages

In [26]:
combine(
    input_path=f'../../data/ncaa/processed/{year}/game_by_game_cleaned',
    output_path=f'../../data/ncaa/processed/{year}/accumulated/matches_gathered.csv',
)


{'df_length': 9487, 'err_a': 440, 'err_b': 60}

## Combine dataframe for Simple Moving Average

In [6]:
year = 2016
window = 10
prev_combine(
    input_path=f'../../data/ncaa/processed/{year}/game_by_game_{window}_sma',
    output_path=f'../../data/ncaa/processed/{year}/accumulated/{window}_sma.csv',
)

{'df_length': 8852, 'err_a': 430, 'err_b': 58}

## Combine dataframe for Cumulative Moving Average

In [4]:
year = 2016
window = 10
prev_combine(
    input_path=f'../../data/ncaa/processed/{year}/game_by_game_cma',
    output_path=f'../../data/ncaa/processed/{year}/accumulated/cma.csv',
)

{'df_length': 8852, 'err_a': 430, 'err_b': 58}

## Combine dataframe for Exponentially Moving Average

In [5]:
year = 2016
alpha = 0.2
prev_combine(
    input_path=f'../../data/ncaa/processed/{year}/game_by_game_{alpha}_ewm',
    output_path=f'../../data/ncaa/processed/{year}/accumulated/{alpha}_ewm.csv',
)

{'df_length': 8852, 'err_a': 430, 'err_b': 58}

# Clean Player Data

In [15]:
def store_player(url, path):
    print(f"Fetching for {Path(path).name[:-4]} ...", end=' ')
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"})
    pd.read_html(r.text)[-1].drop(labels=[0], axis=0).to_csv(path, index=False)
    print("Done!")

urls = [
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=51&stats_player_seq=1906568",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=77&stats_player_seq=2020914",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=136&stats_player_seq=2259847",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=196&stats_player_seq=2199752",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=255&stats_player_seq=2206497",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=648&stats_player_seq=2199107",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=141&stats_player_seq=1920987",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=731&stats_player_seq=1787509",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=782&stats_player_seq=2210530",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=731&stats_player_seq=2020474",
]
paths = [
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Baylor (Big 12)/Lockin, Hannah.csv",
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/BYU (WCC)/Tausinga, Tayler.csv",
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Chicago St. (WAC)/Sisic, Isadora.csv",
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/East Carolina (AAC)/Garcia, Jaylibeth.csv",
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Georgia Tech (ACC)/Lamborda, Paola.csv",
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/South Carolina (SEC)/Covas Córdova, Camilla.csv",
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/The Citadel (SoCon)/Jesus, Sharlissa.csv",
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Utah St. (Mountain West)/Olson-Shepherd, Madi.csv",
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Wichita St. (AAC)/Uluave, Sina.csv",
    "/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Utah St. (Mountain West)/Solosabal, Whitney.csv",
]

for url, path in zip(urls, paths):
    store_player(url, path)

Fetching for Lockin, Hannah ... 

FileNotFoundError: [Errno 2] No such file or directory: '/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Baylor (Big 12)/Lockin, Hannah.csv'

In [30]:
year = 2018
verbose = False

for root, dirs, _ in os.walk(f'../../data/ncaa/raw/{year}/player_game_wise/'):
    for team_dir in dirs:
        team_root, _, player_files = list(os.walk(Path(root).joinpath(team_dir)))[0]
        team_name = Path(team_root).name
        team_name = team_name[:team_name.find('(') - 1]
        if verbose:
            print(f"Cleaning player data for {team_name} ...")
        for player_file in player_files:
            if verbose:
                print(f"\tCleaning data for player {player_file[:-4]} ...", end=' ')
            outpath = Path(root).parent.parent.parent.joinpath(f"processed/{year}/player_game_wise_cleaned/{team_name}")
            if outpath.joinpath(player_file).is_file():
                if verbose:
                    print("Already Exists!")
                continue
            try:
                df = pd.read_csv(Path(team_root).joinpath(player_file), header=1)
            except:
                print(f"{Path(team_root).joinpath(player_file)} Failed!")
                continue
            
            df.drop(columns=["MP", "Attend", "BHE", "Unnamed: 20"], inplace=True)
            df.replace({'/':''}, regex=True, inplace=True)
            df.fillna(0, inplace=True)
            df[["Kills", "Errors", "Total Attacks", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr"]] = df[["Kills", "Errors", "Total Attacks", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr"]].astype(int)
            outpath.mkdir(parents=True, exist_ok=True)
            df.to_csv(outpath.joinpath(player_file), index=False)
            if verbose:
                print("Done!")

for root, dirs, _ in os.walk('/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned'):
    for team_dir in dirs:
        for team_root, _, player_files in os.walk(Path(root).joinpath(team_dir)):
            for player_file in player_files:
                if player_file[player_file.find(".csv") - 1] == ' ':
                    cp = Path(team_root).joinpath(player_file)
                    n = player_file[:player_file.find(".csv") - 1] + ".csv"
                    print(cp.rename(cp.parent.joinpath(n)))

In [29]:
def transform_player_data(input_dir, output_dir, tf):
    for root, dirs, _ in os.walk(input_dir):
        for team_dir in dirs:
            for team_root, _, player_files in os.walk(Path(root).joinpath(team_dir)):
                for player_file in player_files:
                    df = pd.read_csv(Path(team_root).joinpath(player_file))
                    outpath = Path(output_dir).joinpath(team_dir)
                    tf(df)
                    outpath.mkdir(parents=True, exist_ok=True)
                    df.to_csv(outpath.joinpath(player_file), index=False)

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]

## SMA

In [2]:
year = 2018
window = 10

def sma(df):
    df[features] = df[features].rolling(window, min_periods=1).mean()


transform_player_data(
    input_dir=f'../../data/ncaa/processed/{year}/player_game_wise_cleaned/',
    output_dir=f'../../data/ncaa/processed/{year}/player_game_wise_{window}_sma/',
    tf=sma
)

NameError: name 'transform_player_data' is not defined

## CMA

In [19]:
year = 2018

def cma(df):
    df[features] = df[features].expanding().mean()

transform_player_data(
    input_dir=f'../../data/ncaa/processed/{year}/player_game_wise_cleaned/',
    output_dir=f'../../data/ncaa/processed/{year}/player_game_wise_cma/',
    tf=cma
)

## EWM

In [7]:
year = 2018
alpha = 0.2

def ewm(df):
    df[features] = df[features].ewm(alpha=alpha).mean()

transform_player_data(
    input_dir=f'../../data/ncaa/processed/{year}/player_game_wise_cleaned/',
    output_dir=f'../../data/ncaa/processed/{year}/player_game_wise_{alpha}_ewm/',
    tf=ewm
)

# Combining into single dataframe

In [9]:
def clean_name(name):
    if '@' in name:
        if name.index('@') == 0:
            return name[2:]
        else:
            return name[:name.index('@')-1]
    else:
        return name

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
player_features = [f"Player {j} {f}" for f in features for j in range(12)]
combined_features = [
"Date", "TeamA", "TeamB", "Result", "S", "Team A Kills", "Team A Errors", "Team A Total Attacks", "Team A Hit Pct", "Team A Assists", "Team A Aces", "Team A SErr", "Team A Digs",  "Team A RErr", "Team A Block Solos", "Team A Block Assists", "Team A BErr", "Team A PTS", "Team B Kills", "Team B Errors", "Team B Total Attacks", "Team B Hit Pct", "Team B Assists", "Team B Aces", "Team B SErr", "Team B Digs", "Team B RErr", "Team B Block Solos", "Team B Block Assists", "Team B BErr", "Team B PTS",
*[f"Team A {s}" for s in player_features],
*[f"Team B {s}" for s in player_features]
]


def combine_with_player(player_input_path, team_stats_path, team_matches_path, macthes_with_player_info_path, combined_output_path):
    print("Combining player data for individual teams ...")

    print("Building team index ...", end=' ')
    team_names = []
    for root, _, files in os.walk(team_matches_path):
        for f in files:
            team_names.append(f[:-4])

    print("Done!")

    player_input_path = Path(player_input_path)
    team_stats_path = Path(team_stats_path)
    team_matches_path = Path(team_matches_path)
    macthes_with_player_info_path = Path(macthes_with_player_info_path)
    macthes_with_player_info_path.mkdir(exist_ok=True, parents=True)


    print("Sorting team data ...", end=' ')
    for i, name in enumerate(team_names):
        team_matches_df = pd.read_csv(team_matches_path.joinpath(f"{name}.csv"))
        team_stats_df = pd.read_csv(team_stats_path.joinpath(f"{name}.csv"))
        top_player_names = [] 
        for j, (_, player_row) in enumerate(team_stats_df[team_stats_df.Player != "TEAM"][team_stats_df.Player != "Totals"][team_stats_df.Player != "Opponent Totals"].sort_values(by=["GP"], ascending=False).iterrows()):
            top_player_names.append(player_row["Player"])
            if j == 11:
                break
        try:
            for j, player in enumerate(top_player_names):
                team_matches_df[[f"Player {j} {f}" for f in features]] = pd.read_csv(player_input_path.joinpath(f"{name}/{player}.csv"))[features]
            team_matches_df.to_csv(macthes_with_player_info_path.joinpath(f"{name}.csv"), index=False)        
        except:
            print(f"\nFailed to get player {player} for {name}!")
            continue
    print("Done!")

    print("Getting match wise dataframes ...", end=' ')
    dfs = []
    for root, _, files in os.walk(macthes_with_player_info_path):
        for f in files:
            dfs.append(pd.read_csv(Path(root).joinpath(f)))
    print("Done!")

    err_a, err_b = 0, 0
    data = []
    for i in tnrange(len(dfs), desc="Combining into single df"):
        name = team_names[i]
        df = dfs[i]
        for j in range(len(df)):
            if j == 0:
                continue
            TeamA_row = df.loc[j-1]
            date = TeamA_row["Date"]
            TeamA = name
            TeamB = clean_name(TeamA_row["Opponent"])
            Result = 1 if TeamA_row["Result"][0] == 'W' else 0
            S = TeamA_row["S"]
            TeamA_stats = TeamA_row[features + player_features]
            try:
                TeamB_df = dfs[team_names.index(TeamB)]
            except:
                err_a += 1
                continue
            try:
                TeamB_row_index = TeamB_df[TeamB_df["Date"] == date][TeamB_df["Opponent"].str.contains(TeamA)].index[0]
                if TeamB_row_index == 0:
                    continue
                TeamB_row = TeamB_df.loc[TeamB_row_index-1]
            except:
                err_b += 1
                continue

            TeamB_stats = TeamB_row[features + player_features]
            data.append([date, TeamA, TeamB, Result, S, *TeamA_stats, *TeamB_stats])

    combined_df = pd.DataFrame(data, columns=combined_features)
    combined_df.to_csv(combined_output_path, index=False)
    return dict(df_len=len(combined_df), err_a=err_a, err_b=err_b)



In [10]:
combine_with_player(
    player_input_path="/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/processed/2018/player_game_wise_0.2_ewm",
    team_stats_path="/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/raw/2018/team_stats",
    team_matches_path="/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/processed/2018/game_by_game_0.2_ewm",
    macthes_with_player_info_path="/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/processed/2018/game_by_game_with_players_0.2_ewm",
    combined_output_path="/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/processed/2018/accumulated/0.2_ewm_with_players.csv",
)

Combining player data for individual teams ...
Building team index ... Done!
Sorting team data ... 
Failed to get player Krenik, Paige for USC Upstate!

Failed to get player Gasser, Haylee for Clemson!

Failed to get player Cerame, Paula for North Dakota!

Failed to get player Nwosu, Udo for Army West Point!

Failed to get player Solis, Emily for SIUE!

Failed to get player Whalen, Mariah for Indiana!

Failed to get player Shelley, Kennedy for Arkansas St.!

Failed to get player Sass, Olivia for Arizona!

Failed to get player Sweder, Thea for Little Rock!

Failed to get player Schaffer, Albany for Campbell!

Failed to get player Patriciello, Sabrina for UTSA!

Failed to get player Jake-Turner, Kennadie for Rhode Island!

Failed to get player Lewis, Mary Hannah for Saint Louis!

Failed to get player Gunter, Sydney for Cincinnati!

Failed to get player Gates, Madison for Lehigh!

Failed to get player Ehlert, Makena for Winthrop!

Failed to get player Rodriguez, Nicole for Nicholls St.!



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Combining into single df', layout=Layou…




{'df_len': 0, 'err_a': 0, 'err_b': 0}

# Cleaning file names in team_stats

In [11]:
year = 2019
for root, _, files in os.walk(f'../../data/ncaa/raw/{year}/team_stats/'):
    for f in files:
        f_new = f[:f.find('(') - 1] + ".csv"
        os.rename(Path(root).joinpath(f), Path(root).joinpath(f_new))
        print(f"{f} successfully renamed to {f_new}!")



 Brown.csv!
UC Riverside (Big West).csv successfully renamed to UC Riverside.csv!
Charlotte (C-USA).csv successfully renamed to Charlotte.csv!
Northern Ariz. (Big Sky).csv successfully renamed to Northern Ariz..csv!
Cleveland St. (Horizon).csv successfully renamed to Cleveland St..csv!
Samford (SoCon).csv successfully renamed to Samford.csv!
Utah St. (Mountain West).csv successfully renamed to Utah St..csv!
UNCW (CAA).csv successfully renamed to UNCW.csv!
East Carolina (AAC).csv successfully renamed to East Carolina.csv!
Indiana St. (MVC).csv successfully renamed to Indiana St..csv!
Binghamton (America East).csv successfully renamed to Binghamton.csv!
UTRGV (WAC).csv successfully renamed to UTRGV.csv!
N.C. A&T (MEAC).csv successfully renamed to N.C. A&T.csv!
Northwestern (Big Ten).csv successfully renamed to Northwestern.csv!
Green Bay (Horizon).csv successfully renamed to Green Bay.csv!
Oklahoma (Big 12).csv successfully renamed to Oklahoma.csv!
Boise St. (Mountain West).csv successfu

# Combining(w.r.t. years) Data of all the years

In [3]:
data_path = "../../data/ncaa"
years = [2019, 2018, 2017, 2016]
outpath = Path(data_path).joinpath("combined")
acc = outpath.joinpath("accumulated")
acc.mkdir(parents=True, exist_ok=True)

## Combined(w.r.t. years) game by game vanilla

In [15]:
df_matches_gathered = [pd.read_csv(Path(data_path).joinpath(f"processed/{year}/accumulated/matches_gathered.csv")) for year in years]

df_team_v_team_combined = pd.concat(df_matches_gathered, ignore_index=True)
df_team_v_team_combined.to_csv(outpath.joinpath("accumulated/team_v_team.csv"))
print(f"Compiled yearwise data into a single data file\n Size of final dataframe : {df_team_v_team_combined.size}")

Compiled yearwise data into a single data file
 Size of final dataframe : 1167150


In [14]:
df = pd.read_csv(Path(data_path).joinpath("processed/2019/accumulated/matches_gathered.csv"))
print(df.size)

295523


In [18]:
root_path = Path(data_path).joinpath("processed/2019/game_by_game_cleaned")
failed = []
for root, _, files in os.walk(root_path):
    for f in files:
        try:
            df_list = [pd.read_csv(Path(root).parent.parent.joinpath(f"{year}/game_by_game_cleaned/{f}")) for year in years]
            df_team = pd.concat(df_list, ignore_index=True)
            game_by_game_cleaned_path = Path(root).parent.parent.parent.joinpath("combined/game_by_game_cleaned_combined/")
            game_by_game_cleaned_path.mkdir(parents=True, exist_ok=True)
            df_team.to_csv(game_by_game_cleaned_path.joinpath(f), index=False)
        except:
            print(f"Failed to process data for {f}")
            failed.append(f)


Failed to process data for LIU.csv
Failed to process data for LMU.csv
Failed to process data for Abilene Christian.csv
Failed to process data for Grand Canyon.csv
Failed to process data for Coastal Carolina.csv
Failed to process data for Kansas City.csv
Failed to process data for South Carolina St..csv
Failed to process data for Saint Peter's.csv
Failed to process data for UIW.csv


In [28]:
LIU = ["LIU.csv", "LIU Brooklyn.csv", "LIU Brooklyn.csv", "LIU.csv"]
CoastalCarolina = ["Coastal Carolina.csv", "Coastal Caro..csv", "Coastal Caro..csv", "Coastal Carolina.csv"]

tuple1 = zip(CoastalCarolina, years)
df_list = [pd.read_csv(Path(data_path).joinpath(f"processed/{tuple_[1]}/game_by_game_cleaned/{tuple_[0]}")) for tuple_ in tuple1]
df_team = pd.concat(df_list, ignore_index=True)
game_by_game_cleaned_path = Path(data_path).joinpath(f"combined/game_by_game_cleaned_combined/Coastal Carolina.csv")
df_team.to_csv(game_by_game_cleaned_path, index=False)
print("Done!")

tuple1 = zip(LIU, years)
df_list = [pd.read_csv(Path(data_path).joinpath(f"processed/{tuple_[1]}/game_by_game_cleaned/{tuple_[0]}")) for tuple_ in tuple1]
df_team = pd.concat(df_list, ignore_index=True)
game_by_game_cleaned_path = Path(data_path).joinpath(f"combined/game_by_game_cleaned_combined/LIU.csv")
df_team.to_csv(game_by_game_cleaned_path, index=False)
print("Done!")


Done!
Done!


In [45]:
def transform_team_data(input_dir, output_dir, tf):
    for root, dirs, files in os.walk(input_dir):
        new_root = Path(output_dir)
        new_root.mkdir(parents=True, exist_ok=True)
        for f in files:
            df = pd.read_csv(Path(root).joinpath(f))
            tf(df)
            df.to_csv(new_root.joinpath(f), index=False)

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]

## Combined(w.r.t. years) SMA Game by Game

In [47]:
window = 10

def sma(df):
    df[features] = df[features].rolling(window, min_periods=1).mean()

transform_team_data(
    input_dir=f'../../data/ncaa/combined/game_by_game_cleaned_combined',
    output_dir=f'../../data/ncaa/combined/game_by_game_{window}_sma_combined',
    tf=sma,
)


## Combined(w.r.t. years) game by game CMA

In [50]:
def cma(df):
    df[features] = df[features].expanding().mean()

transform_team_data(
    input_dir=f'../../data/ncaa/combined/game_by_game_cleaned_combined',
    output_dir=f'../../data/ncaa/combined/game_by_game_cma_combined',
    tf=cma,
)

## Combined(w.r.t. years) game by game EWM

In [51]:
alpha = 0.2

def ewm(df):
    df[features] = df[features].ewm(alpha=alpha).mean()

transform_team_data(
    input_dir=f'../../data/ncaa/combined/game_by_game_cleaned_combined',
    output_dir=f'../../data/ncaa/combined/game_by_game_{alpha}_ewm_combined',
    tf=ewm,
)

In [3]:
def clean_name(name):
    name = name.replace('\"', '')
    if '@' in name:
        if name.index('@') == 0:
            return name[2:]
        else:
            return name[:name.index('@')-1]
    else:
        return name

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
combined_features = ["Date", "TeamA", "TeamB", "Result", "S", "Team A Kills", "Team A Errors", "Team A Total Attacks", "Team A Hit Pct", "Team A Assists", "Team A Aces", "Team A SErr", "Team A Digs", "Team A RErr", "Team A Block Solos", "Team A Block Assists", "Team A BErr", "Team A PTS", "Team B Kills", "Team B Errors", "Team B Total Attacks", "Team B Hit Pct", "Team B Assists", "Team B Aces", "Team B SErr", "Team B Digs", "Team B RErr", "Team B Block Solos", "Team B Block Assists", "Team B BErr", "Team B PTS"]

def combine(input_path, output_path):
    print("Combining data directly for team matches ...", end=' ')
    dfs = []
    team_names = []
    for root, dirs, files in os.walk(input_path):
        for f in files:
            team_names.append(f[:-4])
            dfs.append(pd.read_csv(Path(root).joinpath(f)))

    data = []

    err_a = 0
    err_b = 0

    for i, name in enumerate(team_names):
        df = dfs[i]
        for _, TeamA_row in df.iterrows(): 
            date = TeamA_row["Date"]
            TeamA = name
            TeamB = clean_name(TeamA_row["Opponent"])
            Result = 1 if TeamA_row["Result"][0] == 'W' else 0
            S = TeamA_row["S"]
            TeamA_stats = TeamA_row[features]
            try:
                TeamB_df = dfs[team_names.index(TeamB)]
            except:
                err_a += 1
                continue
            try:
                TeamB_row = TeamB_df[(TeamB_df["Date"] == date) & TeamB_df["Opponent"].str.contains(TeamA)].reset_index().loc[0]
            except:
                err_b += 1
                continue

            TeamB_stats = TeamB_row[features]
            data.append([date, TeamA, TeamB, Result, S, *TeamA_stats, *TeamB_stats])
        
    combined_df = pd.DataFrame(data, columns=combined_features)
    combined_df.to_csv(output_path, index=False)
    results = dict(df_length=len(combined_df), err_a=err_a, err_b=err_b)
    print(f"Done! Results: {results}")
    return results


def prev_combine(input_path, output_path):
    print("Combing data using cumulatives for team matches ...", end=' ')
    dfs = []
    team_names = []
    for root, _, files in os.walk(input_path):
        for f in files:
            team_names.append(f[:-4])
            dfs.append(pd.read_csv(Path(root).joinpath(f)))

    data = []

    err_a = 0
    err_b = 0

    for i, name in enumerate(team_names):
        df = dfs[i]
        for j in range(len(df)):
            if j == 0:
                continue
            TeamA_row = df.loc[j-1]
            date = TeamA_row["Date"]
            TeamA = name
            TeamB = clean_name(TeamA_row["Opponent"])
            Result = 1 if TeamA_row["Result"][0] == 'W' else 0
            S = TeamA_row["S"]
            TeamA_stats = TeamA_row[features]
            try:
                TeamB_df = dfs[team_names.index(TeamB)]
            except:
                err_a += 1
                continue
            try:
                TeamB_row_index = TeamB_df[(TeamB_df["Date"] == date) & TeamB_df["Opponent"].str.contains(TeamA)].index[0]
                if TeamB_row_index == 0:
                    continue
                TeamB_row = TeamB_df.loc[TeamB_row_index-1]
            except:
                err_b += 1
                continue

            TeamB_stats = TeamB_row[features]
            data.append([date, TeamA, TeamB, Result, S, *TeamA_stats, *TeamB_stats])
        
    combined_df = pd.DataFrame(data, columns=combined_features)
    combined_df.to_csv(output_path, index=False)
    results = dict(df_length=len(combined_df), err_a=err_a, err_b=err_b)
    print(f"Done! Results: {results}. Data stored at {output_path}")
    return results

## Compile all the yearwise combined data into a single dataframe with SMA

In [55]:
window = 10
prev_combine(
    input_path=f'../../data/ncaa/combined/game_by_game_{window}_sma_combined',
    output_path=f'../../data/ncaa/combined/accumulated/{window}_sma_combined.csv',
)

Combing data using cumulatives for team matches ... Done! Results: {'df_length': 36348, 'err_a': 2042, 'err_b': 246}. Data stored at ../../data/ncaa/combined/accumulated/10_sma_combined.csv


{'df_length': 36348, 'err_a': 2042, 'err_b': 246}

## Compile all the yearwise combined data into a single dataframe with CMA

In [57]:
prev_combine(
            input_path=f'../../data/ncaa/combined/game_by_game_cma_combined',
            output_path=f'../../data/ncaa/combined/accumulated/cma_combined.csv',
        )

Combing data using cumulatives for team matches ... Done! Results: {'df_length': 36348, 'err_a': 2042, 'err_b': 246}. Data stored at ../../data/ncaa/combined/accumulated/cma_combined.csv


{'df_length': 36348, 'err_a': 2042, 'err_b': 246}

## Compile all the yearwise combined data into a single dataframe with EWM

In [4]:
alpha = 0.2
prev_combine(
    input_path=f'../../data/ncaa/combined/game_by_game_{alpha}_ewm_combined',
    output_path=f'../../data/ncaa/combined/accumulated/{alpha}_ewm_combined.csv',
)

Combing data using cumulatives for team matches ... Done! Results: {'df_length': 36348, 'err_a': 2042, 'err_b': 246}. Data stored at ../../data/ncaa/combined/accumulated/0.2_ewm_combined.csv


{'df_length': 36348, 'err_a': 2042, 'err_b': 246}

# Combining Player Data (w.r.t. years)

## Moving the files out of the team directory

In [3]:
years = [2019, 2018, 2017, 2016]
root_path = Path(f"../../data/ncaa/processed/")
failed_list = []
for year in years:
    for _, dirs, files in os.walk(f"../../data/ncaa/processed/{year}/player_game_wise_cleaned"):
        outpath = Path(f"../../data/ncaa/combined/{year}/player_game_wise_cleaned")
        outpath.mkdir(parents=True, exist_ok=True)
        for d in dirs:
            for sub_root, _, sub_files in os.walk(root_path.joinpath(f"{year}/player_game_wise_cleaned/{d}")):
                for f in sub_files:
                    print(f"Copying {sub_root}/{f}")
                    f_path = f"{sub_root}/{f}"
                    shutil.copy(f_path, outpath.joinpath(f"{f}"))

final_outpath = Path(f"../../data/ncaa/combined/player_game_wise_cleaned_combined")
final_outpath.mkdir(parents=True, exist_ok=True)

file_dir = "../../data/ncaa/combined/2019"
not_found_log = []
for _, _, files in os.walk(file_dir):
    for f in files:
        df_list = []
        for year in years:
            try:
                df1 = pd.read_csv(Path(file_dir).parent.joinpath(f"{year}/player_game_wise_cleaned/{f}"))
                df_list.append(df1)
            except:
                error = f"{f} not found for {year}"
                not_found_log.append(error)
        final_df = pd.concat(df_list, ignore_index=True)
        final_df.to_csv(final_outpath.joinpath(f"{f}"), index=False)
print("\n")        
for error in not_found_log:
    print(error)
        
for year in years:
    path = Path(file_dir).parent.joinpath(f"{year}/player_game_wise_cleaned")
    shutil.rmtree(path)
    os.rmdir(path)
                

Marisa.csv no found for 2016
Cornist, Kerra.csv no found for 2017
Cornist, Kerra.csv no found for 2016
Coates, Mackenzie.csv no found for 2016
Brown, Madison.csv no found for 2016
Migliore, Mia.csv no found for 2018
Migliore, Mia.csv no found for 2017
Migliore, Mia.csv no found for 2016
Hamilton, Abby.csv no found for 2017
Hamilton, Abby.csv no found for 2016
Leblanc, Renee.csv no found for 2016
Sisic, Isidora.csv no found for 2018
Sisic, Isidora.csv no found for 2017
Sisic, Isidora.csv no found for 2016
Thomison, Ryann.csv no found for 2016
Bramschreiber, Shanel.csv no found for 2017
Bramschreiber, Shanel.csv no found for 2016
Nomura, Kanile'a.csv no found for 2016
Weiby, Sarah.csv no found for 2018
Weiby, Sarah.csv no found for 2017
Weiby, Sarah.csv no found for 2016
Karlen, Miranda.csv no found for 2016
Strobert, Briana.csv no found for 2018
Strobert, Briana.csv no found for 2017
Strobert, Briana.csv no found for 2016
Wehrheim, Annie.csv no found for 2016
Fuller, Haley.csv no found 

In [2]:
def transform_combined_player_data(input_dir, output_dir, tf):
    for root, _, player_files in os.walk(input_dir):
        for player_file in player_files:
            df = pd.read_csv(Path(root).joinpath(player_file))
            outpath = Path(output_dir)
            tf(df)
            outpath.mkdir(parents=True, exist_ok=True)
            df.to_csv(outpath.joinpath(player_file), index=False)

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]

## SMA on the combined player data


In [5]:
window = 10

def sma(df):
    df[features] = df[features].rolling(window, min_periods=1).mean()


transform_combined_player_data(
    input_dir=f'../../data/ncaa/combined/player_game_wise_cleaned_combined/',
    output_dir=f'../../data/ncaa/combined/player_game_wise_{window}_sma_combined/',
    tf=sma
)

## CMA on the combined player data

In [3]:
def cma(df):
    df[features] = df[features].expanding().mean()

transform_combined_player_data(
    input_dir=f'../../data/ncaa/combined/player_game_wise_cleaned_combined/',
    output_dir=f'../../data/ncaa/combined/player_game_wise_cma_combined/',
    tf=cma
)

## EWM on the combined player data

In [5]:
alpha = 0.2

def ewm(df):
    df[features] = df[features].ewm(alpha=alpha).mean()

transform_combined_player_data(
    input_dir=f'../../data/ncaa/combined/player_game_wise_cleaned_combined/',
    output_dir=f'../../data/ncaa/combined/player_game_wise_{alpha}_ewm_combined/',
    tf=ewm
)

## Making common data frame out of the combined team and player data

In [2]:
def clean_name(name):
    if '@' in name:
        if name.index('@') == 0:
            return name[2:]
        else:
            return name[:name.index('@')-1]
    else:
        return name

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
player_features = [f"Player {j} {f}" for f in features for j in range(12)]
combined_features = [
"Date", "TeamA", "TeamB", "Result", "S", "Team A Kills", "Team A Errors", "Team A Total Attacks", "Team A Hit Pct", "Team A Assists", "Team A Aces", "Team A SErr", "Team A Digs",  "Team A RErr", "Team A Block Solos", "Team A Block Assists", "Team A BErr", "Team A PTS", "Team B Kills", "Team B Errors", "Team B Total Attacks", "Team B Hit Pct", "Team B Assists", "Team B Aces", "Team B SErr", "Team B Digs", "Team B RErr", "Team B Block Solos", "Team B Block Assists", "Team B BErr", "Team B PTS",
*[f"Team A {s}" for s in player_features],
*[f"Team B {s}" for s in player_features]
]

def combine_with_player(player_input_path, team_stats_paths, team_matches_path, macthes_with_player_info_path, combined_output_path):
    print(f"Combining player data for individual teams into {combined_output_path} -")

    print("\tBuilding team index ...", end=' ')
    team_names = []
    for root, _, files in os.walk(team_matches_path):
        for f in files:
            team_names.append(f[:-4])
    print("Done!")
    print(team_names)

    player_input_path = Path(player_input_path)
    team_stats_paths = [Path(team_stats_path) for team_stats_path in team_stats_paths]
    team_matches_path = Path(team_matches_path)
    macthes_with_player_info_path = Path(macthes_with_player_info_path)
    macthes_with_player_info_path.mkdir(exist_ok=True, parents=True)

    flag = False
    length = len(team_names)
    print("\tSorting team data ...", end=' ')
    for i, name in enumerate(team_names):
        print(f"Processing {i}/{length}")
        if name == "LIU":
            alternate_name = "LIU Brooklyn"
        elif name == "Coastal Carolina":
            alternate_name = "Coastal Caro."
        else:
            alternate_name = name

        team_matches_df = pd.read_csv(team_matches_path.joinpath(f"{name}.csv"))
        team_matches_df['Date']= team_matches_df['Date'].map(str)
        team_stats_dfs = []
        for i, team_stats_path in enumerate(team_stats_paths):
            if i == 1 or i == 2:
                df_read = pd.read_csv(team_stats_path.joinpath(f"{alternate_name}.csv"))
                team_stats_dfs.append(df_read)
            else:
                df_read = pd.read_csv(team_stats_path.joinpath(f"{name}.csv"))
                team_stats_dfs.append(df_read)
                
        top_player_names_list = []
        for team_stats_df in team_stats_dfs:
            top_player_names = []
            for j, (_, player_row) in enumerate(team_stats_df[(team_stats_df["Player"] != "TEAM") & (team_stats_df["Player"] != "Totals") & (team_stats_df.Player != "Opponent Totals")].sort_values(by=["GP"], ascending=False).iterrows()):
                top_player_names.append(player_row["Player"])
                if j == 11:
                    top_player_names_list.append(top_player_names)
                    break
            if len(top_player_names) < 12:
                print(f"Could not get enough players for {name}!")
                flag = True
                break
        if flag :
            continue
                

        for j, (player_2019, player_2018, player_2017, player_2016) in enumerate(zip(*top_player_names_list)):
            try:
                team_matches_df_2019 = team_matches_df[team_matches_df.Date.str.contains('2019', case=False)]
                team_matches_df_2019[[f"Player {j} {f}" for f in features]] = pd.read_csv(player_input_path.joinpath(f"{player_2019}.csv"))[features]
            except:
                print(f"\nFailed to get player {player_2019}!")
                continue

            try:
                team_matches_df_2018 = team_matches_df[team_matches_df.Date.str.contains('2018', case=False)]
                team_matches_df_2018[[f"Player {j} {f}" for f in features]] = pd.read_csv(player_input_path.joinpath(f"{player_2018}.csv"))[features]
            except:
                print(f"\nFailed to get player {player_2018}!")
                continue

            try:
                team_matches_df_2017 = team_matches_df[team_matches_df.Date.str.contains('2017', case=False)]
                team_matches_df_2017[[f"Player {j} {f}" for f in features]] = pd.read_csv(player_input_path.joinpath(f"{player_2017}.csv"))[features]
            except:
                print(f"\nFailed to get player {player_2017}!")
                continue 

            try:
                team_matches_df_2016 = team_matches_df[team_matches_df.Date.str.contains('2016', case=False)]
                team_matches_df_2016[[f"Player {j} {f}" for f in features]] = pd.read_csv(player_input_path.joinpath(f"{player_2016}.csv"))[features]
            except:
                print(f"\nFailed to get player {player_2016}!")
                continue

            team_matches_df_list = [team_matches_df_2019, team_matches_df_2018, team_matches_df_2017, team_matches_df_2016]
            team_matches_df = pd.concat(team_matches_df_list, ignore_index=True)

        team_matches_df.to_csv(macthes_with_player_info_path.joinpath(f"{name}.csv"), index=False)        

    print("Done!")

    print("\tGetting match wise dataframes ...", end=' ')
    dfs = []
    team_names = []
    for root, _, files in os.walk(macthes_with_player_info_path):
        for f in files:
            team_names.append(f[:-4])
            dfs.append(pd.read_csv(Path(root).joinpath(f)))
    print(f"Collected {len(dfs)} dataframes. Done!")

    err_a, err_b = 0, 0
    data = []
    print("\tCombining into a single df ...", end=' ')
    for i in range(len(dfs)):
        name = team_names[i]
        df = dfs[i]
        for j in range(len(df)):
            if j == 0:
                continue
            TeamA_row = df.loc[j-1]
            date = TeamA_row["Date"]
            TeamA = name
            TeamB = clean_name(TeamA_row["Opponent"])
            Result = 1 if TeamA_row["Result"][0] == 'W' else 0
            S = TeamA_row["S"]
            TeamA_stats = TeamA_row[features + player_features]
            try:
                TeamB_df = dfs[team_names.index(TeamB)]
            except:
                err_a += 1
                continue
            try:
                TeamB_row_index = TeamB_df[(TeamB_df["Date"] == date) & TeamB_df["Opponent"].str.contains(TeamA)].index[0]
                if TeamB_row_index == 0:
                    continue
                TeamB_row = TeamB_df.loc[TeamB_row_index-1]
            except:
                err_b += 1
                continue

            TeamB_stats = TeamB_row[features + player_features]
            data.append([date, TeamA, TeamB, Result, S, *TeamA_stats, *TeamB_stats])

    combined_df = pd.DataFrame(data, columns=combined_features)
    combined_df.to_csv(combined_output_path, index=False)
    results = dict(df_len=len(combined_df), err_a=err_a, err_b=err_b)
    print(f"Done! Results = {results}")
    return results

In [3]:
data_path = Path("../../data/ncaa")
years = [2019, 2018, 2017, 2016]
team_stats_paths = [data_path.joinpath(f"raw/{year}/team_stats") for year in years]

## Player + Team Combined SMA

In [4]:
window = 10

combine_with_player(
            player_input_path=data_path.joinpath(f"combined/player_game_wise_{window}_sma_combined"),
            team_stats_paths=team_stats_paths,
            team_matches_path=data_path.joinpath(f"combined/game_by_game_{window}_sma_combined"),
            macthes_with_player_info_path=data_path.joinpath(f"combined/game_by_game_with_players_{window}_sma_combined"),
            combined_output_path=data_path.joinpath(f"combined/accumulated/{window}_sme_with_players_combined.csv"),
        )

, 'Chattanooga', 'Brown', 'Liberty', 'UNC Greensboro', 'Northern Ky.', 'Idaho', 'St. Francis Brooklyn', 'Colorado St.', 'Fla. Atlantic', 'Elon']
	Sorting team data ... Processing 0/324

Failed to get player Yeargin, Callie!

Failed to get player Phillips, Lauren!

Failed to get player Shearer, Rebecca!

Failed to get player Tellschow, Brianna!

Failed to get player Yeargin, Callie!

Failed to get player Arnold, Dominique!

Failed to get player Duncan, Payton!

Failed to get player Yeargin, Callie!

Failed to get player Haake, Madison!

Failed to get player Deaville, Morgan!

Failed to get player Haake, Madison!

Failed to get player Shearer, Rebecca!
Processing 1/324

Failed to get player Harvell, Kailey!

Failed to get player Hayes, Keely!

Failed to get player Gamble, Ally!

Failed to get player Gamble, Ally!

Failed to get player Wilson-Talmadge, Kennedy!

Failed to get player Pearson, Caroline!

Failed to get player Carter, Annie!

Failed to get player Watts, Maddie!

Failed to get

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['Player 0 Kills', 'Player 1 Kills', 'Player 2 Kills', 'Player 3 Kills',\n       'Player 4 Kills',\n       ...\n       'Player 7 PTS', 'Player 8 PTS', 'Player 9 PTS', 'Player 10 PTS',\n       'Player 11 PTS'],\n      dtype='object', length=156). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

## CMA

In [None]:
combine_with_player(
            player_input_path=data_path.joinpath(f"combined/player_game_wise_cma_combined"),
            team_stats_paths=team_stats_paths,
            team_matches_path=data_path.joinpath(f"combined/game_by_game_cma_combined"),
            macthes_with_player_info_path=data_path.joinpath(f"combined/game_by_game_with_players_cma_combined"),
            combined_output_path=data_path.joinpath(f"combined/accumulated/cma_with_players_combined.csv"),
        )

## EWM

In [None]:
alpha = 0.2

combine_with_player(
    player_input_path="/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/combined/player_game_wise_0.2_ewm_combined",
    team_stats_path=team_stats_paths,
    team_matches_path="/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/combined/game_by_game_0.2_ewm_combined",
    macthes_with_player_info_path="/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/combined/game_by_game_with_players_0.2_ewm_combined",
    combined_output_path="/home/veds12/Desktop/Projects/volleyball-ml/data/ncaa/combined/accumulated/0.2_ewm_with_players_combined.csv",
)