# Clean and organising the raw data scraped from https://stats.ncaa.org/

Author: Atharv Sonwane

In [1]:
import pandas as pd
import os
import sys
from pathlib import Path
import requests
from tqdm import tnrange

# Clean Data

In [28]:
year = 2019
for root, dirs, files in os.walk(f'../../data/ncaa/raw/{year}/team_game_by_game/'):
    for i in tnrange(len(files), desc='Cleaning data'):
        f = files[i]
        df = pd.read_csv(Path(root).joinpath(f), header=1)
        if year >= 2018:
            df.drop(columns=["MP", "Attend", "BHE", "Unnamed: 20"], inplace=True)
        else:
            df.drop(labels=["MP", "BHE", "Unnamed: 19"], axis=1, inplace=True)
        df.replace({'/':''}, regex=True, inplace=True)
        df.fillna(0, inplace=True)
        df[["Kills", "Errors", "Total Attacks", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr"]] = df[["Kills", "Errors", "Total Attacks", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr"]].astype(int)
        outpath = Path(root).parent.parent.parent.joinpath(f"processed/{year}/game_by_game_cleaned/")
        outpath.mkdir(parents=True, exist_ok=True)
        f = f[:f.find('(') - 1] + ".csv"
        df.to_csv(outpath.joinpath(f), index=False)

HBox(children=(HTML(value='Cleaning data'), FloatProgress(value=0.0, max=332.0), HTML(value='')))




# Computing Moving Averages

## Simple Moving Average

In [3]:
window = 10
year = 2019

for root, dirs, files in os.walk(f'../../data/ncaa/processed/{year}/game_by_game_cleaned'):
    new_root = Path(root).parent.joinpath(f"game_by_game_{window}_sma")
    new_root.mkdir(parents=True, exist_ok=True)
    for f in files:
        df = pd.read_csv(Path(root).joinpath(f))
        features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
        df[features] = df[features].rolling(window, min_periods=1).mean()
        df.to_csv(new_root.joinpath(f), index=False)

## Cumulative Moving Average

In [4]:
year = 2019

for root, dirs, files in os.walk(f'../../data/ncaa/processed/{year}/game_by_game_cleaned'):
    new_root = Path(root).parent.joinpath("game_by_game_cma")
    new_root.mkdir(parents=True, exist_ok=True)
    for f in files:
        df = pd.read_csv(Path(root).joinpath(f))
        features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
        df[features] = df[features].expanding().mean()
        df.to_csv(new_root.joinpath(f), index=False)

## Exponential Moving Average

In [5]:
year = 2019
alpha = 0.2

for root, dirs, files in os.walk(f'../../data/ncaa/processed/{year}/game_by_game_cleaned'):
    new_root = Path(root).parent.joinpath(f"game_by_game_{alpha}_ewm")
    new_root.mkdir(parents=True, exist_ok=True)
    for f in files:
        df = pd.read_csv(Path(root).joinpath(f))
        features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
        df[features] = df[features].ewm(alpha=alpha).mean()
        df.to_csv(new_root.joinpath(f), index=False)

# Combine into single dataframe of matches

## Utility Function

In [6]:
def clean_name(name):
    if '@' in name:
        if name.index('@') == 0:
            return name[2:]
        else:
            return name[:name.index('@')-1]
    else:
        return name

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
combined_features = ["Date", "TeamA", "TeamB", "Result", "S", "Team A Kills", "Team A Errors", "Team A Total Attacks", "Team A Hit Pct", "Team A Assists", "Team A Aces", "Team A SErr", "Team A Digs", "Team A RErr", "Team A Block Solos", "Team A Block Assists", "Team A BErr", "Team A PTS", "Team B Kills", "Team B Errors", "Team B Total Attacks", "Team B Hit Pct", "Team B Assists", "Team B Aces", "Team B SErr", "Team B Digs", "Team B RErr", "Team B Block Solos", "Team B Block Assists", "Team B BErr", "Team B PTS"]


def combine(input_path, output_path):
    dfs = []
    team_names = []
    for root, dirs, files in os.walk(input_path):
        for f in files:
            team_names.append(f[:-4])
            dfs.append(pd.read_csv(Path(root).joinpath(f)))

    data = []

    err_a = 0
    err_b = 0

    for i, name in enumerate(team_names):
        df = dfs[i]
        for j, TeamA_row in df.iterrows(): 
            date = TeamA_row["Date"]
            TeamA = name
            TeamB = clean_name(TeamA_row["Opponent"])
            Result = 1 if TeamA_row["Result"][0] == 'W' else 0
            S = TeamA_row["S"]
            TeamA_stats = TeamA_row[features]
            try:
                TeamB_df = dfs[team_names.index(TeamB)]
            except:
                err_a += 1
                continue
            try:
                TeamB_row = TeamB_df[TeamB_df["Date"] == date][TeamB_df["Opponent"].str.contains(TeamA)].reset_index().loc[0]
            except:
                err_b += 1
                continue

            TeamB_stats = TeamB_row[features]
            data.append([date, TeamA, TeamB, Result, S, *TeamA_stats, *TeamB_stats])
        
    combined_df = pd.DataFrame(data, columns=combined_features)
    combined_df.to_csv(output_path, index=False)
    return combined_df, dict(err_a=err_a, err_b=err_b)

def prev_combine(input_path, output_path):
    dfs = []
    team_names = []
    for root, dirs, files in os.walk(input_path):
        for f in files:
            team_names.append(f[:-4])
            dfs.append(pd.read_csv(Path(root).joinpath(f)))

    data = []

    err_a = 0
    err_b = 0

    for i, name in enumerate(team_names):
        df = dfs[i]
        for j in range(len(df)):
            if j == 0:
                continue
            TeamA_row = df.loc[j-1]
            date = TeamA_row["Date"]
            TeamA = name
            TeamB = clean_name(TeamA_row["Opponent"])
            Result = 1 if TeamA_row["Result"][0] == 'W' else 0
            S = TeamA_row["S"]
            TeamA_stats = TeamA_row[features]
            try:
                TeamB_df = dfs[team_names.index(TeamB)]
            except:
                err_a += 1
                continue
            try:
                TeamB_row_index = TeamB_df[TeamB_df["Date"] == date][TeamB_df["Opponent"].str.contains(TeamA)].index[0]
                if TeamB_row_index == 0:
                    continue
                TeamB_row = TeamB_df.loc[TeamB_row_index-1]
            except:
                err_b += 1
                continue

            TeamB_stats = TeamB_row[features]
            data.append([date, TeamA, TeamB, Result, S, *TeamA_stats, *TeamB_stats])
        
    combined_df = pd.DataFrame(data, columns=combined_features)
    combined_df.to_csv(output_path, index=False)
    return combined_df, dict(err_a=err_a, err_b=err_b)



## Combine dataframe of math by match result without any averages

In [7]:
year = 2019
input_path = f'../../data/ncaa/processed/{year}/game_by_game_cleaned'
output_path = f'../../data/ncaa/processed/{year}/accumulated/matches_gathered.csv'

matches_gathered_df, info = combine(input_path, output_path)
print(info, len(matches_gathered_df))

{'err_a': 368, 'err_b': 59} 9533


## Combine dataframe for Simple Moving Average

In [8]:
year = 2019
window = 10
input_path = f'../../data/ncaa/processed/{year}/game_by_game_{window}_sma'
output_path = f'../../data/ncaa/processed/{year}/accumulated/{window}_sma.csv'

sma_df, info = prev_combine(input_path, output_path)
print(info, len(sma_df))

{'err_a': 361, 'err_b': 57} 8887


## Combine dataframe for Cumulative Moving Average

In [9]:
year = 2019
input_path = f'../../data/ncaa/processed/{year}/game_by_game_cma'
output_path = f'../../data/ncaa/processed/{year}/accumulated/cma.csv'

cma_df, info = prev_combine(input_path, output_path)
print(info, len(cma_df))

{'err_a': 361, 'err_b': 57} 8887


## Combine dataframe for Exponentially Moving Average

In [10]:
year = 2019
alpha = 0.2
input_path = f'../../data/ncaa/processed/{year}/game_by_game_{alpha}_ewm'
output_path = f'../../data/ncaa/processed/{year}/accumulated/{alpha}_ewm.csv'

ewm_df, info = prev_combine(input_path, output_path)
print(info, len(ewm_df))

{'err_a': 361, 'err_b': 57} 8887


# Clean Player Data

In [24]:
def store_player(url, path):
    print(f"Fetching for {Path(path).name[:-4]} ...", end=' ')
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"})
    pd.read_html(r.text)[-1].drop(labels=[0], axis=0).to_csv(path, index=False)
    print("Done!")

urls = [
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=51&stats_player_seq=1906568",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=77&stats_player_seq=2020914",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=136&stats_player_seq=2259847",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=196&stats_player_seq=2199752",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=255&stats_player_seq=2206497",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=648&stats_player_seq=2199107",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=141&stats_player_seq=1920987",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=731&stats_player_seq=1787509",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=782&stats_player_seq=2210530",
    "https://stats.ncaa.org/player/index?game_sport_year_ctl_id=14942&org_id=731&stats_player_seq=2020474",
]
paths = [
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Baylor (Big 12)/Lockin, Hannah.csv",
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/BYU (WCC)/Tausinga, Tayler.csv",
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Chicago St. (WAC)/Sisic, Isadora.csv",
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/East Carolina (AAC)/Garcia, Jaylibeth.csv",
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Georgia Tech (ACC)/Lamborda, Paola.csv",
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/South Carolina (SEC)/Covas Córdova, Camilla.csv",
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/The Citadel (SoCon)/Jesus, Sharlissa.csv",
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Utah St. (Mountain West)/Olson-Shepherd, Madi.csv",
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Wichita St. (AAC)/Uluave, Sina.csv",
    "/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/player_game_wise/Utah St. (Mountain West)/Solosabal, Whitney.csv",
]

for url, path in zip(urls, paths):
    store_player(url, path)

Fetching for Lockin, Hannah ... Done!
Fetching for Tausinga, Tayler ... Done!
Fetching for Sisic, Isadora ... Done!
Fetching for Garcia, Jaylibeth ... Done!
Fetching for Lamborda, Paola ... Done!
Fetching for Covas Córdova, Camilla ... Done!
Fetching for Jesus, Sharlissa ... Done!
Fetching for Olson-Shepherd, Madi ... Done!
Fetching for Uluave, Sina ... Done!
Fetching for Solosabal, Whitney ... Done!


In [2]:
year = 2019
verbose = False

for root, dirs, _ in os.walk(f'../../data/ncaa/raw/{year}/player_game_wise/'):
    for team_dir in dirs:
        team_root, _, player_files = list(os.walk(Path(root).joinpath(team_dir)))[0]
        team_name = Path(team_root).name
        team_name = team_name[:team_name.find('(') - 1]
        if verbose:
            print(f"Cleaning player data for {team_name} ...")
        for player_file in player_files:
            if verbose:
                print(f"\tCleaning data for player {player_file[:-4]} ...", end=' ')
            outpath = Path(root).parent.parent.parent.joinpath(f"processed/{year}/player_game_wise_cleaned/{team_name}")
            if outpath.joinpath(player_file).is_file():
                if verbose:
                    print("Already Exists!")
                continue
            try:
                df = pd.read_csv(Path(team_root).joinpath(player_file), header=1)
            except:
                print(f"{Path(team_root).joinpath(player_file)} Failed!")
                continue
            
            df.drop(columns=["MP", "Attend", "BHE", "Unnamed: 20"], inplace=True)
            df.replace({'/':''}, regex=True, inplace=True)
            df.fillna(0, inplace=True)
            df[["Kills", "Errors", "Total Attacks", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr"]] = df[["Kills", "Errors", "Total Attacks", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr"]].astype(int)
            outpath.mkdir(parents=True, exist_ok=True)
            df.to_csv(outpath.joinpath(player_file), index=False)
            if verbose:
                print("Done!")

for root, dirs, _ in os.walk('/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned'):
    for team_dir in dirs:
        for team_root, _, player_files in os.walk(Path(root).joinpath(team_dir)):
            for player_file in player_files:
                if player_file[player_file.find(".csv") - 1] == ' ':
                    cp = Path(team_root).joinpath(player_file)
                    n = player_file[:player_file.find(".csv") - 1] + ".csv"
                    print(cp.rename(cp.parent.joinpath(n)))

/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned/Abilene Christian/Browning, Logan.csv
/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned/Akron/Abramson, Rachel.csv
/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned/Akron/Kelly, Megan.csv
/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned/Akron/Latka, Lauren.csv
/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned/Alabama A&M/Battle, Maci.csv
/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned/Alabama St./Weatherwax, Morgan.csv
/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned/Ark.-Pine Bluff/Akamine, Nikole.csv
/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned/Ark.-Pine Bluff/Armstrong, Bri-Anna.csv
/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_cleaned/Ark.-Pine 

## SMA

In [22]:
year = 2019
window = 10

for root, dirs, _ in os.walk(f'../../data/ncaa/processed/{year}/player_game_wise_cleaned/'):
    for team_dir in dirs:
        team_root, _, player_files = list(os.walk(Path(root).joinpath(team_dir)))[0]
        team_name = Path(team_root).name
        team_name = team_name[:team_name.find('(') - 1]
        for player_file in player_files:
            outpath = Path(root).parent.parent.parent.joinpath(f"processed/{year}/player_game_wise_{window}_sma/{team_name}")
            df = pd.read_csv(Path(team_root).joinpath(player_file))
            features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
            df[features] = df[features].rolling(window, min_periods=1).mean()
            outpath.mkdir(parents=True, exist_ok=True)
            df.to_csv(outpath.joinpath(player_file), index=False)	

## CMA

In [23]:
year = 2019

for root, dirs, _ in os.walk(f'../../data/ncaa/processed/{year}/player_game_wise_cleaned/'):
    for team_dir in dirs:
        team_root, _, player_files = list(os.walk(Path(root).joinpath(team_dir)))[0]
        team_name = Path(team_root).name
        for player_file in player_files:
            outpath = Path(root).parent.parent.parent.joinpath(f"processed/{year}/player_game_wise_cma/{team_name}")
            df = pd.read_csv(Path(team_root).joinpath(player_file))
            features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
            df[features] = df[features].expanding().mean()
            outpath.mkdir(parents=True, exist_ok=True)
            df.to_csv(outpath.joinpath(player_file), index=False)	

## EWM

In [5]:
year = 2019
alpha = 0.2

for root, dirs, _ in os.walk(f'../../data/ncaa/processed/{year}/player_game_wise_cleaned/'):
    for team_dir in dirs:
        team_root, _, player_files = list(os.walk(Path(root).joinpath(team_dir)))[0]
        team_name = Path(team_root).name
        for player_file in player_files:
            outpath = Path(root).parent.parent.parent.joinpath(f"processed/{year}/player_game_wise_{alpha}_ewm/{team_name}")
            df = pd.read_csv(Path(team_root).joinpath(player_file))
            features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
            df[features] = df[features].ewm(alpha=alpha).mean()
            outpath.mkdir(parents=True, exist_ok=True)
            df.to_csv(outpath.joinpath(player_file), index=False)	

# Combining into single dataframe

In [6]:
def clean_name(name):
    if '@' in name:
        if name.index('@') == 0:
            return name[2:]
        else:
            return name[:name.index('@')-1]
    else:
        return name

features = ["Kills", "Errors", "Total Attacks", "Hit Pct", "Assists", "Aces", "SErr", "Digs", "RErr", "Block Solos", "Block Assists", "BErr", "PTS"]
player_features = [f"Player {j} {f}" for f in features for j in range(12)]
combined_features = [
"Date", "TeamA", "TeamB", "Result", "S", "Team A Kills", "Team A Errors", "Team A Total Attacks", "Team A Hit Pct", "Team A Assists", "Team A Aces", "Team A SErr", "Team A Digs",  "Team A RErr", "Team A Block Solos", "Team A Block Assists", "Team A BErr", "Team A PTS", "Team B Kills", "Team B Errors", "Team B Total Attacks", "Team B Hit Pct", "Team B Assists", "Team B Aces", "Team B SErr", "Team B Digs", "Team B RErr", "Team B Block Solos", "Team B Block Assists", "Team B BErr", "Team B PTS",
*[f"Team A {s}" for s in player_features],
*[f"Team B {s}" for s in player_features]
]


def combine_with_player(player_input_path, team_stats_path, team_matches_path, macthes_with_player_info_path, combined_output_path):
    print("Combining player data for individual teams ...")

    print("Building team index ...", end=' ')
    team_names = []
    for root, _, files in os.walk(team_matches_path):
        for f in files:
            team_names.append(f[:-4])
    print("Done!")

    player_input_path = Path(player_input_path)
    team_stats_path = Path(team_stats_path)
    team_matches_path = Path(team_matches_path)
    macthes_with_player_info_path = Path(macthes_with_player_info_path)
    macthes_with_player_info_path.mkdir(exist_ok=True, parents=True)

    print("Sorting team data ...", end=' ')
    for i, name in enumerate(team_names):
        team_matches_df = pd.read_csv(team_matches_path.joinpath(f"{name}.csv"))
        team_stats_df = pd.read_csv(team_stats_path.joinpath(f"{name}.csv"))
        top_player_names = [] 
        for j, (_, player_row) in enumerate(team_stats_df[team_stats_df.Player != "TEAM"][team_stats_df.Player != "Totals"][team_stats_df.Player != "Opponent Totals"].sort_values(by=["GP"], ascending=False).iterrows()):
            top_player_names.append(player_row["Player"])
            if j == 11:
                break
        try:
            for j, player in enumerate(top_player_names):
                team_matches_df[[f"Player {j} {f}" for f in features]] = pd.read_csv(player_input_path.joinpath(f"{name}/{player}.csv"))[features]
            team_matches_df.to_csv(macthes_with_player_info_path.joinpath(f"{name}.csv"), index=False)        
        except:
            print(f"\nFailed to get player {player} for {name}!")
            continue
    print("Done!")

    print("Getting match wise dataframes ...", end=' ')
    dfs = []
    for root, _, files in os.walk(macthes_with_player_info_path):
        for f in files:
            dfs.append(pd.read_csv(Path(root).joinpath(f)))
    print("Done!")

    err_a, err_b = 0, 0
    data = []
    for i in tnrange(len(dfs), desc="Combining into single df"):
        name = team_names[i]
        df = dfs[i]
        for j in range(len(df)):
            if j == 0:
                continue
            TeamA_row = df.loc[j-1]
            date = TeamA_row["Date"]
            TeamA = name
            TeamB = clean_name(TeamA_row["Opponent"])
            Result = 1 if TeamA_row["Result"][0] == 'W' else 0
            S = TeamA_row["S"]
            TeamA_stats = TeamA_row[features + player_features]
            try:
                TeamB_df = dfs[team_names.index(TeamB)]
            except:
                err_a += 1
                continue
            try:
                TeamB_row_index = TeamB_df[TeamB_df["Date"] == date][TeamB_df["Opponent"].str.contains(TeamA)].index[0]
                if TeamB_row_index == 0:
                    continue
                TeamB_row = TeamB_df.loc[TeamB_row_index-1]
            except:
                err_b += 1
                continue

            TeamB_stats = TeamB_row[features + player_features]
            data.append([date, TeamA, TeamB, Result, S, *TeamA_stats, *TeamB_stats])

    combined_df = pd.DataFrame(data, columns=combined_features)
    combined_df.to_csv(combined_output_path, index=False)
    return dict(df_len=len(combined_df), err_a=err_a, err_b=err_b)



In [7]:
combine_with_player(
    player_input_path="/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/player_game_wise_0.2_ewm",
    team_stats_path="/home/twm/comp/ml/volleyball-ml/data/ncaa/raw/2019/team_stats",
    team_matches_path="/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/game_by_game_0.2_ewm",
    macthes_with_player_info_path="/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/game_by_game_with_players_0.2_ewm",
    combined_output_path="/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/accumulated/0.2_ewm_with_players.csv",
)

Combining player data for individual teams ...
Building team index ... Done!
Sorting team data ... Done!
Getting match wise dataframes ... Done!
331 331


HBox(children=(HTML(value='Combining into single df'), FloatProgress(value=0.0, max=331.0), HTML(value='')))




{'df_len': 8887, 'err_a': 361, 'err_b': 57}

In [21]:
df = pd.read_csv("/home/twm/comp/ml/volleyball-ml/data/ncaa/processed/2019/accumulated/0.2_ewm_with_players.csv")
len(df)

7329