In [1]:
import os
import pandas as pd

# CSV to Parquet
#### Add player names and team names as columns

In [2]:
# Function to read all CSVs in a directory and combine them into one DataFrame
# Includes adding the filename (without extension) as a new column "name"
def combine_csvs_from_folder(folder_path, add_team_name=False, team_name=None):
    """Read every ``.csv`` file directly inside ``folder_path`` into one DataFrame.

    Each file is parsed with ``skiprows=1``: the first line of the file is
    discarded and the following line supplies the column names. A ``name``
    column holding the file name without its extension is added to each frame.

    Parameters
    ----------
    folder_path : str
        Directory to scan (sub-directories are not descended into).
    add_team_name : bool, default False
        When True, also add a ``team_name`` column.
    team_name : str or None, default None
        Value written to ``team_name`` when ``add_team_name`` is True.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files with a fresh integer index, or an
        empty DataFrame when the folder contains no ``.csv`` files.
    """
    frames = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the combined frame reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(folder_path, filename)
        # skiprows=1 drops the first line of the file, so the header is taken
        # from the second line.
        df = pd.read_csv(file_path, skiprows=1)
        # Record which file each row came from (file name without extension).
        df['name'] = os.path.splitext(filename)[0]
        if add_team_name:
            df['team_name'] = team_name
        frames.append(df)

    # pd.concat() raises ValueError on an empty list; an empty frame lets
    # callers that concatenate several folders carry on.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Function to import data by year, adding "name" and "team_name" columns for player data
def import_data_by_year(year, base_dir="volleyball_csvs"):
    """Load one season's team-level and player-level CSVs.

    The season folder is ``<base_dir>/<year>``. Team data is read from its
    ``team_game_by_game`` sub-folder. Player data is read from every
    sub-directory of ``player_game_wise``; the sub-directory name is stored in
    a ``team_name`` column.

    Parameters
    ----------
    year : int or str
        Season to load; used as the folder name under ``base_dir``.
    base_dir : str, default "volleyball_csvs"
        Root folder that contains one sub-folder per season.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(team_base_df, player_base_df)``. ``player_base_df`` is an empty
        DataFrame when ``player_game_wise`` has no sub-directories.
    """
    base_path = f"{base_dir}/{year}"

    # Team-level data: one flat folder of CSV files.
    team_base_folder = os.path.join(base_path, 'team_game_by_game')
    team_base_df = combine_csvs_from_folder(team_base_folder)

    # Player-level data: one sub-folder per team.
    player_base_folder = os.path.join(base_path, 'player_game_wise')

    # Collect the per-team frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies all accumulated rows every iteration.
    player_frames = []
    # Sorted so the row order does not depend on the file system's listing order.
    for team_folder in sorted(os.listdir(player_base_folder)):
        team_folder_path = os.path.join(player_base_folder, team_folder)
        if not os.path.isdir(team_folder_path):
            continue  # skip stray files next to the team folders
        player_frames.append(
            combine_csvs_from_folder(
                team_folder_path,
                add_team_name=True,
                team_name=team_folder,
            )
        )

    if player_frames:
        player_base_df = pd.concat(player_frames, ignore_index=True)
    else:
        player_base_df = pd.DataFrame()

    return team_base_df, player_base_df

#### Combine all the years

In [3]:
def combine_all_years(output_dir='parquet_data', years=(2016, 2017, 2018, 2019)):
    """Combine several seasons of team and player data and save them as Parquet.

    For each season ``import_data_by_year`` is called, the resulting frames are
    stacked, the statistic columns are converted to numeric dtypes (values that
    cannot be parsed become NaN) and the two combined frames are written to
    ``<output_dir>/team_df_raw.parquet`` and
    ``<output_dir>/player_df_raw.parquet``.

    Parameters
    ----------
    output_dir : str, default 'parquet_data'
        Folder for the Parquet files; created if it does not exist.
    years : iterable of int, default (2016, 2017, 2018, 2019)
        Seasons to import, in the order they should appear in the output.

    Returns
    -------
    dict
        Maps each year as a string to ``[team_base_df, player_base_df]`` as
        imported for that season, e.g.
        ``{"2016": [team_2016, player_2016], "2017": [...], ...}``.
        The numeric conversion is applied to the combined frames only.

    Raises
    ------
    ValueError
        If ``years`` is empty.
    """
    years = list(years)
    if not years:
        raise ValueError("years must contain at least one season")

    data_dict_by_year = {}
    team_frames = []
    player_frames = []

    for year in years:
        team_base_df, player_base_df = import_data_by_year(year)
        data_dict_by_year[f"{year}"] = [team_base_df, player_base_df]
        team_frames.append(team_base_df)
        player_frames.append(player_base_df)

    # One concat per table instead of re-copying the accumulated frame on
    # every loop iteration.
    team_df_all_years = pd.concat(team_frames, ignore_index=True)
    player_df_all_years = pd.concat(player_frames, ignore_index=True)

    # Statistic columns may have been read as strings; coerce them to numbers.
    cols_to_convert = [
        'S', 'MP', 'Kills', 'Errors', 'Total Attacks', 'Hit Pct', 'Assists',
        'Aces', 'SErr', 'Digs', 'RErr', 'Block Solos', 'Block Assists',
        'BErr', 'PTS', 'BHE',
    ]
    team_df_all_years[cols_to_convert] = team_df_all_years[cols_to_convert].apply(
        pd.to_numeric, errors='coerce'
    )
    player_df_all_years[cols_to_convert] = player_df_all_years[cols_to_convert].apply(
        pd.to_numeric, errors='coerce'
    )

    # Save the combined DataFrames to Parquet files.
    os.makedirs(output_dir, exist_ok=True)  # ensure the output directory exists

    team_parquet_path = os.path.join(output_dir, 'team_df_raw.parquet')
    player_parquet_path = os.path.join(output_dir, 'player_df_raw.parquet')

    team_df_all_years.to_parquet(team_parquet_path, index=False)
    player_df_all_years.to_parquet(player_parquet_path, index=False)

    return data_dict_by_year

#### Save the data as parquet, into the parquet_data folder

In [4]:
# pandas uses one of the pyarrow or fastparquet libraries to be able to read or write Parquet files.
# However, these libraries do not come with the default installation of pandas. So we have to download it first.
! pip install pyarrow
import pyarrow
data_dict_by_year =combine_all_years()


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable


#### Convert from Parquet to DataFrame





You can quickly load a Parquet file into a DataFrame using pandas.
Example (use forward slashes in the path: in a regular Python string, the `\t` in `'parquet_data\team_...'` would be read as a tab character):

team_df_raw = pd.read_parquet('parquet_data/team_df_raw.parquet')
player_df_raw = pd.read_parquet('parquet_data/player_df_raw.parquet')

In [5]:
# Read the team-level and player-level Parquet files back into DataFrames.
team_df_raw, player_df_raw = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'parquet_data/team_df_raw.parquet',
        'parquet_data/player_df_raw.parquet',
    )
)

In [6]:
# Preview the team-level frame read back from Parquet.
team_df_raw

Unnamed: 0,Date,Opponent,Result,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,...,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 19,name,Attend,Unnamed: 20
0,08/26/2016,"Prairie View @ Waco, Texas",W 3 - 0,3.0,,46.0,13.0,107.0,0.308,40.0,...,3.0,,2.0,3.0,53.0,,,A&M-Corpus Christi (Southland),,
1,08/27/2016,@ Baylor,L 1 - 3,4.0,,37.0,20.0,119.0,0.143,32.0,...,5.0,2.0,18.0,,53.0,,,A&M-Corpus Christi (Southland),,
2,08/27/2016,"UTRGV @ Waco, TX",W 3 - 2,5.0,,76.0,26.0,,0.231,62.0,...,8.0,2.0,16.0,2.0,90.0,,,A&M-Corpus Christi (Southland),,
3,09/03/2016,"San Diego @ Madison, Wis.",L 0 - 3,3.0,,30.0,16.0,94.0,0.149,29.0,...,1.0,4.0,15.0,1.0,42.5,1.0,,A&M-Corpus Christi (Southland),,
4,09/04/2016,@ Wisconsin,L 0 - 3,3.0,,28.0,23.0,104.0,0.048,27.0,...,3.0,,,1.0,29.0,1.0,,A&M-Corpus Christi (Southland),,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40156,11/03/2019,@ Oakland,L 0 - 3,3.0,,26.0,22.0,119.0,0.034,24.0,...,4.0,1.0,6.0,3.0,35.0,1.0,,Youngstown St. (Horizon),,
40157,11/06/2019,IUPUI,L 2 - 3,5.0,,56.0,20.0,149.0,0.242,53.0,...,8.0,1.0,12.0,,71.0,2.0,,Youngstown St. (Horizon),,
40158,11/08/2019,@ Green Bay,L 0 - 3,3.0,,33.0,19.0,110.0,0.127,31.0,...,,,12.0,,43.0,,,Youngstown St. (Horizon),,
40159,11/15/2019,Wright St.,L 0 - 3,3.0,,28.0,19.0,113.0,0.080,27.0,...,6.0,,6.0,2.0,32.0,3.0,,Youngstown St. (Horizon),,


In [7]:
# Preview the player-level frame read back from Parquet.
player_df_raw

Unnamed: 0,Date,Opponent,Result,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,...,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 19,name,team_name,Attend,Unnamed: 20
0,08/26/2016,"Prairie View @ Waco, Texas",W 3 - 0,2.0,1.0,4.0,1.0,8.0,0.375,1.0,...,,,,5.0,,,"Carlson, Morgan",A&M-Corpus Christi (Southland),,
1,08/27/2016,@ Baylor,L 1 - 3,4.0,1.0,7.0,3.0,19.0,0.211,,...,,1.0,,8.5,,,"Carlson, Morgan",A&M-Corpus Christi (Southland),,
2,08/27/2016,"UTRGV @ Waco, TX",W 3 - 2,5.0,1.0,11.0,8.0,31.0,0.097,,...,,,,13.0,,,"Carlson, Morgan",A&M-Corpus Christi (Southland),,
3,09/03/2016,"San Diego @ Madison, Wis.",L 0 - 3,3.0,1.0,4.0,4.0,15.0,,1.0,...,,2.0,,6.0,,,"Carlson, Morgan",A&M-Corpus Christi (Southland),,
4,09/04/2016,@ Wisconsin,L 0 - 3,3.0,1.0,5.0,3.0,14.0,0.143,,...,,,,5.0,,,"Carlson, Morgan",A&M-Corpus Christi (Southland),,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633963,11/03/2019,@ Oakland,L 0 - 3,3.0,1.0,0.0,0.0,0.0,,1.0,...,,,,,,,"Thompson, Margaux",Youngstown St. (Horizon),,
633964,11/06/2019,IUPUI,L 2 - 3,5.0,1.0,0.0,0.0,0.0,,2.0,...,,,,2.0,,,"Thompson, Margaux",Youngstown St. (Horizon),,
633965,11/08/2019,@ Green Bay,L 0 - 3,3.0,1.0,0.0,0.0,0.0,,1.0,...,,,,1.0,,,"Thompson, Margaux",Youngstown St. (Horizon),,
633966,11/15/2019,Wright St.,L 0 - 3,3.0,1.0,0.0,0.0,0.0,,1.0,...,,,,,,,"Thompson, Margaux",Youngstown St. (Horizon),,
