In [2]:
import os
import pandas as pd

In [3]:
# Function to read all CSVs in a directory and combine them into one DataFrame
def combine_csvs_from_folder(folder_path):
    """Read every ``.csv`` entry found directly in ``folder_path`` into one DataFrame.

    Each file is read with ``skiprows=1``: the first physical line of the file
    is discarded and the *second* line supplies the column names.

    Files are processed in sorted filename order so the row order of the
    result is reproducible (``os.listdir`` returns entries in arbitrary order).

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan. Only entries listed directly in this directory are
        considered; nested folders are not walked.

    Returns
    -------
    pandas.DataFrame
        The contents of all CSV files stacked vertically with a fresh
        0..n-1 index. An empty DataFrame is returned when the folder contains
        no ``.csv`` files.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``folder_path`` cannot be listed.
    """
    frames = [
        # skiprows=1 drops the first line, so the header is taken from line 2.
        pd.read_csv(os.path.join(folder_path, filename), skiprows=1)
        for filename in sorted(os.listdir(folder_path))
        if filename.endswith('.csv')
    ]
    if not frames:
        # pd.concat raises ValueError on an empty list; an empty frame lets
        # callers keep going when a folder simply has no CSVs.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [4]:
def import_data_by_year(year):
    """Load the team-level and player-level CSV data for one year.

    The files are looked up relative to the current working directory:

    * ``volleyball_csvs/<year>/team_game_by_game/*.csv``
    * ``volleyball_csvs/<year>/player_game_wise/<team folder>/*.csv``

    Parameters
    ----------
    year : int or str
        Interpolated into the base path ``volleyball_csvs/<year>``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(team_base_df, player_base_df)``. ``player_base_df`` is an empty
        DataFrame when ``player_game_wise`` contains no sub-folders.
    """
    base_path = f"volleyball_csvs/{year}"

    # Team-level data: every CSV directly inside team_game_by_game.
    team_base_folder = os.path.join(base_path, 'team_game_by_game')
    team_base_df = combine_csvs_from_folder(team_base_folder)

    # Player-level data: one sub-folder per team inside player_game_wise.
    player_base_folder = os.path.join(base_path, 'player_game_wise')
    player_frames = []
    # Sorted so the row order does not depend on the order os.listdir yields.
    for team_folder in sorted(os.listdir(player_base_folder)):
        team_folder_path = os.path.join(player_base_folder, team_folder)
        if os.path.isdir(team_folder_path):  # skip stray files next to the team folders
            player_frames.append(combine_csvs_from_folder(team_folder_path))

    # Concatenate once at the end instead of re-copying a growing frame on
    # every iteration.
    if player_frames:
        player_base_df = pd.concat(player_frames, ignore_index=True)
    else:
        player_base_df = pd.DataFrame()

    return team_base_df, player_base_df

In [5]:
def isnull(years=(2016, 2017, 2018, 2019)):
    """Load the data for each year and tabulate per-column NaN counts.

    Parameters
    ----------
    years : iterable of int, optional
        Years to load via ``import_data_by_year``. Defaults to 2016-2019,
        which is what a call without arguments has always loaded.

    Returns
    -------
    team_base_nan_df : pandas.DataFrame
        One row per year (index named ``'Year'``), one column per column seen
        in any year's team data; each cell is the number of NaN values in that
        column for that year. A NaN cell means the column is absent from that
        year's data, not that it has no missing values.
    player_base_nan_df : pandas.DataFrame
        Same layout for the player data.
    data_dict : dict
        Maps ``str(year)`` to ``[team_base_df, player_base_df]`` so the loaded
        frames can be inspected without reading the files again.
    """
    years = list(years)  # materialize once: iterated below and reused as the index
    team_nan_records = []
    player_nan_records = []
    data_dict = {}

    for year in years:
        team_base_df, player_base_df = import_data_by_year(year)
        data_dict[f"{year}"] = [team_base_df, player_base_df]

        # isna().sum() gives a Series of NaN counts keyed by column name.
        team_nan_records.append(team_base_df.isna().sum().to_dict())
        player_nan_records.append(player_base_df.isna().sum().to_dict())

    # Build the index from the same list that drove the loop so the row labels
    # can never drift out of sync with the rows.
    team_base_nan_df = pd.DataFrame(team_nan_records, index=pd.Index(years, name='Year'))
    player_base_nan_df = pd.DataFrame(player_nan_records, index=pd.Index(years, name='Year'))

    return team_base_nan_df, player_base_nan_df, data_dict

In [6]:
# Display configuration for the cells below: put the row limit back to the
# pandas default, then lift the column cap so wide frames show every column.
pd.reset_option('display.max_rows')
pd.set_option('display.max_columns', None)

In [7]:
# Load every year once. data_dict maps the year as a string (e.g. "2016")
# to the pair [team_base_df, player_base_df] for that year.
team_base_nan_df, player_base_nan_df, data_dict=isnull()

In [8]:
# Per-column NaN counts for the team data, one row per year.
# For scale: each year holds roughly 10,000 rows in total.
team_base_nan_df

Unnamed: 0_level_0,Date,Opponent,Result,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 19,Attend,Unnamed: 20
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2016,0,0,0,0,10018,0,0,0,43,0,312,35,0,319,2567,183,2765,0,4072,10018.0,,
2017,0,0,0,1,10107,1,1,1,24,4,339,33,1,344,2601,136,2790,1,4113,10107.0,,
2018,0,0,0,20,10047,20,20,20,64,20,304,53,20,308,2589,153,2854,20,4091,,10047.0,10047.0
2019,0,0,0,22,4547,22,22,22,41,24,270,61,24,278,2583,190,3146,464,4398,,9989.0,9989.0


In [9]:
# Per-column NaN counts for the player data, one row per year.
# For scale: each year holds roughly 160,000 rows in total.
player_base_nan_df

Unnamed: 0_level_0,Date,Opponent,Result,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 19,Attend,Unnamed: 20
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2016,0,0,0,46935,46935,46935,46935,46935,88270,111187,125610,112716,71713,130343,142175,106196,141378,73489,146952,155291.0,,
2017,0,0,0,48974,48974,48974,48974,48974,90624,114006,128406,115171,73685,132906,145182,108511,144463,75833,149849,158256.0,,
2018,0,0,0,50042,50042,50042,50042,50042,92220,115218,129379,116533,74558,134073,147141,110487,146616,76805,151684,,160093.0,160093.0
2019,0,0,0,50485,50427,50427,50427,50427,83661,115678,128665,115846,74988,134813,147156,111033,147446,76743,152417,,160328.0,160328.0


In [10]:
# 2016 team data: element 0 of the [team, player] pair stored under "2016".
data_dict.get("2016")[0]

Unnamed: 0,Date,Opponent,Result,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 19
0,08/26/2016,"Prairie View @ Waco, Texas",W 3 - 0,3,,46,13,107,0.308,40,6,11,51,3,,2.0,3.0,53.0,,
1,08/27/2016,@ Baylor,L 1 - 3,4,,37,20,119,0.143,32,5,6,46,5,2.0,18.0,,53.0,,
2,08/27/2016,"UTRGV @ Waco, TX",W 3 - 2,5,,76,26,216/,0.231,62,4,12,88,8,2.0,16.0,2.0,90.0,,
3,09/03/2016,"San Diego @ Madison, Wis.",L 0 - 3,3,,30,16,94,0.149,29,1,3,32,1,4.0,15.0,1.0,42.5,1.0,
4,09/04/2016,@ Wisconsin,L 0 - 3,3,,28,23,104,0.048,27,1,1,33,3,,,1.0,29.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10013,10/29/2016,Valparaiso,L 1 - 3,4,,47,30,142,0.120,47,4,8,57,6,3,22,1,65.0,2.0,
10014,11/04/2016,@ Oakland,L 0 - 3,3,,25,19,105,0.057,23,3,4,39,5,,16,3,36.0,1.0,
10015,11/06/2016,@ Cleveland St.,L 0 - 3,3,,38,20,112,0.161,36,5,4,34,3,1,4,3,46.0,,
10016,11/09/2016,Northern Ky.,L 1 - 3,4,,51,19,144,0.222,49,7,6,61,7,,18,3,67.0,,


In [11]:
# 2016 player data: element 1 of the [team, player] pair stored under "2016".
data_dict.get("2016")[1]

Unnamed: 0,Date,Opponent,Result,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 19
0,08/26/2016,"Prairie View @ Waco, Texas",W 3 - 0,2.0,1.0,4.0,1.0,8.0,0.375,1.0,1.0,,3.0,,,,,5.0,,
1,08/27/2016,@ Baylor,L 1 - 3,4.0,1.0,7.0,3.0,19.0,0.211,,1.0,1.0,4.0,,,1.0,,8.5,,
2,08/27/2016,"UTRGV @ Waco, TX",W 3 - 2,5.0,1.0,11.0,8.0,31.0,0.097,,2.0,4.0,2.0,,,,,13.0,,
3,09/03/2016,"San Diego @ Madison, Wis.",L 0 - 3,3.0,1.0,4.0,4.0,15.0,,1.0,1.0,1.0,,,,2.0,,6.0,,
4,09/04/2016,@ Wisconsin,L 0 - 3,3.0,1.0,5.0,3.0,14.0,0.143,,,,,,,,,5.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155286,10/29/2016,Valparaiso,L 1 - 3,,,,,,,,,,,,,,,,,
155287,11/04/2016,@ Oakland,L 0 - 3,,,,,,,,,,,,,,,,,
155288,11/06/2016,@ Cleveland St.,L 0 - 3,,,,,,,,,,,,,,,,,
155289,11/09/2016,Northern Ky.,L 1 - 3,,,,,,,,,,,,,,,,,


In [12]:
# 2017 team data: element 0 of the [team, player] pair stored under "2017".
data_dict.get("2017")[0]

Unnamed: 0,Date,Opponent,Result,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 19
0,08/25/2017,"Mississippi St. @ Dallas, Texas",L 2 - 3,5.0,,61,25,153,0.235,58,5.0,5,75,10,2.0,10.0,1.0,73.0,1.0,
1,08/25/2017,"Rice @ Dallas, Texas",W 3 - 1,4.0,,48,20,154,0.182,45,4.0,5,73,1,2.0,4.0,,56.0,2.0,
2,08/26/2017,@ SMU,L 0 - 3,3.0,,37,12,119,0.210,35,4.0,3,44,5,1.0,10.0,1.0,47.0,1.0,
3,09/01/2017,"San Diego St. @ East Lansing, Mich.",L 0 - 3,3.0,,30,12,88,0.205,30,2.0,6,31,8,1.0,12.0,3.0,39.0,,
4,09/02/2017,"Marshall @ East Lansing, Mich.",W 3 - 0,3.0,,45,11,102,0.333,42,5.0,3,49,5,1.0,4.0,1.0,53.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10102,10/28/2017,@ Northern Ky.,L 0 - 3,3.0,,40,14,112,0.232,38,5.0,4,63,9,5.0,10,,55.0,,
10103,11/04/2017,@ Cleveland St.,L 0 - 3,3.0,,29,26,127,0.024,27,1.0,2,51,5,1.0,4,4,33.0,2,
10104,11/05/2017,Oakland,L 2 - 3,5.0,,60,25,190,0.184,56,10.0,6,69,4,,12,2,76.0,2,
10105,11/08/2017,@ Wright St.,L 0 - 3,3.0,,26,14,96,0.125,23,4.0,8,36,8,4.0,4,2,36.0,,


In [13]:
# 2017 player data: element 1 of the [team, player] pair stored under "2017".
data_dict.get("2017")[1]

Unnamed: 0,Date,Opponent,Result,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 19
0,08/25/2017,"Mississippi St. @ Dallas, Texas",L 2 - 3,,,,,,,,,,,,,,,,,
1,08/25/2017,"Rice @ Dallas, Texas",W 3 - 1,,,,,,,,,,,,,,,,,
2,08/26/2017,@ SMU,L 0 - 3,,,,,,,,,,,,,,,,,
3,09/01/2017,"San Diego St. @ East Lansing, Mich.",L 0 - 3,1.0,1.0,0.0,0.0,0.0,,5.0,,,1.0,,,,,,,
4,09/02/2017,"Marshall @ East Lansing, Mich.",W 3 - 0,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158251,10/28/2017,@ Northern Ky.,L 0 - 3,,,,,,,,,,,,,,,,,
158252,11/04/2017,@ Cleveland St.,L 0 - 3,,,,,,,,,,,,,,,,,
158253,11/05/2017,Oakland,L 2 - 3,,,,,,,,,,,,,,,,,
158254,11/08/2017,@ Wright St.,L 0 - 3,,,,,,,,,,,,,,,,,


In [14]:
# 2018 team data: element 0 of the [team, player] pair stored under "2018".
data_dict.get("2018")[0]

Unnamed: 0,Date,Opponent,Result,Attend,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 20
0,08/24/2018,@ South Carolina,L 0 - 3,,3.0,,27,16,69,0.159,24,4,9,31,6,3.0,4,,36.0,1.0,
1,08/24/2018,"Clemson @ Columbia, S.C.",L 0 - 3,,3.0,,25,22,93,0.032,25,2,9,29,4,1.0,4,2.0,30.0,1.0,
2,08/25/2018,"ETSU @ Columbia, S.C.",L 0 - 3,,3.0,,41,16,111,0.225,38,5,6,37,4,,4,2.0,48.0,,
3,08/28/2018,UTRGV,L 2 - 3,,5.0,,54,29,175,0.143,52,4,8,73,2,,8,,62.0,1.0,
4,08/31/2018,@ LSU,L 0 - 3,,3.0,,26,15,103,0.107,24,2,7,28,6,1.0,2,,30.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10042,10/27/2018,Milwaukee,L 1 - 3,,4.0,,49,28,161,0.130,49,1.0,6,71,4,,10,4,55.0,1.0,
10043,10/31/2018,@ Northern Ky.,L 0 - 3,,3.0,,24,16,101,0.079,23,3.0,5,38,7,1.0,8,3,32.0,,
10044,11/04/2018,UIC,L 1 - 3,,4.0,,45,25,166,0.120,45,3.0,3,70,5,3.0,14,3,58.0,1.0,
10045,11/09/2018,@ IUPUI,L 0 - 3,,3.0,,33,22,131,0.084,32,3.0,4,66,5,1.0,8,,41.0,1.0,


In [15]:
# 2018 player data: element 1 of the [team, player] pair stored under "2018".
data_dict.get("2018")[1]

Unnamed: 0,Date,Opponent,Result,Attend,S,MP,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 20
0,08/24/2018,@ South Carolina,L 0 - 3,,1.0,1.0,0.0,0.0,0.0,,,,,,2.0,,,,,,
1,08/24/2018,"Clemson @ Columbia, S.C.",L 0 - 3,,3.0,1.0,0.0,0.0,0.0,,,,,2.0,,,,,,,
2,08/25/2018,"ETSU @ Columbia, S.C.",L 0 - 3,,3.0,1.0,1.0,0.0,1.0,1.0,1.0,,1.0,4.0,2.0,,,,1.0,,
3,08/28/2018,UTRGV,L 2 - 3,,3.0,1.0,0.0,0.0,0.0,,,,,3.0,,,,,,,
4,08/31/2018,@ LSU,L 0 - 3,,3.0,1.0,0.0,0.0,1.0,,,,,5.0,2.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160088,10/27/2018,Milwaukee,L 1 - 3,,4.0,1.0,0.0,0.0,0.0,,3.0,,,11.0,3.0,,,,,,
160089,10/31/2018,@ Northern Ky.,L 0 - 3,,3.0,1.0,0.0,0.0,0.0,,1.0,,,4.0,5.0,,,,,,
160090,11/04/2018,UIC,L 1 - 3,,4.0,1.0,0.0,0.0,0.0,,,,,11.0,1.0,,,,,,
160091,11/09/2018,@ IUPUI,L 0 - 3,,3.0,1.0,1.0,0.0,2.0,0.5,1.0,,,15.0,3.0,,,,1.0,,


In [16]:
# 2019 team data: element 0 of the [team, player] pair stored under "2019".
data_dict.get("2019")[0]

Unnamed: 0,Date,Opponent,Result,S,MP,Attend,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 20
0,08/30/2019,"UT Arlington @ Houston, TX",L 2 - 3,5.0,1.0,,58,22,156,0.231,45,10,10,67,5,2.0,4.0,,72.0,1.0,
1,08/30/2019,@ Houston,W 3 - 0,3.0,1.0,,49,16,112,0.295,48,4,6,47,7,,2.0,,,,
2,08/31/2019,"Arizona St. @ Houston, Texas",L 1 - 3,4.0,,,46,17,147,0.197,37,2,5,64,12,1.0,,,49.0,3.0,
3,09/04/2019,UTRGV,L 2 - 3,5.0,1.0,,61,26,167,0.210,57,6,9,60,7,4.0,6.0,,74.0,2.0,
4,09/06/2019,"New Mexico @ Las Cruces, NM",W 3 - 0,3.0,1.0,,48,12,104,0.346,41,2,7,51,4,,6.0,,53.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9984,11/03/2019,@ Oakland,L 0 - 3,3.0,,,26,22,119,0.034,24,5,7,40,4.0,1.0,6,3,35.0,1.0,
9985,11/06/2019,IUPUI,L 2 - 3,5.0,,,56,20,149,0.242,53,8,8,63,8.0,1.0,12,,71.0,2.0,
9986,11/08/2019,@ Green Bay,L 0 - 3,3.0,,,33,19,110,0.127,31,4,8,45,,,12,,43.0,,
9987,11/15/2019,Wright St.,L 0 - 3,3.0,,,28,19,113,0.080,27,1,4,43,6.0,,6,2,32.0,3.0,


In [17]:
# 2019 player data: element 1 of the [team, player] pair stored under "2019".
data_dict.get("2019")[1]

Unnamed: 0,Date,Opponent,Result,S,MP,Attend,Kills,Errors,Total Attacks,Hit Pct,Assists,Aces,SErr,Digs,RErr,Block Solos,Block Assists,BErr,PTS,BHE,Unnamed: 20
0,08/30/2019,"UT Arlington @ Houston, TX",L 2 - 3,5.0,1.0,,0.0,0.0,1.0,0.000,3.0,3.0,2.0,20.0,,,,,3.0,,
1,08/30/2019,@ Houston,W 3 - 0,3.0,1.0,,2.0,0.0,2.0,1.000,6.0,3.0,,17.0,1.0,,,,5.0,,
2,08/31/2019,"Arizona St. @ Houston, Texas",L 1 - 3,4.0,1.0,,2.0,0.0,6.0,0.333,3.0,1.0,2.0,21.0,5.0,,,,3.0,,
3,09/04/2019,UTRGV,L 2 - 3,5.0,1.0,,1.0,0.0,1.0,1.000,4.0,4.0,3.0,27.0,,,,,5.0,,
4,09/06/2019,"New Mexico @ Las Cruces, NM",W 3 - 0,3.0,1.0,,0.0,0.0,1.0,0.000,,1.0,2.0,14.0,1.0,,,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160323,11/03/2019,@ Oakland,L 0 - 3,3.0,1.0,,0.0,0.0,0.0,,1.0,,,13.0,1.0,,,,,,
160324,11/06/2019,IUPUI,L 2 - 3,5.0,1.0,,0.0,0.0,0.0,,2.0,2.0,,19.0,3.0,,,,2.0,,
160325,11/08/2019,@ Green Bay,L 0 - 3,3.0,1.0,,0.0,0.0,0.0,,1.0,1.0,,12.0,,,,,1.0,,
160326,11/15/2019,Wright St.,L 0 - 3,3.0,1.0,,0.0,0.0,0.0,,1.0,,1.0,13.0,1.0,,,,,,


There are too many null values; the data needs to be cleaned up.