In [1]:
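# NOTE: the tree below lists .csv names, but the cells in this notebook read the matching .parquet files.
# vct_2024/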
#     agents/
#         agents_pick_rates.csv
#     ids/
#         players_ids.csv
#         teams_ids.csv
#         tournaments_stages_match_types_ids.csv
#         tournaments_stages_matches_games_ids.csv
#     matches/
#         maps_played.csv
#         maps_scores.csv
#         overview.csv
#         scores.csv
#     players_stats/
#         players_stats.csv

In [21]:
import pandas as pd
from pathlib import Path

In [None]:
# Load the raw players_stats table.
df = pd.read_parquet("../data/raw/vct_2024/players_stats/players_stats.parquet")

In [12]:
# Show the column labels of the frame just loaded.
df.columns

Index(['Tournament', 'Stage', 'Match Type', 'Player', 'Teams', 'Agents',
       'Rounds Played', 'Rating', 'Average Combat Score', 'Kills:Deaths',
       'Kill, Assist, Trade, Survive %', 'Average Damage Per Round',
       'Kills Per Round', 'Assists Per Round', 'First Kills Per Round',
       'First Deaths Per Round', 'Headshot %', 'Clutch Success %',
       'Clutches (won/played)', 'Maximum Kills in a Single Map', 'Kills',
       'Deaths', 'Assists', 'First Kills', 'First Deaths'],
      dtype='str')

In [13]:
# Map every raw display header of players_stats onto a snake_case column name.
column_map = {
    "Tournament": "tournament_name",
    "Stage": "stage_name",
    "Match Type": "match_type_name",
    "Player": "player_name",
    "Teams": "team_name",
    "Agents": "agent_name",
    "Rounds Played": "rounds_played",
    "Rating": "rating",
    "Average Combat Score": "average_combat_score",
    "Kills:Deaths": "kills_deaths_ratio",
    "Kill, Assist, Trade, Survive %": "kill_assist_trade_survive_percentage",
    "Average Damage Per Round": "average_damage_per_round",
    "Kills Per Round": "kills_per_round",
    "Assists Per Round": "assists_per_round",
    "First Kills Per Round": "first_kills_per_round",
    "First Deaths Per Round": "first_deaths_per_round",
    "Headshot %": "headshot_percentage",
    "Clutch Success %": "clutch_success_percentage",
    "Clutches (won/played)": "clutches_won_played",
    "Maximum Kills in a Single Map": "maximum_kills_single_map",
    "Kills": "kills",
    "Deaths": "deaths",
    "Assists": "assists",
    "First Kills": "first_kills",
    "First Deaths": "first_deaths",
}
df = df.rename(columns=column_map)

In [14]:
# Show the column labels after the rename.
df.columns

Index(['tournament_name', 'stage_name', 'match_type_name', 'player_name',
       'team_name', 'agent_name', 'rounds_played', 'rating',
       'average_combat_score', 'kills_deaths_ratio',
       'kill_assist_trade_survive_percentage', 'average_damage_per_round',
       'kills_per_round', 'assists_per_round', 'first_kills_per_round',
       'first_deaths_per_round', 'headshot_percentage',
       'clutch_success_percentage', 'clutches_won_played',
       'maximum_kills_single_map', 'kills', 'deaths', 'assists', 'first_kills',
       'first_deaths'],
      dtype='str')

In [15]:
# Strip the trailing '%' from each percentage column and rescale it to a 0-1 fraction.
for column in (
    'kill_assist_trade_survive_percentage',
    'headshot_percentage',
    'clutch_success_percentage',
):
    df[column] = df[column].str.rstrip('%').astype('float') / 100.0

In [16]:
# Missing entries become '0/0' so that every row splits into two integer parts.
clutch_text = df['clutches_won_played'].fillna('0/0')
clutch_parts = clutch_text.str.split('/', expand=True).astype('int')
df[['clutches_won', 'clutches_played']] = clutch_parts
# The combined text column is no longer needed once both counts exist.
df = df.drop(columns=['clutches_won_played'])

In [17]:
# Rows without a clutch success value are recorded as 0.0.
success_rate = df['clutch_success_percentage']
df['clutch_success_percentage'] = success_rate.fillna(0.0)

In [22]:
# Summarise dtypes and non-null counts after the conversions above.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 15030 entries, 0 to 15029
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   tournament_name                       15030 non-null  str    
 1   stage_name                            15030 non-null  str    
 2   match_type_name                       15030 non-null  str    
 3   player_name                           15030 non-null  str    
 4   team_name                             15030 non-null  str    
 5   agent_name                            15030 non-null  str    
 6   rounds_played                         15030 non-null  int64  
 7   rating                                11052 non-null  float64
 8   average_combat_score                  15030 non-null  int64  
 9   kills_deaths_ratio                    15030 non-null  float64
 10  kill_assist_trade_survive_percentage  11052 non-null  float64
 11  average_damage_per_round  

In [23]:
# Output directory for the cleaned players_stats data; created (with parents) if it is missing.
path = Path("../data/clean/vct_2024/players_stats")
path.mkdir(parents=True, exist_ok=True)


In [24]:
# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "players_stats.parquet", index=False)

In [25]:
# Load the raw agents_pick_rates table (rebinds `df`).
df = pd.read_parquet("../data/raw/vct_2024/agents/agents_pick_rates.parquet")

In [26]:
# Show the column labels of the frame just loaded.
df.columns

Index(['Tournament', 'Stage', 'Match Type', 'Map', 'Agent', 'Pick Rate'], dtype='str')

In [27]:
# Map the raw agents_pick_rates headers onto snake_case column names.
column_map = {
    "Tournament": "tournament_name",
    "Stage": "stage_name",
    "Match Type": "match_type_name",
    "Map": "map_name",
    "Agent": "agent_name",
    "Pick Rate": "pick_rate",
}
df = df.rename(columns=column_map)

In [28]:
# Show the column labels after the rename.
df.columns

Index(['tournament_name', 'stage_name', 'match_type_name', 'map_name',
       'agent_name', 'pick_rate'],
      dtype='str')

In [31]:
# Strip the trailing '%' and rescale the pick rate to a 0-1 fraction.
pick_rate_text = df['pick_rate'].str.rstrip('%')
df['pick_rate'] = pick_rate_text.astype('float') / 100.0

In [32]:
# Summarise dtypes and non-null counts after the pick-rate conversion.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 24840 entries, 0 to 24839
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tournament_name  24840 non-null  str    
 1   stage_name       24840 non-null  str    
 2   match_type_name  24840 non-null  str    
 3   map_name         24840 non-null  str    
 4   agent_name       24840 non-null  str    
 5   pick_rate        24840 non-null  float64
dtypes: float64(1), str(5)
memory usage: 2.7 MB


In [34]:
# Output directory for the cleaned agents data; created (with parents) if it is missing.
path = Path("../data/clean/vct_2024/agents")
path.mkdir(parents=True, exist_ok=True)

In [35]:
# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "agents_pick_rates.parquet", index=False)

In [37]:
# Load the raw maps_played table (rebinds `df`).
df = pd.read_parquet("../data/raw/vct_2024/matches/maps_played.parquet")

In [41]:
# Show the current column labels.
df.columns

Index(['tournament_name', 'stage_name', 'match_type_name', 'match_name',
       'map_name'],
      dtype='str')

In [40]:
# Map the raw maps_played headers onto snake_case column names.
column_map = {
    "Tournament": "tournament_name",
    "Stage": "stage_name",
    "Match Type": "match_type_name",
    "Match Name": "match_name",
    "Map": "map_name",
}
df = df.rename(columns=column_map)

In [43]:
# Output directory for the cleaned matches data; created (with parents) if it is missing.
path = Path("../data/clean/vct_2024/matches")
path.mkdir(parents=True, exist_ok=True)

In [44]:
# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "maps_played.parquet", index=False)

In [None]:
# vct_2024/
#     agents/
#         agents_pick_rates.csv-------------------
#     ids/
#         players_ids.csv           
#         teams_ids.csv
#         tournaments_stages_match_types_ids.csv
#         tournaments_stages_matches_games_ids.csv
#     matches/
#         maps_played.csv------------------
#         maps_scores.csv------------------
#         overview.csv---------------------
#         scores.csv-----------------------
#     players_stats/
#         players_stats.csv-----------------

In [45]:
# Load the raw maps_scores table (rebinds `df`).
df = pd.read_parquet("../data/raw/vct_2024/matches/maps_scores.parquet")

In [46]:
# Show the column labels of the frame just loaded.
df.columns

Index(['Tournament', 'Stage', 'Match Type', 'Match Name', 'Map', 'Team A',
       'Team A Score', 'Team A Attacker Score', 'Team A Defender Score',
       'Team A Overtime Score', 'Team B', 'Team B Score',
       'Team B Attacker Score', 'Team B Defender Score',
       'Team B Overtime Score', 'Duration'],
      dtype='str')

In [47]:
# Map the raw maps_scores headers onto snake_case column names.
column_map = {
    "Tournament": "tournament_name",
    "Stage": "stage_name",
    "Match Type": "match_type_name",
    "Match Name": "match_name",
    "Map": "map_name",
    "Team A": "team_a_name",
    "Team A Score": "team_a_score",
    "Team A Attacker Score": "team_a_attacker_score",
    "Team A Defender Score": "team_a_defender_score",
    "Team A Overtime Score": "team_a_overtime_score",
    "Team B": "team_b_name",
    "Team B Score": "team_b_score",
    "Team B Attacker Score": "team_b_attacker_score",
    "Team B Defender Score": "team_b_defender_score",
    "Team B Overtime Score": "team_b_overtime_score",
    "Duration": "duration",
}
df = df.rename(columns=column_map)

In [48]:
# Summarise dtypes and non-null counts after the rename.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1104 entries, 0 to 1103
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   tournament_name        1104 non-null   str    
 1   stage_name             1104 non-null   str    
 2   match_type_name        1104 non-null   str    
 3   match_name             1104 non-null   str    
 4   map_name               1104 non-null   str    
 5   team_a_name            1104 non-null   str    
 6   team_a_score           1104 non-null   int64  
 7   team_a_attacker_score  1104 non-null   int64  
 8   team_a_defender_score  1104 non-null   int64  
 9   team_a_overtime_score  112 non-null    float64
 10  team_b_name            1104 non-null   str    
 11  team_b_score           1104 non-null   int64  
 12  team_b_attacker_score  1104 non-null   int64  
 13  team_b_defender_score  1104 non-null   int64  
 14  team_b_overtime_score  112 non-null    float64
 15  duration       

In [54]:
def _with_hours(value):
    """Prefix "00:" to strings containing exactly one colon; return anything else unchanged."""
    if isinstance(value, str) and value.count(":") == 1:
        return "00:" + value
    return value


# Two-part durations gain a leading "00:" so every string has three colon-separated fields.
df['duration'] = df['duration'].apply(_with_hours)

In [55]:
# Parse the duration column into timedeltas.
parsed_duration = pd.to_timedelta(df['duration'])
df['duration'] = parsed_duration

In [56]:
# Keep the duration as a plain number of seconds and drop the timedelta column.
duration_seconds = df['duration'].dt.total_seconds()
df = df.assign(duration_seconds=duration_seconds).drop(columns=['duration'])

In [58]:
# Declare the destination explicitly so this cell does not depend on whatever
# `path` was left pointing at by an earlier section of the notebook.
path = Path("../data/clean/vct_2024/matches")
path.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "maps_scores.parquet", index=False)

In [59]:
# Load the raw overview table (rebinds `df`).
df = pd.read_parquet("../data/raw/vct_2024/matches/overview.parquet")

In [60]:
# Show the column labels of the frame just loaded.
df.columns

Index(['Tournament', 'Stage', 'Match Type', 'Match Name', 'Map', 'Player',
       'Team', 'Agents', 'Rating', 'Average Combat Score', 'Kills', 'Deaths',
       'Assists', 'Kills - Deaths (KD)', 'Kill, Assist, Trade, Survive %',
       'Average Damage Per Round', 'Headshot %', 'First Kills', 'First Deaths',
       'Kills - Deaths (FKD)', 'Side'],
      dtype='str')

In [61]:
# Map the raw overview headers onto snake_case column names.
# NOTE(review): the raw labels "Kills - Deaths (KD)" and "Kills - Deaths (FKD)" read as
# differences rather than ratios — confirm, since the *_ratio names may mislead downstream users.
column_map = {
    "Tournament": "tournament_name",
    "Stage": "stage_name",
    "Match Type": "match_type_name",
    "Match Name": "match_name",
    "Map": "map_name",
    "Player": "player_name",
    "Team": "team_name",
    "Agents": "agent_name",
    "Rating": "rating",
    "Average Combat Score": "average_combat_score",
    "Kills": "kills",
    "Deaths": "deaths",
    "Assists": "assists",
    "Kills - Deaths (KD)": "kills_deaths_ratio",
    "Kill, Assist, Trade, Survive %": "kill_assist_trade_survive_percentage",
    "Average Damage Per Round": "average_damage_per_round",
    "Headshot %": "headshot_percentage",
    "First Kills": "first_kills",
    "First Deaths": "first_deaths",
    "Kills - Deaths (FKD)": "first_kills_deaths_ratio",
    "Side": "side",
}
df = df.rename(columns=column_map)

In [72]:
# Summarise dtypes and non-null counts of the overview frame.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 46152 entries, 0 to 46151
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype   
---  ------                                --------------  -----   
 0   tournament_name                       46152 non-null  str     
 1   stage_name                            46152 non-null  str     
 2   match_type_name                       46152 non-null  str     
 3   match_name                            46152 non-null  str     
 4   map_name                              46152 non-null  str     
 5   player_name                           46152 non-null  str     
 6   team_name                             46152 non-null  str     
 7   agent_name                            46152 non-null  str     
 8   rating                                34062 non-null  float64 
 9   average_combat_score                  38083 non-null  float64 
 10  kills                                 38092 non-null  float64 
 11  deaths       

In [67]:
# Strip the trailing '%' from each percentage column and rescale it to a 0-1 fraction.
for column in ('kill_assist_trade_survive_percentage', 'headshot_percentage'):
    df[column] = df[column].str.rstrip('%').astype('float') / 100.0

In [68]:
# Side has values attack, defence, both. Lowercase them for consistency,
# then store the column as a pandas category.
side_lower = df['side'].str.lower()
df['side'] = side_lower.astype('category')

In [73]:
# Declare the destination explicitly so this cell does not depend on whatever
# `path` was left pointing at by an earlier section of the notebook.
path = Path("../data/clean/vct_2024/matches")
path.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "overview.parquet", index=False)

In [75]:
# Load the raw scores table (rebinds `df`).
df = pd.read_parquet("../data/raw/vct_2024/matches/scores.parquet")

In [76]:
# Show the column labels of the frame just loaded.
df.columns

Index(['Tournament', 'Stage', 'Match Type', 'Match Name', 'Team A', 'Team B',
       'Team A Score', 'Team B Score', 'Match Result'],
      dtype='str')

In [77]:
# Map the raw scores headers onto snake_case column names.
column_map = {
    "Tournament": "tournament_name",
    "Stage": "stage_name",
    "Match Type": "match_type_name",
    "Match Name": "match_name",
    "Team A": "team_a_name",
    "Team B": "team_b_name",
    "Team A Score": "team_a_score",
    "Team B Score": "team_b_score",
    "Match Result": "match_result",
}
df = df.rename(columns=column_map)

In [None]:
# Derive the winner's name by removing the trailing " won" marker.
# Only a suffix is stripped, so a team name that itself contains " won" stays intact
# (an unanchored replace would delete every occurrence).
# NOTE(review): assumes match_result values look like "<team> won" — confirm against the raw data.
df['winning_team_name'] = df['match_result'].str.removesuffix(' won')
df = df.drop(columns=['match_result'])

In [82]:
# Declare the destination explicitly so this cell does not depend on whatever
# `path` was left pointing at by an earlier section of the notebook.
path = Path("../data/clean/vct_2024/matches")
path.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "scores.parquet", index=False)

In [None]:
# vct_2024/
#     agents/
#         agents_pick_rates.csv-------------------
#     ids/
#         players_ids.csv-----------------------    
#         teams_ids.csv------------------------
#         tournaments_stages_match_types_ids.csv
#         tournaments_stages_matches_games_ids.csv
#     matches/
#         maps_played.csv------------------
#         maps_scores.csv------------------
#         overview.csv---------------------
#         scores.csv-----------------------
#     players_stats/
#         players_stats.csv-----------------

In [84]:
# Load the raw players_ids table (rebinds `df`).
df = pd.read_parquet("../data/raw/vct_2024/ids/players_ids.parquet")

In [86]:
# Show the column labels of the frame just loaded.
df.columns

Index(['Player', 'Player ID'], dtype='str')

In [87]:
# Map the raw players_ids headers onto snake_case column names.
column_map = {
    "Player": "player_name",
    "Player ID": "player_id",
}
df = df.rename(columns=column_map)

In [89]:
# Output directory for the cleaned ids data; created (with parents) if it is missing.
path = Path("../data/clean/vct_2024/ids")
path.mkdir(parents=True, exist_ok=True)

In [90]:
# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "players_ids.parquet", index=False)

In [91]:
# Load the raw teams_ids table (rebinds `df`).
df = pd.read_parquet("../data/raw/vct_2024/ids/teams_ids.parquet")

In [92]:
# Show the column labels of the frame just loaded.
df.columns

Index(['Team', 'Team ID'], dtype='str')

In [93]:
# Map the raw teams_ids headers onto snake_case column names.
column_map = {
    "Team": "team_name",
    "Team ID": "team_id",
}
df = df.rename(columns=column_map)

In [95]:
# Declare the destination explicitly so this cell does not depend on whatever
# `path` was left pointing at by an earlier section of the notebook.
path = Path("../data/clean/vct_2024/ids")
path.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "teams_ids.parquet", index=False)

In [117]:
# Load the raw tournaments_stages_match_types_ids table (rebinds `df`).
df = pd.read_parquet("../data/raw/vct_2024/ids/tournaments_stages_match_types_ids.parquet")

In [118]:
# Show the column labels of the frame just loaded.
df.columns

Index(['Tournament', 'Tournament ID', 'Stage', 'Stage ID', 'Match Type',
       'Match Type ID'],
      dtype='str')

In [119]:
# Map the raw tournaments_stages_match_types_ids headers onto snake_case column names.
column_map = {
    "Tournament": "tournament_name",
    "Tournament ID": "tournament_id",
    "Stage": "stage_name",
    "Stage ID": "stage_id",
    "Match Type": "match_type_name",
    "Match Type ID": "match_type_id",
}
df = df.rename(columns=column_map)

In [120]:
# Discard rows that are missing either identifier.
required_ids = ['stage_id', 'match_type_id']
df = df.dropna(subset=required_ids)

In [121]:
# Cast both identifier columns to integers with a single astype call.
id_dtypes = {'stage_id': 'int', 'match_type_id': 'int'}
df = df.astype(id_dtypes)

In [122]:
# Declare the destination explicitly so this cell does not depend on whatever
# `path` was left pointing at by an earlier section of the notebook.
path = Path("../data/clean/vct_2024/ids")
path.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "tournaments_stages_match_types_ids.parquet", index=False)

In [123]:
# Load the raw tournaments_stages_matches_games_ids table (rebinds `df`).
df = pd.read_parquet("../data/raw/vct_2024/ids/tournaments_stages_matches_games_ids.parquet")

In [124]:
# Show the column labels of the frame just loaded.
df.columns

Index(['Tournament', 'Tournament ID', 'Stage', 'Stage ID', 'Match Type',
       'Match Name', 'Match ID', 'Map', 'Game ID'],
      dtype='str')

In [125]:
# Map the raw tournaments_stages_matches_games_ids headers onto snake_case column names.
column_map = {
    "Tournament": "tournament_name",
    "Tournament ID": "tournament_id",
    "Stage": "stage_name",
    "Stage ID": "stage_id",
    "Match Type": "match_type_name",
    "Match Name": "match_name",
    "Match ID": "match_id",
    "Map": "map_name",
    "Game ID": "game_id",
}
df = df.rename(columns=column_map)

In [126]:
# Declare the destination explicitly so this cell does not depend on whatever
# `path` was left pointing at by an earlier section of the notebook.
path = Path("../data/clean/vct_2024/ids")
path.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame to parquet without the index.
df.to_parquet(path / "tournaments_stages_matches_games_ids.parquet", index=False)