In [3]:
import pandas as pd
from pathlib import Path

# Data-layer locations, given relative to the notebook's working directory.
BASE = Path("../data/clean/vct_2024")  # cleaned (silver) VCT 2024 tables
DIM = Path("../data/gold/dimensions")  # gold dimension tables
OUT = Path("../data/gold/facts")  # destination for gold fact tables

# Create the output folder (and any missing parents) up front; no-op if it exists.
OUT.mkdir(exist_ok=True, parents=True)

In [4]:
# Silver source: match scores from the cleaned layer.
scores_path = BASE / "matches/scores.parquet"
scores = pd.read_parquet(scores_path)

# Gold dimensions used below to translate names into ids.
# Read in a fixed order: team, match, tournament.
dim_team, dim_match, dim_tournament = (
    pd.read_parquet(DIM / file_name)
    for file_name in (
        "dim_team.parquet",
        "dim_match.parquet",
        "dim_tournament.parquet",
    )
)

In [6]:
# Resolve team A's name to its id from dim_team.
# - Only the id/name pair is taken from the dimension, so any other dim_team
#   attributes cannot leak into the fact frame or collide (as _x/_y suffixes)
#   when dim_team is merged again for the other team roles.
# - validate="many_to_one" raises pandas.errors.MergeError if team_name is not
#   unique in dim_team, instead of silently duplicating score rows.
# The helper column team_name is kept in the result; it is dropped in a later cell.
fact = scores.merge(
    dim_team[["team_id", "team_name"]],
    left_on="team_a_name",
    right_on="team_name",
    how="left",
    validate="many_to_one",
).rename(
    columns={"team_id": "team_a_id"}
)

In [8]:
# Remove the dimension's join column so the next dim_team merge can add it again.
fact = fact.drop("team_name", axis="columns")

In [10]:
# The left join leaves team_a_id null for any team_a_name absent from dim_team.
# Print the count for the record, then fail fast so a fact table with
# unresolved keys is never written by a Run All.
unmatched_team_a = fact["team_a_id"].isna().sum()
print(unmatched_team_a)
assert unmatched_team_a == 0, (
    f"{unmatched_team_a} rows have a team_a_name with no match in dim_team"
)

0


In [11]:
# Resolve team B's name to its id from dim_team.
# - Only the id/name pair is taken from the dimension, so any other dim_team
#   attributes cannot leak into the fact frame or collide (as _x/_y suffixes)
#   with columns brought in by the other dim_team merges.
# - validate="many_to_one" raises pandas.errors.MergeError if team_name is not
#   unique in dim_team, instead of silently duplicating fact rows.
# The helper column team_name is kept in the result; it is dropped in a later cell.
fact = fact.merge(
    dim_team[["team_id", "team_name"]],
    left_on="team_b_name",
    right_on="team_name",
    how="left",
    validate="many_to_one",
).rename(
    columns={"team_id": "team_b_id"}
)

In [12]:
# Remove the dimension's join column so the next dim_team merge can add it again.
fact = fact.drop("team_name", axis="columns")

In [13]:
# The left join leaves team_b_id null for any team_b_name absent from dim_team.
# Print the count for the record, then fail fast so a fact table with
# unresolved keys is never written by a Run All.
unmatched_team_b = fact["team_b_id"].isna().sum()
print(unmatched_team_b)
assert unmatched_team_b == 0, (
    f"{unmatched_team_b} rows have a team_b_name with no match in dim_team"
)

0


In [14]:
# Resolve the winning team's name to its id from dim_team.
# - Only the id/name pair is taken from the dimension, so any other dim_team
#   attributes cannot leak into the fact frame or collide (as _x/_y suffixes)
#   with columns brought in by the other dim_team merges.
# - validate="many_to_one" raises pandas.errors.MergeError if team_name is not
#   unique in dim_team, instead of silently duplicating fact rows.
# The helper column team_name is kept in the result; it is dropped in a later cell.
fact = fact.merge(
    dim_team[["team_id", "team_name"]],
    left_on="winning_team_name",
    right_on="team_name",
    how="left",
    validate="many_to_one",
).rename(
    columns={"team_id": "winning_team_id"}
)

In [15]:
# The dimension's join column is no longer needed once winning_team_id is attached.
fact = fact.drop("team_name", axis="columns")

In [17]:
# The left join leaves winning_team_id null for any winning_team_name absent
# from dim_team. Print the count for the record, then fail fast so a fact
# table with unresolved keys is never written by a Run All.
unmatched_winner = fact["winning_team_id"].isna().sum()
print(unmatched_winner)
assert unmatched_winner == 0, (
    f"{unmatched_winner} rows have a winning_team_name with no match in dim_team"
)

0


In [19]:
# Attach the tournament dimension's columns by matching on the
# tournament / stage / match-type names.
# validate="many_to_one" raises pandas.errors.MergeError if that name triple
# is not unique in dim_tournament, instead of silently duplicating fact rows.
fact = fact.merge(
    dim_tournament,
    on=[
        "tournament_name",
        "stage_name",
        "match_type_name",
    ],
    how="left",
    validate="many_to_one",
)

In [21]:
# After the left join with dim_tournament, a null tournament_id marks a row
# whose name triple found no dimension entry. Print the count for the record,
# then fail fast so unresolved keys never reach the written fact table.
unmatched_tournament = fact["tournament_id"].isna().sum()
print(unmatched_tournament)
assert unmatched_tournament == 0, (
    f"{unmatched_tournament} rows have no matching entry in dim_tournament"
)

0


In [26]:
# Inspect dim_match before joining: it is keyed per game_id, so a match_id
# repeats once for every map played in that match.
dim_match

Unnamed: 0,match_id,game_id,match_name,map_name,tournament_id,stage_id,match_type_id
0,295605,153722,NRG Esports vs FURIA,Breeze,1923,3637,23017
1,295605,153723,NRG Esports vs FURIA,Ascent,1923,3637,23017
2,295606,153725,Cloud9 vs MIBR,Bind,1923,3637,23017
3,295606,153726,Cloud9 vs MIBR,Split,1923,3637,23017
4,295606,153727,Cloud9 vs MIBR,Lotus,1923,3637,23017
...,...,...,...,...,...,...,...
1099,378835,180454,Team Heretics vs Sentinels,Abyss,2097,4131,26241
1100,378836,180455,LEVIATÁN vs Team Heretics,Icebox,2097,4131,26242
1101,378836,180456,LEVIATÁN vs Team Heretics,Sunset,2097,4131,26242
1102,378836,180457,LEVIATÁN vs Team Heretics,Abyss,2097,4131,26242


In [28]:
# Attach match_id from dim_match.
# dim_match has one row per game, so it is first reduced to one row per
# distinct (match_id + join keys) combination.
# The join keys are declared once and reused for both the column selection and
# the merge, so the two lists cannot drift apart.
# validate="many_to_one" raises pandas.errors.MergeError if the same key
# combination maps to more than one match_id, instead of silently duplicating
# fact rows.
match_join_keys = [
    "tournament_id",
    "stage_id",
    "match_type_id",
    "match_name",
]

fact = fact.merge(
    dim_match[["match_id", *match_join_keys]].drop_duplicates(),
    on=match_join_keys,
    how="left",
    validate="many_to_one",
)

In [30]:
# The left join leaves match_id null for any row whose tournament/stage/
# match-type/match-name combination is absent from dim_match. Print the count
# for the record, then fail fast so unresolved keys never reach the written
# fact table.
unmatched_match = fact["match_id"].isna().sum()
print(unmatched_match)
assert unmatched_match == 0, (
    f"{unmatched_match} rows have no matching match_id in dim_match"
)

0


In [31]:
# Project the working frame down to the fact-table columns: ids first, then scores.
key_columns = [
    "match_id",
    "tournament_id",
    "stage_id",
    "match_type_id",
    "team_a_id",
    "team_b_id",
    "winning_team_id",
]
score_columns = ["team_a_score", "team_b_score"]

# .copy() gives the fact table its own data, independent of `fact`.
fact_match_results = fact.loc[:, key_columns + score_columns].copy()

In [32]:
# match_id must identify exactly one row of the fact table.
# Print the number of repeated ids for the record, then fail fast so a
# fanned-out table is never written by a Run All.
duplicate_match_rows = fact_match_results.duplicated("match_id").sum()
print(duplicate_match_rows)
assert duplicate_match_rows == 0, (
    f"{duplicate_match_rows} rows repeat a match_id already present in fact_match_results"
)

0


In [34]:
# Final completeness check: no column of the fact table may contain nulls.
# The assertion message lists the offending columns; on success the per-column
# counts are still shown as the cell output.
null_counts = fact_match_results.isna().sum()
assert not null_counts.any(), (
    f"null values found in: {null_counts[null_counts > 0].to_dict()}"
)
null_counts

match_id           0
tournament_id      0
stage_id           0
match_type_id      0
team_a_id          0
team_b_id          0
winning_team_id    0
team_a_score       0
team_b_score       0
dtype: int64

In [35]:
# Persist the fact table under the gold facts folder; the DataFrame index is not written.
out_path = OUT / "fact_match_results.parquet"
fact_match_results.to_parquet(out_path, index=False)