In [3]:
import pandas as pd
from pathlib import Path

PROCESSED_DIR = Path("../data/processed")


def coerce_float_ids(df):
    """Cast float-typed ``*_id`` columns to the nullable ``Int64`` dtype.

    pandas reads an integer column that contains missing values as float64,
    so IDs end up written back to CSV as ``12.0``. Casting to ``Int64`` keeps
    missing entries as ``<NA>`` while the remaining values become real
    integers. (The previous ``df[col].dropna().astype(int)`` was re-aligned
    on the index when assigned back, which re-introduced the NaNs and turned
    the column into float64 again.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    (pandas.DataFrame, list)
        A converted copy of ``df`` (the input is not modified) and the list
        of column names that were converted, in column order.

    Raises
    ------
    TypeError
        Raised by pandas if a float ID column holds a non-integral value
        (e.g. ``12.5``), because that cannot be cast to ``Int64`` safely.
    """
    float_id_cols = [
        col
        for col in df.columns
        if col.endswith("_id") and pd.api.types.is_float_dtype(df[col])
    ]
    return df.astype({col: "Int64" for col in float_id_cols}), float_id_cols


# sorted() keeps the processing / log order stable across platforms.
for csv_file in sorted(PROCESSED_DIR.glob("*.csv")):
    df, fixed_cols = coerce_float_ids(pd.read_csv(csv_file))

    # Leave files without float ID columns untouched instead of rewriting
    # them and reporting a fix that never happened.
    if not fixed_cols:
        continue

    # NOTE: CSV stores no dtypes; a column that still has missing IDs is read
    # back as float64 unless the reader casts it again or passes ``dtype=``.
    df.to_csv(csv_file, index=False)
    print(f"Fixed IDs in {csv_file.name}")


Fixed IDs in dim_competition.csv
Fixed IDs in dim_match.csv
Fixed IDs in dim_player.csv
Fixed IDs in dim_season.csv
Fixed IDs in dim_team.csv
Fixed IDs in fact_event.csv
Fixed IDs in fact_pass.csv
Fixed IDs in fact_player_match_stats.csv
Fixed IDs in fact_shot.csv
Fixed IDs in fact_team_match_stats.csv


In [4]:
# Reload the event facts and pin the ID column dtypes in a single cast:
#   - player_id may be missing, so it uses the nullable "Int64" dtype;
#   - team_id / possession_team_id must always be present, so a plain int
#     cast is used (pandas raises here if either column contains NaN).
fact_event = pd.read_csv(PROCESSED_DIR / "fact_event.csv").astype(
    {
        "player_id": "Int64",
        "team_id": int,
        "possession_team_id": int,
    }
)

fact_event.dtypes

event_id               object
match_id                int64
team_id                 int64
player_id               Int64
period                  int64
minute                  int64
second                  int64
event_type             object
event_category         object
possession              int64
possession_team_id      int64
under_pressure           bool
x                     float64
y                     float64
is_pass                  bool
is_shot                  bool
is_carry                 bool
is_pressure              bool
dtype: object

In [5]:
# Overwrite the processed file with the re-typed event facts (no index column).
fact_event.to_csv(path_or_buf=PROCESSED_DIR / "fact_event.csv", index=False)


In [6]:
# Reload the per-player match stats and store `goals` with the nullable
# "Int64" dtype, so any missing value is kept as <NA> rather than forcing
# the whole column to float.
fact_player_match_stats = pd.read_csv(
    PROCESSED_DIR / "fact_player_match_stats.csv"
).astype({"goals": "Int64"})

fact_player_match_stats.dtypes

match_id       int64
player_id      int64
events         int64
passes         int64
shots          int64
carries        int64
pressures      int64
goals          Int64
xg           float64
dtype: object

In [7]:
# Overwrite the processed file with the re-typed player stats (no index column).
fact_player_match_stats.to_csv(
    path_or_buf=PROCESSED_DIR / "fact_player_match_stats.csv",
    index=False,
)
