In [1]:
import sys
from pathlib import Path
sys.path.append(str(Path("..").resolve()))

from src.load_data import load_matches, load_players, load_scorers
from src.cleaning import clean_matches, clean_players, clean_scorers
from src.config import PROCESSED_DATA

# Load each raw dataset and run it through its matching project cleaning step.
matches = clean_matches(load_matches())
players = clean_players(load_players())
scorers = clean_scorers(load_scorers())

# parents=True: a fresh checkout may not have the parent data directory yet,
# in which case mkdir(exist_ok=True) alone would raise FileNotFoundError.
PROCESSED_DATA.mkdir(parents=True, exist_ok=True)

# Persist the cleaned frames; index=False keeps the row index out of the CSVs.
# NOTE(review): these files are written before the missing-value handling in
# the later cells runs, so they do not contain the attendance fill - confirm
# that this is intended.
matches.to_csv(PROCESSED_DATA / "matches_clean.csv", index=False)
players.to_csv(PROCESSED_DATA / "players_clean.csv", index=False)
scorers.to_csv(PROCESSED_DATA / "scorers_clean.csv", index=False)

print("✅ Clean files saved")

# Last expression of the cell: show the first rows of the cleaned matches.
matches.head()

✅ Clean files saved


Unnamed: 0,year,datetime,stage,stadium,city,home_team_name,home_team_goals,away_team_goals,away_team_name,win_conditions,...,half_time_home_goals,half_time_away_goals,referee,assistant_1,assistant_2,roundid,matchid,home_team_initials,away_team_initials,total_goals
0,1930,1930-07-13 15:00:00,Group 1,Pocitos,Montevideo,France,4,1,Mexico,,...,3,0,LOMBARDI Domingo (URU),CRISTOPHE Henry (BEL),REGO Gilberto (BRA),201,1096,FRA,MEX,5
1,1930,1930-07-13 15:00:00,Group 4,Parque Central,Montevideo,USA,3,0,Belgium,,...,2,0,MACIAS Jose (ARG),MATEUCCI Francisco (URU),WARNKEN Alberto (CHI),201,1090,USA,BEL,3
2,1930,1930-07-14 12:45:00,Group 2,Parque Central,Montevideo,Yugoslavia,2,1,Brazil,,...,2,0,TEJADA Anibal (URU),VALLARINO Ricardo (URU),BALWAY Thomas (FRA),201,1093,YUG,BRA,3
3,1930,1930-07-14 14:50:00,Group 3,Pocitos,Montevideo,Romania,3,1,Peru,,...,1,0,WARNKEN Alberto (CHI),LANGENUS Jean (BEL),MATEUCCI Francisco (URU),201,1098,ROU,PER,4
4,1930,1930-07-15 16:00:00,Group 1,Parque Central,Montevideo,Argentina,1,0,France,,...,0,0,REGO Gilberto (BRA),SAUCEDO Ulises (BOL),RADULESCU Constantin (ROU),201,1085,ARG,FRA,1


In [2]:
# Number of missing values in each column of the matches frame.
matches.isna().sum(axis="index")


year                     0
datetime                10
stage                    0
stadium                  0
city                     0
home_team_name           0
home_team_goals          0
away_team_goals          0
away_team_name           0
win_conditions           0
attendance               1
half_time_home_goals     0
half_time_away_goals     0
referee                  0
assistant_1              0
assistant_2              0
roundid                 48
matchid                 48
home_team_initials      48
away_team_initials      48
total_goals              0
dtype: int64

In [3]:
# Percentage of missing values per column, most affected columns first.
(100 * matches.isna().mean()).sort_values(ascending=False)


matchid                 5.206074
roundid                 5.206074
away_team_initials      5.206074
home_team_initials      5.206074
datetime                1.084599
attendance              0.108460
stadium                 0.000000
stage                   0.000000
year                    0.000000
city                    0.000000
home_team_name          0.000000
away_team_goals         0.000000
home_team_goals         0.000000
half_time_away_goals    0.000000
half_time_home_goals    0.000000
win_conditions          0.000000
away_team_name          0.000000
assistant_2             0.000000
assistant_1             0.000000
referee                 0.000000
total_goals             0.000000
dtype: float64

In [4]:
# Discard every match row that lacks a value in either goals column.
matches = matches.dropna(
    axis="index",
    how="any",
    subset=["home_team_goals", "away_team_goals"],
)


In [5]:
# Replace missing attendance values with the column median.
# assign() rebinds `matches` to a new frame instead of writing into the result
# of the previous dropna, which avoids SettingWithCopyWarning and keeps the
# cell safe to re-run.
matches = matches.assign(
    attendance=matches["attendance"].fillna(matches["attendance"].median())
)


In [6]:
def handle_missing_values(matches):
    """Drop matches without a final score and fill missing attendance.

    Parameters
    ----------
    matches : pandas.DataFrame
        Frame containing at least the columns ``home_team_goals``,
        ``away_team_goals`` and ``attendance``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left unmodified. Rows with a missing value
        in either goals column are removed, and missing ``attendance`` values
        are replaced by the median attendance of the remaining rows.
    """
    scored = matches.dropna(subset=["home_team_goals", "away_team_goals"])

    # Build the filled column with assign() rather than item assignment:
    # when dropna actually removes rows, pandas can flag the result as a copy
    # of a slice, and `frame[col] = ...` on it raises SettingWithCopyWarning.
    # The median is taken after the drop, so discarded rows do not affect it.
    return scored.assign(
        attendance=scored["attendance"].fillna(scored["attendance"].median())
    )
