In [4]:
# trying to predict the ucl 

In [None]:
# loading the datasets
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings # to supress harmless warninigs
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore")

# FBref "Scores and Fixtures" page of each season, keyed by season label.
fixture_urls = {
    "2025-26": "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures",
    "2024-25": "https://fbref.com/en/comps/9/2024-2025/schedule/2024-2025-Premier-League-Scores-and-Fixtures",
    "2023-24": "https://fbref.com/en/comps/9/2023-2024/schedule/2023-2024-Premier-League-Scores-and-Fixtures",
    "2022-23": "https://fbref.com/en/comps/9/2022-2023/schedule/2022-2023-Premier-League-Scores-and-Fixtures",
    "2021-22": "https://fbref.com/en/comps/9/2021-2022/schedule/2021-2022-Premier-League-Scores-and-Fixtures",
    "2020-21": "https://fbref.com/en/comps/9/2020-2021/schedule/2020-2021-Premier-League-Scores-and-Fixtures",
}

# FBref stats page of each season, using the same season labels as keys.
stats_urls = {
    "2025-26": "https://fbref.com/en/comps/9/stats/Premier-League-Stats",
    "2024-25": "https://fbref.com/en/comps/9/2024-2025/stats/2024-2025-Premier-League-Stats",
    "2023-24": "https://fbref.com/en/comps/9/2023-2024/stats/2023-2024-Premier-League-Stats",
    "2022-23": "https://fbref.com/en/comps/9/2022-2023/stats/2022-2023-Premier-League-Stats",
    "2021-22": "https://fbref.com/en/comps/9/2021-2022/stats/2021-2022-Premier-League-Stats",
    "2020-21": "https://fbref.com/en/comps/9/2020-2021/stats/2020-2021-Premier-League-Stats",
}

# Season split: fit on the oldest four seasons, evaluate on 2024-25,
# forecast 2025-26.
train_seasons = [
    "2020-21",
    "2021-22",
    "2022-23",
    "2023-24",
]
test_seasons = ["2024-25"]
predict_season = ["2025-26"]

# Candidate squad-stat column names (not referenced by the code shown in
# this cell).
stats_cols = [
    "Squad",
    "Goals",
    "Ast",
    "G+A",
    "Gls/90",
    "Ast/90",
    "G+A/90",
    "xG",
    "xGA",
    "Poss",
]




## parsing and loading

def split_score_to_goals(score_series):
    """Split score strings such as "2–1" into integer home/away goal columns.

    Args:
        score_series: Series of score values; each value is converted to text
            before matching.

    Returns:
        DataFrame with nullable-integer columns ``HomeGoals`` and
        ``AwayGoals``. Values that do not look like ``<digits> <sep> <digits>``
        (separator "-", "–" or ":") become ``<NA>`` in both columns.
    """
    score_pattern = r'(?P<HomeGoals>\d+)\s*[-–:]\s*(?P<AwayGoals>\d+)'
    score_text = score_series.astype(str)
    goal_parts = score_text.str.extract(score_pattern)
    # Go through float first so unmatched rows (NaN) survive the cast to Int64.
    goal_numbers = goal_parts.astype(float)
    return goal_numbers.astype("Int64")


def load_fixtures_for_season(season):
    """Download one season's fixture table and derive goals and results.

    The page is fetched directly by ``pd.read_html``; no custom request
    headers are sent. NOTE(review): the recorded run of this cell ended with
    ``HTTP Error 403: Forbidden``, so the site may reject this kind of fetch.

    Args:
        season: Season label that is a key of ``fixture_urls`` (e.g. "2023-24").

    Returns:
        DataFrame with one row per fixture and the columns Season, Date,
        HomeTeam, AwayTeam, Score, HomeGoals, AwayGoals, Result, is_played.
        ``Result`` is "HomeWin", "AwayWin" or "Draw" for played matches and
        ``pd.NA`` for matches without a parsable score.

    Raises:
        KeyError: if ``season`` is not a key of ``fixture_urls``.
    """
    url = fixture_urls[season]
    tables = pd.read_html(url, flavor="html5lib")

    # The first table on the page is treated as the full fixture list.
    df = tables[0].rename(columns={"Home": "HomeTeam", "Away": "AwayTeam"})

    # Rows that name no home or away team are not fixtures. If kept, they look
    # like unplayed matches and end up in the prediction output.
    # NOTE(review): presumably these are the blank separator rows between
    # match weeks - confirm against the live table.
    df = df.dropna(subset=["HomeTeam", "AwayTeam"]).copy()

    df["Season"] = season
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    # Attach numeric HomeGoals / AwayGoals parsed from the Score text.
    goals = split_score_to_goals(df["Score"])
    df = pd.concat([df, goals], axis=1)

    home_goals = df["HomeGoals"]
    away_goals = df["AwayGoals"]
    df["is_played"] = home_goals.notna() & away_goals.notna()

    # Label the outcome for played matches only; everything else stays <NA>.
    played = df["is_played"]
    df["Result"] = pd.NA
    df.loc[played & (home_goals > away_goals), "Result"] = "HomeWin"
    df.loc[played & (away_goals > home_goals), "Result"] = "AwayWin"
    df.loc[played & (home_goals == away_goals), "Result"] = "Draw"

    keep = ["Season", "Date", "HomeTeam", "AwayTeam", "Score",
            "HomeGoals", "AwayGoals", "Result", "is_played"]
    return df[keep].reset_index(drop=True)
    
#print(load_fixtures_for_season("2023-24").head(10))

def load_stats_for_season(season):
    """Download one season's squad stats table and keep the modelling columns.

    The page is fetched directly by ``pd.read_html``; no custom request
    headers are sent (the previous ``headers`` local was a set literal that
    was never used and has been removed).

    Args:
        season: Season label that is a key of ``stats_urls`` (e.g. "2024-25").

    Returns:
        DataFrame with one row per squad containing whichever of the wanted
        columns (Squad, Poss, Gls, Ast, G+A, Gls/90, Ast/90, G+A/90, xG, npxG,
        xAG, npxG+xAG, PrgC, PrgP) exist in the downloaded table.

    Raises:
        KeyError: if ``season`` is not a key of ``stats_urls``.
    """
    url = stats_urls[season]
    tables = pd.read_html(url, flavor="html5lib")
    base = tables[0].copy()

    # Flatten multi-row headers to the bottom-level name. Columns grouped
    # under a "per 90" header get a "/90" suffix so that they do not collide
    # with the season totals of the same name and so that names like "Gls/90"
    # in keep_cols can match.
    # NOTE(review): assumes the per-90 columns sit under a header containing
    # "Per 90" - confirm against the live table.
    flat_names = []
    for col in base.columns:
        if isinstance(col, tuple):
            name = col[-1]
            in_per90_group = any("per 90" in str(level).lower() for level in col[:-1])
            if in_per90_group and not str(name).endswith("/90"):
                name = f"{name}/90"
        else:
            name = col
        flat_names.append(name)
    base.columns = flat_names

    # If flattening still produced repeated names, keep the first occurrence so
    # every name selects exactly one column.
    base = base.loc[:, ~base.columns.duplicated()]

    # Drop unnamed junk columns.
    base = base.loc[:, ~base.columns.astype(str).str.contains("Unnamed")]

    # Keep relevant stats (silently skipping any that the table lacks).
    keep_cols = [
        "Squad", "Poss", "Gls", "Ast", "G+A", "Gls/90", "Ast/90", "G+A/90",
        "xG", "npxG", "xAG", "npxG+xAG", "PrgC", "PrgP"
    ]
    base = base[[c for c in keep_cols if c in base.columns]]

    return base.reset_index(drop=True)

#print(load_stats_for_season("2024-25").head(20))

def build_match_dataset(season):
    """Combine a season's fixtures with both teams' season stats.

    Every stats column is attached twice: once for the home team with a
    ``Home_`` prefix and once for the away team with an ``Away_`` prefix.
    The prefix is applied to the stats table *before* merging. Renaming after
    the merge (as before) also re-prefixed the existing home columns, which
    ended up as ``Away_Home_*``.

    Args:
        season: Season label understood by ``load_fixtures_for_season`` and
            ``load_stats_for_season``.

    Returns:
        The fixtures DataFrame with added ``Home_<stat>`` and ``Away_<stat>``
        columns. Fixtures whose team has no row in the stats table keep NaN in
        the corresponding stat columns (left join).
    """
    fixtures = load_fixtures_for_season(season)  # fixtures for the given season
    stats = load_stats_for_season(season)  # squad stats for the given season

    merged = fixtures
    for side, team_col in (("Home", "HomeTeam"), ("Away", "AwayTeam")):
        side_stats = stats.add_prefix(f"{side}_")
        squad_col = f"{side}_Squad"
        merged = merged.merge(side_stats, left_on=team_col, right_on=squad_col, how="left")
        # The squad name duplicates the team column it was joined on.
        merged = merged.drop(columns=[squad_col])

    return merged

#print(build_match_dataset("2023-24").head(10))

def add_rolling_form_features(df, window=5):
    """Add rolling form features for the home and the away team of each match.

    For every match, the features are the mean goals scored, goals conceded
    and points (3/1/0) over the team's last ``window`` *played* matches before
    that fixture. Home or away does not matter for the history. A team needs
    ``window`` earlier played matches; otherwise the feature is NaN.

    Compared with the previous version:
      * ``HomeForm_*`` is written only on rows where the team is the home
        side, and ``AwayForm_*`` only where it is the away side. Previously
        each team's form was written to all of its matches, so a row could end
        up holding the opponent's form.
      * Matches without a score no longer count as 0-point games and no longer
        break the window. They simply receive the form from the team's most
        recent played matches.

    Args:
        df: DataFrame with the columns Date, HomeTeam, AwayTeam, HomeGoals and
            AwayGoals. It is not modified.
        window: Number of previous played matches to average over.

    Returns:
        A copy of ``df`` sorted by Date with a fresh 0..n-1 index and six new
        float columns named ``{HomeForm|AwayForm}_{Goals_For|Goals_Against|
        Points}_rolling{window}``.
    """
    df = df.sort_values("Date", kind="stable").reset_index(drop=True)

    # Work on plain floats: missing scores become NaN whatever the input dtype.
    home_goals = pd.to_numeric(df["HomeGoals"], errors="coerce").astype("float64")
    away_goals = pd.to_numeric(df["AwayGoals"], errors="coerce").astype("float64")

    stat_names = ("Goals_For", "Goals_Against", "Points")
    for prefix in ("HomeForm", "AwayForm"):
        for stat in stat_names:
            df[f"{prefix}_{stat}_rolling{window}"] = np.nan

    teams = pd.concat([df["HomeTeam"], df["AwayTeam"]]).dropna().unique()
    for team in teams:
        at_home = df["HomeTeam"] == team
        involved = at_home | (df["AwayTeam"] == team)
        home_side = at_home[involved]

        # Goals from this team's point of view, in date order.
        scored = home_goals[involved].where(home_side, away_goals[involved])
        conceded = away_goals[involved].where(home_side, home_goals[involved])
        played = scored.notna() & conceded.notna()
        points = pd.Series(
            np.select([scored > conceded, scored == conceded], [3.0, 1.0], default=0.0),
            index=scored.index,
        ).where(played)

        per_match = {
            "Goals_For": scored.where(played),
            "Goals_Against": conceded.where(played),
            "Points": points,
        }
        home_rows = home_side.index[home_side.to_numpy()]
        away_rows = home_side.index[~home_side.to_numpy()]

        for stat, values in per_match.items():
            # Rolling mean over played matches only. Carry it forward over
            # unplayed fixtures, then shift by one so a match never sees its
            # own result.
            form = (
                values.dropna()
                .rolling(window)
                .mean()
                .reindex(values.index)
                .ffill()
                .shift(1)
            )
            df.loc[home_rows, f"HomeForm_{stat}_rolling{window}"] = form.loc[home_rows]
            df.loc[away_rows, f"AwayForm_{stat}_rolling{window}"] = form.loc[away_rows]

    return df


import time  # stdlib; imported here so this step is self-contained

# Pause after each season download (every season costs two page requests).
# A recorded run of this notebook was answered with "429 Too Many Requests".
# NOTE(review): tune this against the site's published crawl limits.
SEASON_PAUSE_SECONDS = 15


def _load_seasons(seasons):
    """Build the match dataset of every season in ``seasons`` and stack them."""
    frames = []
    for season in seasons:
        frames.append(build_match_dataset(season))
        time.sleep(SEASON_PAUSE_SECONDS)
    return pd.concat(frames, ignore_index=True)


def _feature_matrix(data, columns):
    """Return ``data[columns]`` coerced to numbers, with missing values set to 0.

    Missing values occur for the first matches of a team (no form history
    yet) and for teams without a row in the stats table.
    """
    return data[columns].apply(pd.to_numeric, errors="coerce").fillna(0)


train_data = _load_seasons(train_seasons)
test_data = _load_seasons(test_seasons)

# Add rolling form
train_data = add_rolling_form_features(train_data, window=5)
test_data = add_rolling_form_features(test_data, window=5)

# Keep played matches only
train_data = train_data[train_data["is_played"]]
test_data = test_data[test_data["is_played"]]

# Select features: season stats (Home_/Away_) and rolling form
# (HomeForm_/AwayForm_). The previous substring test for "Home_"/"Away_" never
# matched the "HomeForm_..." names, so the form features were silently left out.
# NOTE(review): the Home_/Away_ stats come from the same season's stats page,
# so for past seasons they presumably include matches played after the
# fixture being predicted - confirm whether that leakage is acceptable.
feature_prefixes = ("Home_", "Away_", "HomeForm_", "AwayForm_")
feature_cols = list(dict.fromkeys(
    c for c in train_data.columns if str(c).startswith(feature_prefixes)
))

X_train = _feature_matrix(train_data, feature_cols)  # training features
y_train = train_data["Result"].astype(str)

X_test = _feature_matrix(test_data, feature_cols)  # testing features
y_test = test_data["Result"].astype(str)

rf = RandomForestClassifier(n_estimators=500, random_state=42)  # random forest with 500 trees
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nClassification Report: \n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))  # true vs predicted results


# === Step 6: Prepare prediction dataset for 2025-26 season ===
predict_data = _load_seasons(["2025-26"])  # 2025-26 fixtures and stats
# The model was trained with form columns, so the prediction frame needs them too.
predict_data = add_rolling_form_features(predict_data, window=5)

# Keep only unplayed matches; copy so the new column is not written to a slice.
future_matches = predict_data[~predict_data["is_played"]].copy()

print("\nUpcoming Predictions (2025-26):")
if future_matches.empty:
    print("No unplayed matches found.")
else:
    X_future = _feature_matrix(future_matches, feature_cols)  # features for future matches
    future_matches["Predicted_Result"] = rf.predict(X_future)
    # Print the first 25 predicted results for upcoming matches
    print(future_matches[["Date", "HomeTeam", "AwayTeam", "Predicted_Result"]].head(25))





HTTPError: HTTP Error 403: Forbidden

In [None]:
# === Imports ===
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

# === FBRef URLs ===
# One entry per season: (season label, fixtures page, stats page).
_season_pages = [
    (
        "2025-26",
        "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures",
        "https://fbref.com/en/comps/9/stats/Premier-League-Stats",
    ),
    (
        "2024-25",
        "https://fbref.com/en/comps/9/2024-2025/schedule/2024-2025-Premier-League-Scores-and-Fixtures",
        "https://fbref.com/en/comps/9/2024-2025/stats/2024-2025-Premier-League-Stats",
    ),
    (
        "2023-24",
        "https://fbref.com/en/comps/9/2023-2024/schedule/2023-2024-Premier-League-Scores-and-Fixtures",
        "https://fbref.com/en/comps/9/2023-2024/stats/2023-2024-Premier-League-Stats",
    ),
    (
        "2022-23",
        "https://fbref.com/en/comps/9/2022-2023/schedule/2022-2023-Premier-League-Scores-and-Fixtures",
        "https://fbref.com/en/comps/9/2022-2023/stats/2022-2023-Premier-League-Stats",
    ),
    (
        "2021-22",
        "https://fbref.com/en/comps/9/2021-2022/schedule/2021-2022-Premier-League-Scores-and-Fixtures",
        "https://fbref.com/en/comps/9/2021-2022/stats/2021-2022-Premier-League-Stats",
    ),
    (
        "2020-21",
        "https://fbref.com/en/comps/9/2020-2021/schedule/2020-2021-Premier-League-Scores-and-Fixtures",
        "https://fbref.com/en/comps/9/2020-2021/stats/2020-2021-Premier-League-Stats",
    ),
]

# Season label -> page URL lookups used by the loader functions.
fixture_urls = {label: fixtures_page for label, fixtures_page, _ in _season_pages}
stats_urls = {label: stats_page for label, _, stats_page in _season_pages}

# Seasons used to fit the model and the season held out for evaluation.
train_seasons = [
    "2020-21",
    "2021-22",
    "2022-23",
    "2023-24",
]
test_seasons = ["2024-25"]

# === Helper Functions ===
def split_score_to_goals(score_series):
    """Parse score text into nullable-integer ``HomeGoals`` / ``AwayGoals``.

    A score is two runs of digits separated by "-", "–" or ":" with optional
    whitespace around the separator. Values without such a pattern yield
    ``<NA>`` in both columns.
    """
    pattern = r'(?P<HomeGoals>\d+)\s*[-–:]\s*(?P<AwayGoals>\d+)'
    return (
        score_series.astype(str)
        .str.extract(pattern)
        .astype(float)  # NaN-capable intermediate for unmatched rows
        .astype("Int64")
    )

def load_fixtures_for_season(season):
    """Read a season's fixture table and add goals, result and played flag.

    The URL is looked up in ``fixture_urls`` and fetched by ``pd.read_html``
    without custom request headers.

    Args:
        season: Season label that is a key of ``fixture_urls``.

    Returns:
        DataFrame with the columns Season, Date, HomeTeam, AwayTeam, Score,
        HomeGoals, AwayGoals, Result ("HomeWin"/"AwayWin"/"Draw", or ``pd.NA``
        when no score could be parsed) and is_played.

    Raises:
        KeyError: if ``season`` is not a key of ``fixture_urls``.
    """
    url = fixture_urls[season]
    tables = pd.read_html(url, flavor="html5lib")
    # The first table on the page is used as the fixture list.
    df = tables[0].rename(columns={"Home": "HomeTeam", "Away": "AwayTeam"})

    # Rows without a home or away team are not matches. Left in, they count as
    # "unplayed" and would receive predictions later on.
    # NOTE(review): presumably blank separator rows of the source table - confirm.
    df = df.dropna(subset=["HomeTeam", "AwayTeam"]).copy()

    df["Season"] = season
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df = pd.concat([df, split_score_to_goals(df["Score"])], axis=1)

    home_goals, away_goals = df["HomeGoals"], df["AwayGoals"]
    df["is_played"] = home_goals.notna() & away_goals.notna()

    # Only played matches get an outcome label.
    played = df["is_played"]
    df["Result"] = pd.NA
    df.loc[played & (home_goals > away_goals), "Result"] = "HomeWin"
    df.loc[played & (away_goals > home_goals), "Result"] = "AwayWin"
    df.loc[played & (home_goals == away_goals), "Result"] = "Draw"

    keep = ["Season", "Date", "HomeTeam", "AwayTeam", "Score", "HomeGoals", "AwayGoals", "Result", "is_played"]
    return df[keep].reset_index(drop=True)

def load_stats_for_season(season):
    """Read a season's squad stats table and keep the modelling columns.

    Args:
        season: Season label that is a key of ``stats_urls``.

    Returns:
        DataFrame with one row per squad holding whichever of Squad, Poss,
        Gls, Ast, G+A, Gls/90, Ast/90, G+A/90, xG, npxG, xAG, npxG+xAG, PrgC
        and PrgP exist in the downloaded table.

    Raises:
        KeyError: if ``season`` is not a key of ``stats_urls``.
    """
    url = stats_urls[season]
    tables = pd.read_html(url, flavor="html5lib")
    base = tables[0].copy()

    # Flatten multi-row headers to their bottom-level name. Columns under a
    # "per 90" group are suffixed with "/90" so they stay distinct from the
    # season totals and can match names such as "Gls/90" below.
    # NOTE(review): assumes the group header contains "Per 90" - confirm
    # against the live table.
    flat_names = []
    for col in base.columns:
        name = col[-1] if isinstance(col, tuple) else col
        if isinstance(col, tuple):
            per90 = any("per 90" in str(level).lower() for level in col[:-1])
            if per90 and not str(name).endswith("/90"):
                name = f"{name}/90"
        flat_names.append(name)
    base.columns = flat_names

    # One column per name: keep the first occurrence of any repeated name.
    base = base.loc[:, ~base.columns.duplicated()]
    base = base.loc[:, ~base.columns.astype(str).str.contains("Unnamed")]

    keep_cols = ["Squad", "Poss", "Gls", "Ast", "G+A", "Gls/90", "Ast/90", "G+A/90", "xG", "npxG", "xAG", "npxG+xAG", "PrgC", "PrgP"]
    base = base[[c for c in keep_cols if c in base.columns]]
    return base.reset_index(drop=True)

def build_match_dataset(season):
    """Join a season's fixtures with the season stats of both teams.

    Stats columns are prefixed (``Home_`` / ``Away_``) before each merge.
    Renaming after the merge, as before, also hit the home columns that were
    already prefixed and turned them into ``Away_Home_*``.

    Args:
        season: Season label passed to ``load_fixtures_for_season`` and
            ``load_stats_for_season``.

    Returns:
        The fixtures DataFrame plus ``Home_<stat>`` and ``Away_<stat>``
        columns; teams missing from the stats table get NaN (left join).
    """
    fixtures = load_fixtures_for_season(season)
    stats = load_stats_for_season(season)

    home_stats = stats.add_prefix("Home_")
    away_stats = stats.add_prefix("Away_")

    merged = fixtures.merge(home_stats, left_on="HomeTeam", right_on="Home_Squad", how="left")
    merged = merged.merge(away_stats, left_on="AwayTeam", right_on="Away_Squad", how="left")
    # The squad names repeat HomeTeam / AwayTeam.
    return merged.drop(columns=["Home_Squad", "Away_Squad"])

def add_rolling_form_features(df, window=5):
    """Attach each side's recent form (goals for, goals against, points).

    The form of a team at a fixture is the mean over its last ``window``
    played matches before that fixture, home and away combined. It is NaN
    until the team has ``window`` earlier played matches.

    Fixes over the previous version:
      * Form is written to ``HomeForm_*`` only where the team is at home and
        to ``AwayForm_*`` only where it is away. Before, every team
        overwrote both sides' rows and the last team processed won.
      * Fixtures without a score are left out of the rolling window instead of
        counting as 0-point matches; they inherit the latest available form.

    Args:
        df: DataFrame with Date, HomeTeam, AwayTeam, HomeGoals, AwayGoals.
        window: Number of earlier played matches to average.

    Returns:
        A date-sorted copy of ``df`` (index reset) with six float columns
        ``{HomeForm|AwayForm}_{Goals_For|Goals_Against|Points}_rolling{window}``.
    """
    df = df.sort_values("Date", kind="stable").reset_index(drop=True)

    # Floats with NaN for missing scores, independent of the input dtype.
    home_goals = pd.to_numeric(df["HomeGoals"], errors="coerce").astype("float64")
    away_goals = pd.to_numeric(df["AwayGoals"], errors="coerce").astype("float64")

    for prefix in ("HomeForm", "AwayForm"):
        for stat in ("Goals_For", "Goals_Against", "Points"):
            df[f"{prefix}_{stat}_rolling{window}"] = np.nan

    for team in pd.concat([df["HomeTeam"], df["AwayTeam"]]).dropna().unique():
        at_home = df["HomeTeam"] == team
        involved = at_home | (df["AwayTeam"] == team)
        home_side = at_home[involved]

        scored = home_goals[involved].where(home_side, away_goals[involved])
        conceded = away_goals[involved].where(home_side, home_goals[involved])
        played = scored.notna() & conceded.notna()
        points = pd.Series(
            np.select([scored > conceded, scored == conceded], [3.0, 1.0], default=0.0),
            index=scored.index,
        ).where(played)

        home_rows = home_side.index[home_side.to_numpy()]
        away_rows = home_side.index[~home_side.to_numpy()]

        team_stats = (
            ("Goals_For", scored.where(played)),
            ("Goals_Against", conceded.where(played)),
            ("Points", points),
        )
        for stat, values in team_stats:
            # Mean over played matches, carried across unplayed ones, shifted
            # so that the current match is excluded from its own feature.
            form = values.dropna().rolling(window).mean().reindex(values.index).ffill().shift(1)
            df.loc[home_rows, f"HomeForm_{stat}_rolling{window}"] = form.loc[home_rows]
            df.loc[away_rows, f"AwayForm_{stat}_rolling{window}"] = form.loc[away_rows]
    return df

# === Build datasets ===
import time  # stdlib; imported here so this step is self-contained

# Pause after each season download (two page requests per season). The
# recorded run of this cell ended with "HTTP Error 429: Too Many Requests".
# NOTE(review): tune this against the site's published crawl limits.
SEASON_PAUSE_SECONDS = 15


def _load_seasons(seasons):
    """Build the match dataset of every season in ``seasons`` and stack them."""
    frames = []
    for season in seasons:
        frames.append(build_match_dataset(season))
        time.sleep(SEASON_PAUSE_SECONDS)
    return pd.concat(frames, ignore_index=True)


def _feature_matrix(data, columns):
    """Return ``data[columns]`` coerced to numbers, with missing values set to 0."""
    return data[columns].apply(pd.to_numeric, errors="coerce").fillna(0)


train_data = add_rolling_form_features(_load_seasons(train_seasons), window=5)
test_data = add_rolling_form_features(_load_seasons(test_seasons), window=5)

# Only played matches have a result to learn from / score against.
train_data = train_data[train_data["is_played"]]
test_data = test_data[test_data["is_played"]]

# === Features & Labels ===
# Season stats (Home_/Away_) and rolling form (HomeForm_/AwayForm_). The earlier
# substring test for "Home_"/"Away_" never matched the "HomeForm_..." names,
# so the form features were never used.
feature_prefixes = ("Home_", "Away_", "HomeForm_", "AwayForm_")
feature_cols = list(dict.fromkeys(
    c for c in train_data.columns if str(c).startswith(feature_prefixes)
))

# Missing values are filled in the feature matrix only, not in the whole frame,
# so Date / Score / Result keep their own missing-value markers.
X_train = _feature_matrix(train_data, feature_cols)
X_test = _feature_matrix(test_data, feature_cols)

# XGBClassifier expects integer class labels 0..num_class-1; the previous
# encoding (1, 1.5, 2) is rejected as invalid classes.
y_map = {"HomeWin": 0, "Draw": 1, "AwayWin": 2}
class_codes = list(y_map.values())
class_names = list(y_map.keys())
y_train_enc = train_data["Result"].map(y_map).astype(int)
y_test_enc = test_data["Result"].map(y_map).astype(int)

# === Train XGBoost ===
# use_label_encoder is deprecated in current xgboost releases and is not needed
# now that the labels are already 0..2, so it is no longer passed.
model = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.01,
    objective="multi:softmax",
    num_class=3,
    random_state=42,
    eval_metric="mlogloss",
)
model.fit(X_train, y_train_enc)

# === Evaluation ===
y_pred_enc = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test_enc, y_pred_enc))
# Pass labels explicitly so each name is tied to its own code.
print("\nClassification Report:\n",
      classification_report(y_test_enc, y_pred_enc, labels=class_codes, target_names=class_names))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_enc, y_pred_enc, labels=class_codes))

# === Predict 2025-26 ===
predict_data = add_rolling_form_features(_load_seasons(["2025-26"]), window=5)
# Copy so the prediction column is not written to a slice of predict_data.
future_matches = predict_data[~predict_data["is_played"]].copy()

print("\nUpcoming Predictions (2025-26):")
if future_matches.empty:
    print("No unplayed matches found.")
else:
    X_future = _feature_matrix(future_matches, feature_cols)
    code_to_name = {code: name for name, code in y_map.items()}
    predicted_codes = pd.Series(model.predict(X_future), index=future_matches.index)
    future_matches["Predicted_Result"] = predicted_codes.astype(int).map(code_to_name)
    print(future_matches[["Date", "HomeTeam", "AwayTeam", "Predicted_Result"]].head(25))


HTTPError: HTTP Error 429: Too Many Requests

In [6]:
# === Step 0: Imports ===
import pandas as pd
import numpy as np
import requests
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
import requests
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore")


# === Step 1: Constants ===
# Season labels, newest first; both URL tables below follow this order.
_season_labels = ("2025-26", "2024-25", "2023-24", "2022-23", "2021-22", "2020-21")

# Season label -> FBref "Scores and Fixtures" page.
fixture_urls = dict(zip(_season_labels, (
    "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures",
    "https://fbref.com/en/comps/9/2024-2025/schedule/2024-2025-Premier-League-Scores-and-Fixtures",
    "https://fbref.com/en/comps/9/2023-2024/schedule/2023-2024-Premier-League-Scores-and-Fixtures",
    "https://fbref.com/en/comps/9/2022-2023/schedule/2022-2023-Premier-League-Scores-and-Fixtures",
    "https://fbref.com/en/comps/9/2021-2022/schedule/2021-2022-Premier-League-Scores-and-Fixtures",
    "https://fbref.com/en/comps/9/2020-2021/schedule/2020-2021-Premier-League-Scores-and-Fixtures",
)))

# Season label -> FBref stats page.
stats_urls = dict(zip(_season_labels, (
    "https://fbref.com/en/comps/9/stats/Premier-League-Stats",
    "https://fbref.com/en/comps/9/2024-2025/stats/2024-2025-Premier-League-Stats",
    "https://fbref.com/en/comps/9/2023-2024/stats/2023-2024-Premier-League-Stats",
    "https://fbref.com/en/comps/9/2022-2023/stats/2022-2023-Premier-League-Stats",
    "https://fbref.com/en/comps/9/2021-2022/stats/2021-2022-Premier-League-Stats",
    "https://fbref.com/en/comps/9/2020-2021/stats/2020-2021-Premier-League-Stats",
)))

# Season split: three seasons to fit, one to evaluate, one to forecast.
train_seasons = ["2021-22", "2022-23", "2023-24"]
test_seasons = ["2024-25"]
predict_season = ["2025-26"]

# === Step 2: Utility Functions ===

def split_score_to_goals(score_series):
    """Turn score strings into two nullable-integer goal columns.

    Each value is rendered as text and searched for ``<digits><sep><digits>``
    where the separator is "-", "–" or ":" (whitespace around it allowed).
    The result has the columns ``HomeGoals`` and ``AwayGoals``; rows without a
    match contain ``<NA>``.
    """
    home_away_pattern = r'(?P<HomeGoals>\d+)\s*[-–:]\s*(?P<AwayGoals>\d+)'
    raw_goals = score_series.astype(str).str.extract(home_away_pattern)
    # float first: it can represent the NaN of unmatched rows, Int64 then
    # turns those into <NA>.
    numeric_goals = raw_goals.astype(float)
    return numeric_goals.astype("Int64")

# Request headers passed to requests.get() by the loader functions below.
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
headers["Referer"] = "https://www.google.com/"
headers["Accept-Language"] = "en-US,en;q=0.9"

def load_fixtures_for_season(season):
    """Download a season's fixture table and add goals, result and played flag.

    The page is requested with the module-level ``headers``.
    NOTE(review): the recorded run of this cell was still answered with
    "403 Client Error: Forbidden", so these headers alone may not be enough.

    Args:
        season: Season label that is a key of ``fixture_urls``.

    Returns:
        DataFrame with the columns Season, Date, HomeTeam, AwayTeam, Score,
        HomeGoals, AwayGoals, Result ("HomeWin"/"AwayWin"/"Draw", or ``pd.NA``
        when no score could be parsed) and is_played.

    Raises:
        KeyError: if ``season`` is not a key of ``fixture_urls``.
        requests.HTTPError: if the server answers with an error status.
    """
    from io import StringIO

    url = fixture_urls[season]
    # Always bound the request; without a timeout a stalled connection hangs
    # the notebook indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # read_html wants a file-like object: passing the markup as a plain string
    # is deprecated. The BeautifulSoup round trip added nothing and was dropped.
    tables = pd.read_html(StringIO(response.text))

    # The first table on the page is used as the fixture list.
    df = tables[0].rename(columns={"Home": "HomeTeam", "Away": "AwayTeam"})

    # Rows without a home or away team are not matches; kept, they would count
    # as unplayed fixtures and receive predictions.
    # NOTE(review): presumably blank separator rows of the source table - confirm.
    df = df.dropna(subset=["HomeTeam", "AwayTeam"]).copy()

    df["Season"] = season
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    goals = split_score_to_goals(df["Score"])
    df = pd.concat([df, goals], axis=1)

    home_goals, away_goals = df["HomeGoals"], df["AwayGoals"]
    df["is_played"] = home_goals.notna() & away_goals.notna()
    df["Result"] = pd.NA

    # Only played matches get an outcome label.
    played = df["is_played"]
    df.loc[played & (home_goals > away_goals), "Result"] = "HomeWin"
    df.loc[played & (away_goals > home_goals), "Result"] = "AwayWin"
    df.loc[played & (home_goals == away_goals), "Result"] = "Draw"

    keep = ["Season", "Date", "HomeTeam", "AwayTeam", "Score",
            "HomeGoals", "AwayGoals", "Result", "is_played"]
    return df[keep].reset_index(drop=True)

def load_stats_for_season(season):
    """Download a season's squad stats table and keep the modelling columns.

    The page is requested with the module-level ``headers``.

    Args:
        season: Season label that is a key of ``stats_urls``.

    Returns:
        DataFrame with one row per squad holding whichever of Squad, Poss,
        Gls, Ast, G+A, Gls/90, Ast/90, G+A/90, xG, npxG, xAG, npxG+xAG, PrgC
        and PrgP exist in the downloaded table.

    Raises:
        KeyError: if ``season`` is not a key of ``stats_urls``.
        requests.HTTPError: if the server answers with an error status.
    """
    from io import StringIO

    url = stats_urls[season]
    # Bound the request so a stalled connection cannot hang the notebook.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    # File-like input: passing literal markup to read_html is deprecated.
    tables = pd.read_html(StringIO(response.text))

    base = tables[0].copy()

    # Flatten multi-row headers to their bottom-level name. Columns under a
    # "per 90" group are suffixed with "/90" so they stay distinct from the
    # season totals and can match names such as "Gls/90" below.
    # NOTE(review): assumes the group header contains "Per 90" - confirm
    # against the live table.
    flat_names = []
    for col in base.columns:
        if isinstance(col, tuple):
            name = col[-1]
            per90 = any("per 90" in str(level).lower() for level in col[:-1])
            if per90 and not str(name).endswith("/90"):
                name = f"{name}/90"
        else:
            name = col
        flat_names.append(name)
    base.columns = flat_names

    # One column per name: keep the first occurrence of any repeated name.
    base = base.loc[:, ~base.columns.duplicated()]
    base = base.loc[:, ~base.columns.astype(str).str.contains("Unnamed")]

    keep_cols = [
        "Squad", "Poss", "Gls", "Ast", "G+A", "Gls/90", "Ast/90", "G+A/90",
        "xG", "npxG", "xAG", "npxG+xAG", "PrgC", "PrgP"
    ]
    base = base[[c for c in keep_cols if c in base.columns]]
    return base.reset_index(drop=True)

def build_match_dataset(season):
    """Attach home-team and away-team season stats to a season's fixtures.

    Each stats column appears once as ``Home_<stat>`` and once as
    ``Away_<stat>``. The stats table is prefixed before merging; the earlier
    rename-after-merge approach also renamed the existing ``Home_*`` columns
    on the second pass, producing ``Away_Home_*``.

    Args:
        season: Season label passed to ``load_fixtures_for_season`` and
            ``load_stats_for_season``.

    Returns:
        The fixtures DataFrame with the prefixed stat columns added. A team
        without a row in the stats table gets NaN stats (left join).
    """
    fixtures = load_fixtures_for_season(season)
    stats = load_stats_for_season(season)

    merged = fixtures
    for prefix, team_col in (("Home_", "HomeTeam"), ("Away_", "AwayTeam")):
        squad_col = f"{prefix}Squad"
        merged = merged.merge(
            stats.add_prefix(prefix),
            left_on=team_col,
            right_on=squad_col,
            how="left",
        )
        # The joined squad name repeats the team column.
        merged = merged.drop(columns=[squad_col])

    return merged

def add_rolling_form_features(df, window=5):
    """Add recent-form features for the home and the away team of each match.

    A team's form at a fixture is the mean of goals scored, goals conceded and
    points (3 win / 1 draw / 0 loss) over its last ``window`` played matches
    before that fixture, regardless of venue. The value is NaN until the team
    has ``window`` earlier played matches.

    Fixes over the previous version:
      * ``HomeForm_*`` is filled only on rows where the team is the home side
        and ``AwayForm_*`` only where it is the away side. Previously each
        team wrote its form to all of its matches, so the columns could hold
        the opponent's numbers.
      * Fixtures without a score are excluded from the rolling window (they
        used to count as 0-point matches) and receive the team's latest
        available form.

    Args:
        df: DataFrame with Date, HomeTeam, AwayTeam, HomeGoals, AwayGoals.
        window: Number of earlier played matches to average.

    Returns:
        A date-sorted copy of ``df`` (index reset) with six float columns
        ``{HomeForm|AwayForm}_{Goals_For|Goals_Against|Points}_rolling{window}``.
    """
    df = df.sort_values("Date", kind="stable").reset_index(drop=True)

    # Floats with NaN for missing scores, independent of the input dtype.
    home_goals = pd.to_numeric(df["HomeGoals"], errors="coerce").astype("float64")
    away_goals = pd.to_numeric(df["AwayGoals"], errors="coerce").astype("float64")

    stat_names = ("Goals_For", "Goals_Against", "Points")
    for prefix in ("HomeForm", "AwayForm"):
        for stat in stat_names:
            df[f"{prefix}_{stat}_rolling{window}"] = np.nan

    all_teams = pd.concat([df["HomeTeam"], df["AwayTeam"]]).dropna().unique()
    for team in all_teams:
        at_home = df["HomeTeam"] == team
        involved = at_home | (df["AwayTeam"] == team)
        home_side = at_home[involved]

        # Goals from this team's point of view, in date order.
        scored = home_goals[involved].where(home_side, away_goals[involved])
        conceded = away_goals[involved].where(home_side, home_goals[involved])
        played = scored.notna() & conceded.notna()
        points = pd.Series(
            np.select([scored > conceded, scored == conceded], [3.0, 1.0], default=0.0),
            index=scored.index,
        ).where(played)

        history = {
            "Goals_For": scored.where(played),
            "Goals_Against": conceded.where(played),
            "Points": points,
        }
        home_rows = home_side.index[home_side.to_numpy()]
        away_rows = home_side.index[~home_side.to_numpy()]

        for stat in stat_names:
            values = history[stat]
            # Rolling mean over played matches only, carried forward across
            # unplayed fixtures, then shifted so a match never sees itself.
            rolled = values.dropna().rolling(window).mean()
            form = rolled.reindex(values.index).ffill().shift(1)
            df.loc[home_rows, f"HomeForm_{stat}_rolling{window}"] = form.loc[home_rows]
            df.loc[away_rows, f"AwayForm_{stat}_rolling{window}"] = form.loc[away_rows]

    return df

# === Step 3: Build datasets ===
import time  # stdlib; imported here so this step is self-contained

# Pause after each season download (two page requests per season). A recorded
# run of this notebook was answered with "429 Too Many Requests".
# NOTE(review): tune this against the site's published crawl limits.
SEASON_PAUSE_SECONDS = 15


def _load_seasons(seasons):
    """Build the match dataset of every season in ``seasons`` and stack them."""
    frames = []
    for season in seasons:
        frames.append(build_match_dataset(season))
        time.sleep(SEASON_PAUSE_SECONDS)
    return pd.concat(frames, ignore_index=True)


def _feature_matrix(data, columns):
    """Return ``data[columns]`` coerced to numbers, with missing values set to 0.

    Missing values occur for the first matches of a team (no form history
    yet) and for teams without a row in the stats table.
    """
    return data[columns].apply(pd.to_numeric, errors="coerce").fillna(0)


train_data = _load_seasons(train_seasons)
test_data = _load_seasons(test_seasons)

train_data = add_rolling_form_features(train_data, window=5)
test_data = add_rolling_form_features(test_data, window=5)

# Only played matches have a result to learn from / score against.
train_data = train_data[train_data["is_played"]]
test_data = test_data[test_data["is_played"]]

# Season stats (Home_/Away_) and rolling form (HomeForm_/AwayForm_). The earlier
# substring test for "Home_"/"Away_" never matched the "HomeForm_..." names,
# so the form features computed above were never used.
feature_prefixes = ("Home_", "Away_", "HomeForm_", "AwayForm_")
feature_cols = list(dict.fromkeys(
    c for c in train_data.columns if str(c).startswith(feature_prefixes)
))

X_train = _feature_matrix(train_data, feature_cols)
y_train = train_data["Result"].astype(str)

X_test = _feature_matrix(test_data, feature_cols)
y_test = test_data["Result"].astype(str)

# === Step 4: Train Model ===

rf = RandomForestClassifier(n_estimators=500, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("\nClassification Report: \n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# === Step 5: Predict Future Matches (2025-26) ===

predict_data = _load_seasons(["2025-26"])
predict_data = add_rolling_form_features(predict_data, window=5)
# Copy so the prediction column is not written to a slice of predict_data.
future_matches = predict_data[~predict_data["is_played"]].copy()

print("\nUpcoming Predictions (2025-26):")
if future_matches.empty:
    # predict() rejects an empty feature matrix, e.g. once the season is over.
    print("No unplayed matches found.")
else:
    X_future = _feature_matrix(future_matches, feature_cols)
    future_matches["Predicted_Result"] = rf.predict(X_future)
    print(future_matches[["Date", "HomeTeam", "AwayTeam", "Predicted_Result"]].head(25))


HTTPError: 403 Client Error: Forbidden for url: https://fbref.com/en/comps/9/2021-2022/schedule/2021-2022-Premier-League-Scores-and-Fixtures