In [142]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

In [None]:
# Generating a season summary using match data
def summarise_season(matches):
    """Build a league table from the match rows of one season.

    Every row is read as one match in which ``team`` scored ``GF`` goals and
    ``opponent`` scored ``GA`` goals. A win earns 3 points, a draw earns
    1 point for each side.

    NOTE(review): if the input lists every match once per side, each match
    is counted twice here (ranking is unaffected, totals are doubled) --
    confirm the expected input layout.

    Args:
        matches: DataFrame with ``team``, ``opponent``, ``GF`` and ``GA``
            columns. Rows missing any of these values are ignored. The
            frame itself is not modified.

    Returns:
        DataFrame with one row per team and the columns ``team``,
        ``points``, ``wins``, ``draws``, ``losses``, ``goals_for``,
        ``goals_against``, ``goal_diff`` and ``position``. Rows are sorted
        by points, goal difference and goals scored (all descending) and
        ``position`` is the resulting 1-based rank. When no usable row
        exists, an empty frame with the same columns is returned.
    """
    stat_keys = ("points", "wins", "draws", "losses", "goals_for", "goals_against")
    output_columns = ["team", *stat_keys, "goal_diff"]
    teams = defaultdict(lambda: dict.fromkeys(stat_keys, 0))

    matches_clean = matches.dropna(subset=['team', 'opponent', 'GF', 'GA'])
    matches_clean = matches_clean.astype({'GF': 'int64', 'GA': 'int64'})

    # Iterating the columns directly avoids building a Series per row.
    for home, away, gf, ga in zip(
        matches_clean["team"], matches_clean["opponent"],
        matches_clean["GF"], matches_clean["GA"],
    ):
        home_stats, away_stats = teams[home], teams[away]

        home_stats["goals_for"] += gf
        home_stats["goals_against"] += ga
        away_stats["goals_for"] += ga
        away_stats["goals_against"] += gf

        if gf == ga:
            for side in (home_stats, away_stats):
                side["points"] += 1
                side["draws"] += 1
            continue

        if gf > ga:
            winner, loser = home_stats, away_stats
        else:
            winner, loser = away_stats, home_stats
        winner["points"] += 3
        winner["wins"] += 1
        loser["losses"] += 1

    if not teams:
        # Without this guard an empty input produces a column-less frame
        # and the sort below fails with a KeyError.
        return pd.DataFrame(columns=[*output_columns, "position"])

    data = [
        {"team": team, **stats, "goal_diff": stats["goals_for"] - stats["goals_against"]}
        for team, stats in teams.items()
    ]

    summary = pd.DataFrame(data, columns=output_columns)
    summary = summary.sort_values(
        ["points", "goal_diff", "goals_for"], ascending=[False, False, False]
    ).reset_index(drop=True)
    summary["position"] = summary.index + 1
    return summary

# Preparing training data
def prepare_training_data(df, fixtures):
    """Turn raw match rows into season-over-season training data.

    Matches are bucketed into season windows that run from 1 July of one
    calendar year found in ``date`` up to (but excluding) 1 July of the next
    calendar year found in ``date``. Each non-empty window is summarised with
    ``summarise_season``. For every pair of consecutive seasons, each team of
    the later season contributes one training row: its stats in the earlier
    season as features and its position in the later season as target. A
    team missing from the earlier season receives the mean stats of that
    season's three lowest-points teams instead.

    NOTE(review): matches dated before 1 July of the earliest year, or on or
    after 1 July of the latest year, fall outside every window and are
    ignored -- confirm this is intended for partially played seasons.

    Args:
        df: Match rows with a ``date`` column (parsed day-first; values that
            cannot be parsed become NaT and match no window) plus the
            columns ``summarise_season`` reads. The frame is not modified.
        fixtures: DataFrame whose ``team`` column names the teams to build
            prediction features for. Missing values are skipped.

    Returns:
        Tuple ``(X_train, y_train, latest_features_df)``: the training
        feature frame, the matching Series of target positions, and a
        feature frame indexed by fixture team built from the most recent
        season.

    Raises:
        ValueError: If no season window contains any dated match.
    """
    parsed_dates = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)
    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    matches = df.assign(date=parsed_dates)

    years = sorted(matches['date'].dt.year.dropna().unique().astype(int))
    season_starts = [pd.to_datetime(f"{y}-07-01") for y in years]

    season_summaries = {}
    for start_date, end_date in zip(season_starts, season_starts[1:]):
        in_window = (matches['date'] >= start_date) & (matches['date'] < end_date)
        season_data = matches[in_window]
        if not season_data.empty:
            label = f"{start_date.year}/{start_date.year+1}"
            season_summaries[label] = summarise_season(season_data)

    if not season_summaries:
        raise ValueError(
            "No season could be built from the training data: no 1 July to "
            "1 July window contains a dated match."
        )

    summaries = list(season_summaries.values())

    feature_rows = []
    target_rows = []
    for previous, current in zip(summaries, summaries[1:]):
        prev_summary = previous.set_index("team")
        curr_summary = current.set_index("team")
        feature_rows.extend(_features_for_teams(curr_summary.index, prev_summary))
        target_rows.extend(curr_summary["position"])

    X_train = pd.DataFrame(feature_rows)
    y_train = pd.Series(target_rows)

    last_season_summary = summaries[-1].set_index("team")
    # A missing team name would otherwise turn into a NaN "team" row.
    fixture_teams = list(fixtures['team'].dropna().unique())
    latest_features_df = pd.DataFrame(
        _features_for_teams(fixture_teams, last_season_summary),
        index=fixture_teams,
    )

    return X_train, y_train, latest_features_df


# Season-summary columns used as model features.
_FEATURE_COLUMNS = [
    "points", "wins", "draws", "losses", "goals_for", "goals_against", "goal_diff",
]


def _features_for_teams(teams, summary):
    """Return one feature dict per team, read from a team-indexed summary.

    Teams absent from ``summary`` get the mean stats of its three
    lowest-points teams.
    """
    bottom_three = summary.sort_values('points').head(3)
    fallback = bottom_three.mean(numeric_only=True).to_dict()

    features = []
    for team in teams:
        if team in summary.index:
            features.append(summary.loc[team, _FEATURE_COLUMNS].to_dict())
        else:
            features.append({k: fallback[k] for k in _FEATURE_COLUMNS})
    return features

# Building and training a random forest classifier on the data 
def build_and_train_model(X, y):
    """Fit and return a scaler + random-forest pipeline.

    Args:
        X: Training features, passed unchanged to ``Pipeline.fit``.
        y: Training targets, passed unchanged to ``Pipeline.fit``.

    Returns:
        The fitted ``Pipeline`` with the steps ``"scaler"``
        (``StandardScaler``) and ``"rf"`` (``RandomForestClassifier``).
    """
    scaler = StandardScaler()
    # Fixed random_state keeps the trained forest reproducible across runs.
    forest = RandomForestClassifier(n_estimators=500, max_depth=8, random_state=42)
    steps = [("scaler", scaler), ("rf", forest)]
    # Pipeline.fit returns the fitted pipeline itself.
    return Pipeline(steps).fit(X, y)

# Predicting the league table
def predict_league_table(model, features):
    """Rank teams by their probability-weighted expected position.

    Args:
        model: Fitted pipeline exposing ``predict_proba`` and a
            ``named_steps["rf"]`` step with a ``classes_`` attribute.
        features: Feature frame whose index holds the team names.

    Returns:
        DataFrame with the columns ``predicted_rank``, ``team`` and
        ``expected_position``, sorted by ascending expected position;
        ``predicted_rank`` counts up from 1 in that order.
    """
    class_probabilities = model.predict_proba(features)
    position_labels = model.named_steps["rf"].classes_
    # Expected value: each class label weighted by its predicted probability.
    expected = class_probabilities.dot(position_labels)

    table = (
        pd.DataFrame({"team": features.index, "expected_position": expected})
        .sort_values("expected_position")
        .reset_index(drop=True)
    )
    n_teams = len(table)
    table["predicted_rank"] = np.arange(1, n_teams + 1)

    column_order = ["predicted_rank", "team", "expected_position"]
    return table[column_order]

# Running the prediction pipeline
def run_prediction_pipeline(train_path, fixtures_path,
                            output_path="bundesliga_2025_26_prediction.csv"):
    """Train on historical matches, predict the table, then save and print it.

    Args:
        train_path: CSV file with the historical match rows.
        fixtures_path: CSV file whose ``team`` column lists the teams to
            predict.
        output_path: Destination CSV for the predicted table. Defaults to
            the file name this pipeline has always written.

    Returns:
        None. The predicted table is written to ``output_path`` and printed.
        Any exception raised along the way is caught and reported as a
        printed message instead of propagating.
    """
    try:
        train_df = pd.read_csv(train_path)
        fixtures_df = pd.read_csv(fixtures_path)

        X_train, y_train, latest_features = prepare_training_data(train_df, fixtures_df)

        model = build_and_train_model(X_train, y_train)

        # predict_league_table already returns the rows sorted with
        # predicted_rank numbered 1..n, so no re-numbering is needed here.
        predictions = predict_league_table(model, latest_features)

        predictions.to_csv(output_path, index=False)

        print("Predicted Bundesliga 2025/26 Table:")
        print(predictions)

        print(f"\nFinal results saved to {output_path}")

    except Exception as e:
        # Top-level boundary of the script: report the failure rather than
        # surfacing a traceback.
        print(f"An error occurred: {e}")

# Entry point of the cell: train on the historical data and predict the table
# for the teams listed in the fixtures file.
run_prediction_pipeline(
    train_path="fixture_training_data.csv",
    fixtures_path="2526_team-fixtures.csv",
)

Predicted Bundesliga 2025/26 Table:
    predicted_rank                 team  expected_position
0                1        Bayern Munich           1.858118
1                2           Leverkusen           5.788379
2                3             Mainz 05           6.388993
3                4             Dortmund           6.666480
4                5            Stuttgart           7.000014
5                6  Eintracht Frankfurt           7.009869
6                7           RB Leipzig           7.736897
7                8        Werder Bremen           9.473444
8                9             Freiburg           9.553041
9               10             Augsburg          10.862315
10              11            Wolfsburg          10.880230
11              12         Union Berlin          10.888565
12              13             Gladbach          10.916562
13              14             St Pauli          11.909824
14              15           Hoffenheim          11.940395
15              16  