## ðŸ“ˆ Predicting Premier League Final Positions Using Betting Odds & Simulation

**Competition:** English Premier League 2025/26  
**Purpose:** Estimate probabilities of final league positions using betting market information and simulation  
**Methods:** Odds-implied probabilities, Monte Carlo simulation, scenario analysis  
**Author:** [Victoria Friss de Kereki](https://www.linkedin.com/in/victoria-friss-de-kereki/)  

---

**Notebook first written:** `17/01/2026`  
**Last updated:** `17/01/2026`  

> This notebook develops a probabilistic framework to predict final Premier League final positions using betting odds as market-based expectations.
>
> Betting odds are transformed into implied probabilities and adjusted for bookmaker margin. These probabilities are then used to simulate the remainder of the season via Monte Carlo methods, generating distributions over final points totals and league positions.
>
> The analysis focuses on estimating the likelihood of key outcomes such as title wins, top-four finishes, relegation, and mid-table placements. Results are presented at team level with uncertainty intervals, and the framework can be extended to incorporate form, fixture difficulty, or alternative predictive inputs beyond betting markets.


In [1]:
import soccerdata as sd

## 1. Premier League Final Standings (ESPN Scraping)
##### Using the ESPN scraper I built in my previous project.

In [2]:
import pandas as pd

year = 2025  # current Premier League season start year

url = f"https://www.espn.com/soccer/standings/_/league/ENG.1/season/{year}"
tables = pd.read_html(url)

teams_raw = tables[0]
stats = tables[1]

teams = pd.DataFrame()
teams["position"] = teams_raw.iloc[:, 0].str.extract(r"^(\d+)").astype(int)
teams["team"] = (
    teams_raw.iloc[:, 0]
    .str.replace(r"^\d+", "", regex=True)
    .str.replace(r"^[A-Z]{2,3}", "", regex=True)
    .str.strip()
)

stats.columns = ["gp", "w", "d", "l", "gf", "ga", "gd", "pts"]
stats = stats.apply(lambda c: c.astype(str)
                              .str.replace("+", "", regex=False)
                              .astype(int))

premierleague = pd.concat([teams, stats], axis=1)
# premierleague["season"] = f"{year}-{year+1}"

premierleague


Unnamed: 0,position,team,gp,w,d,l,gf,ga,gd,pts
0,1,Arsenal,22,15,5,2,40,14,26,50
1,2,Manchester City,22,13,4,5,45,21,24,43
2,3,Aston Villa,21,13,4,4,33,24,9,43
3,4,Liverpool,22,10,6,6,33,29,4,36
4,5,Manchester United,22,9,8,5,38,32,6,35
5,6,Chelsea,22,9,7,6,36,24,12,34
6,7,Brentford,22,10,3,9,35,30,5,33
7,8,Newcastle United,22,9,6,7,32,27,5,33
8,9,Sunderland,22,8,9,5,23,23,0,33
9,10,Fulham,22,9,4,9,30,31,-1,31


## 2. Get betting odds using API

In [3]:
from dotenv import load_dotenv
import os

# Load variables from API_KEY.env
load_dotenv("API_KEY.env")

API_KEY = os.getenv("ODDS_DATA_API_KEY")

if API_KEY is None:
    raise ValueError("API_KEY not found. Check API_KEY.env")

print("API key loaded successfully")

API key loaded successfully


In [4]:
import requests
import pandas as pd
from datetime import datetime

url = "https://api.the-odds-api.com/v4/sports/soccer_epl/odds"

params = {
    "apiKey": API_KEY,
    "regions": "uk",
    "markets": "h2h",
    "oddsFormat": "decimal",
    "dateFormat": "iso",
    "days": 365  # get all upcoming matches for the next year
}

response = requests.get(url, params=params)
response.raise_for_status()

odds_data = response.json()
print("Total upcoming matches:", len(odds_data))

Total upcoming matches: 23


In [5]:
import pandas as pd

def flatten_odds(data):
    rows = []

    for match in data:
        match_id = match["id"]
        home = match["home_team"]
        away = match["away_team"]
        time = match["commence_time"]

        for book in match["bookmakers"]:
            bookmaker = book["title"]

            # Find h2h market
            h2h = next((m for m in book["markets"] if m["key"] == "h2h"), None)
            if not h2h:
                continue

            outcomes = {o["name"]: o["price"] for o in h2h["outcomes"]}

            rows.append({
                "match_id": match_id,
                "commence_time": time,
                "home_team": home,
                "away_team": away,
                "bookmaker": bookmaker,
                "home_odds": outcomes.get(home),
                "draw_odds": outcomes.get("Draw"),
                "away_odds": outcomes.get(away),
            })

    return pd.DataFrame(rows)

df = flatten_odds(odds_data)
df.head()

Unnamed: 0,match_id,commence_time,home_team,away_team,bookmaker,home_odds,draw_odds,away_odds
0,e15eb2362921b16a6b6a0397ce607a11,2026-01-18T14:00:00Z,Wolverhampton Wanderers,Newcastle United,Smarkets,54.95,1.05,18.48
1,e15eb2362921b16a6b6a0397ce607a11,2026-01-18T14:00:00Z,Wolverhampton Wanderers,Newcastle United,Paddy Power,34.0,1.02,34.0
2,e15eb2362921b16a6b6a0397ce607a11,2026-01-18T14:00:00Z,Wolverhampton Wanderers,Newcastle United,Sky Bet,41.0,1.04,16.0
3,e15eb2362921b16a6b6a0397ce607a11,2026-01-18T14:00:00Z,Wolverhampton Wanderers,Newcastle United,Betway,23.0,1.04,15.0
4,e15eb2362921b16a6b6a0397ce607a11,2026-01-18T14:00:00Z,Wolverhampton Wanderers,Newcastle United,888sport,23.0,1.06,13.0


In [6]:
betting_odds_avg = (
    df.groupby(["match_id", "home_team", "away_team"])
      .agg({
          "home_odds": "mean",
          "draw_odds": "mean",
          "away_odds": "mean"
      })
      .reset_index()
)

betting_odds_avg.head()

Unnamed: 0,match_id,home_team,away_team,home_odds,draw_odds,away_odds
0,1ca6d3d9cde3e58a39211feb9188530c,Newcastle United,Aston Villa,2.017647,3.611765,3.432353
1,1e811fa7ead0a3e6ef920b15b2bbb95d,Burnley,Tottenham Hotspur,3.794737,3.418421,1.965263
2,342788786c22e570ed2da53a9608113f,Brighton and Hove Albion,Bournemouth,1.852105,3.905263,3.797368
3,36820753efb36739a83c6e5e440827b2,Brighton and Hove Albion,Everton,1.80375,3.11375,3.4925
4,38a3cb5e295f55e274d589fc646cf2dd,Tottenham Hotspur,Manchester City,4.61,3.585,1.6


In [7]:
# 1) Convert odds -> raw probabilities
betting_odds_avg["p_home_raw"] = 1 / betting_odds_avg["home_odds"]
betting_odds_avg["p_draw_raw"] = 1 / betting_odds_avg["draw_odds"]
betting_odds_avg["p_away_raw"] = 1 / betting_odds_avg["away_odds"]

# 2) Normalize (remove bookmaker margin)
betting_odds_avg["total_raw"] = (
    betting_odds_avg["p_home_raw"] +
    betting_odds_avg["p_draw_raw"] +
    betting_odds_avg["p_away_raw"]
)

betting_odds_avg["p_home_book"] = betting_odds_avg["p_home_raw"] / betting_odds_avg["total_raw"]
betting_odds_avg["p_draw_book"] = betting_odds_avg["p_draw_raw"] / betting_odds_avg["total_raw"]
betting_odds_avg["p_away_book"] = betting_odds_avg["p_away_raw"] / betting_odds_avg["total_raw"]

# 3) Keep only useful columns
betting_odds_avg = betting_odds_avg[[
#match_id",
    "home_team",
    "away_team",
    "p_home_book",
    "p_draw_book",
    "p_away_book",
    "home_odds",
    "draw_odds",
    "away_odds"
]]

betting_odds_avg.head()

Unnamed: 0,home_team,away_team,p_home_book,p_draw_book,p_away_book,home_odds,draw_odds,away_odds
0,Newcastle United,Aston Villa,0.465882,0.260257,0.273861,2.017647,3.611765,3.432353
1,Burnley,Tottenham Hotspur,0.247464,0.274706,0.47783,3.794737,3.418421,1.965263
2,Brighton and Hove Albion,Bournemouth,0.509686,0.241723,0.248591,1.852105,3.905263,3.797368
3,Brighton and Hove Albion,Everton,0.477156,0.27641,0.246434,1.80375,3.11375,3.4925
4,Tottenham Hotspur,Manchester City,0.19353,0.248863,0.557608,4.61,3.585,1.6


## 3. Get fixtures.

In [8]:
# Load variables from API_KEY.env
load_dotenv("API_KEY.env")

API_KEY = os.getenv("FOOTBALL_DATA_API_KEY")

if API_KEY is None:
    raise ValueError("API_KEY not found. Check API_KEY.env")

print("API key loaded successfully")

API key loaded successfully


In [9]:
import requests
import pandas as pd
from datetime import datetime, timedelta

url = "https://api.football-data.org/v4/competitions/PL/matches"

headers = {
    "X-Auth-Token": API_KEY
}

today = datetime.utcnow().date()
end_of_season = today + timedelta(days=365)  # big range to cover all remaining games

params = {
    "status": "SCHEDULED",
    "dateFrom": today.isoformat(),
    "dateTo": end_of_season.isoformat()
}

response = requests.get(url, headers=headers, params=params)
response.raise_for_status()

data = response.json()
fixtures = data["matches"]

df_fixtures = pd.DataFrame(fixtures)

df_fixtures_clean = df_fixtures[[
    "utcDate",
    "status",
    "homeTeam",
    "awayTeam"
]]

df_fixtures_clean.head()
print("Total scheduled matches:", len(df_fixtures_clean))


Total scheduled matches: 162


In [10]:
df_fixtures_clean["homeTeam"] = df_fixtures_clean["homeTeam"].apply(lambda x: x["name"])
df_fixtures_clean["awayTeam"] = df_fixtures_clean["awayTeam"].apply(lambda x: x["name"])

In [11]:
df_fixtures_clean

Unnamed: 0,utcDate,status,homeTeam,awayTeam
0,2026-01-18T16:30:00Z,TIMED,Aston Villa FC,Everton FC
1,2026-01-19T20:00:00Z,TIMED,Brighton & Hove Albion FC,AFC Bournemouth
2,2026-01-24T12:30:00Z,TIMED,West Ham United FC,Sunderland AFC
3,2026-01-24T15:00:00Z,TIMED,Burnley FC,Tottenham Hotspur FC
4,2026-01-24T15:00:00Z,TIMED,Fulham FC,Brighton & Hove Albion FC
...,...,...,...,...
157,2026-05-24T15:00:00Z,TIMED,Liverpool FC,Brentford FC
158,2026-05-24T15:00:00Z,TIMED,Manchester City FC,Aston Villa FC
159,2026-05-24T15:00:00Z,TIMED,Nottingham Forest FC,AFC Bournemouth
160,2026-05-24T15:00:00Z,TIMED,Tottenham Hotspur FC,Everton FC


## Get this season results (2025/26)

In [12]:
url = "https://api.football-data.org/v4/competitions/PL/matches"
params = {
    "season": 2025,   # season year
    "status": "FINISHED"
}

response = requests.get(url, headers=headers, params=params)
response.raise_for_status()
past_matches = response.json()["matches"]

In [13]:
clean_rows = []

for m in past_matches:
    row = {
        "utcDate": m["utcDate"],
        "matchday": m["matchday"],
        "status": m["status"],
        "homeTeam": m["homeTeam"]["name"],
        "awayTeam": m["awayTeam"]["name"],
        "homeGoals": m["score"]["fullTime"]["home"],
        "awayGoals": m["score"]["fullTime"]["away"],
        "winner": m["score"]["winner"]
    }
    clean_rows.append(row)

past_matches_clean = pd.DataFrame(clean_rows)
past_matches_clean.head()

Unnamed: 0,utcDate,matchday,status,homeTeam,awayTeam,homeGoals,awayGoals,winner
0,2025-08-15T19:00:00Z,1,FINISHED,Liverpool FC,AFC Bournemouth,4,2,HOME_TEAM
1,2025-08-16T11:30:00Z,1,FINISHED,Aston Villa FC,Newcastle United FC,0,0,DRAW
2,2025-08-16T14:00:00Z,1,FINISHED,Brighton & Hove Albion FC,Fulham FC,1,1,DRAW
3,2025-08-16T14:00:00Z,1,FINISHED,Sunderland AFC,West Ham United FC,3,0,HOME_TEAM
4,2025-08-16T14:00:00Z,1,FINISHED,Tottenham Hotspur FC,Burnley FC,3,0,HOME_TEAM


## Get past season results (2024/25)

In [14]:
url = "https://api.football-data.org/v4/competitions/PL/matches"
params = {
    "season": 2024,   # season year
    "status": "FINISHED"
}

response = requests.get(url, headers=headers, params=params)
response.raise_for_status()
past_matches_24 = response.json()["matches"]

In [15]:
clean_rows = []

for m in past_matches_24:
    row = {
        "utcDate": m["utcDate"],
        "matchday": m["matchday"],
        "status": m["status"],
        "homeTeam": m["homeTeam"]["name"],
        "awayTeam": m["awayTeam"]["name"],
        "homeGoals": m["score"]["fullTime"]["home"],
        "awayGoals": m["score"]["fullTime"]["away"],
        "winner": m["score"]["winner"]
    }
    clean_rows.append(row)

past_matches_24_clean = pd.DataFrame(clean_rows)
past_matches_24_clean.head()

Unnamed: 0,utcDate,matchday,status,homeTeam,awayTeam,homeGoals,awayGoals,winner
0,2024-08-16T19:00:00Z,1,FINISHED,Manchester United FC,Fulham FC,1,0,HOME_TEAM
1,2024-08-17T11:30:00Z,1,FINISHED,Ipswich Town FC,Liverpool FC,0,2,AWAY_TEAM
2,2024-08-17T14:00:00Z,1,FINISHED,Arsenal FC,Wolverhampton Wanderers FC,2,0,HOME_TEAM
3,2024-08-17T14:00:00Z,1,FINISHED,Everton FC,Brighton & Hove Albion FC,0,3,AWAY_TEAM
4,2024-08-17T14:00:00Z,1,FINISHED,Newcastle United FC,Southampton FC,1,0,HOME_TEAM


## Combine and predict

In [16]:
import pandas as pd
import numpy as np
from scipy.stats import poisson

# ----------------------------
# 1. Load your dataframes
# ----------------------------
df_current = past_matches_clean
df_prev = past_matches_24_clean
df_future = df_fixtures_clean


# ----------------------------
# 2. Combine and weight games
# ----------------------------
df_all = pd.concat([df_prev, df_current], ignore_index=True)

# Add weights: more recent games = more weight
df_all["date"] = pd.to_datetime(df_all["utcDate"])
df_all["weight"] = np.linspace(1, 2, len(df_all))  # simple linear weighting


# ----------------------------
# 3. Compute home advantage
# ----------------------------
# Home advantage = average home goals - average away goals
home_avg = df_all["homeGoals"].mean()
away_avg = df_all["awayGoals"].mean()
home_advantage = home_avg - away_avg


# ----------------------------
# 4. Calculate attack & defense strengths
# ----------------------------
teams = pd.unique(df_all[["homeTeam", "awayTeam"]].values.ravel("K"))

attack = pd.Series(1.0, index=teams)
defense = pd.Series(1.0, index=teams)

# Initialize with goals per match
team_stats = {}

for team in teams:
    home_games = df_all[df_all["homeTeam"] == team]
    away_games = df_all[df_all["awayTeam"] == team]

    goals_scored = (home_games["homeGoals"] * home_games["weight"]).sum() + \
                   (away_games["awayGoals"] * away_games["weight"]).sum()

    goals_against = (home_games["awayGoals"] * home_games["weight"]).sum() + \
                    (away_games["homeGoals"] * away_games["weight"]).sum()

    matches = home_games["weight"].sum() + away_games["weight"].sum()

    team_stats[team] = {
        "scored": goals_scored / matches,
        "against": goals_against / matches
    }

# Strengths = relative to league average
league_avg_scored = df_all["homeGoals"].mean() + df_all["awayGoals"].mean()
league_avg_scored /= 2

for team in teams:
    attack[team] = team_stats[team]["scored"] / league_avg_scored
    defense[team] = team_stats[team]["against"] / league_avg_scored


# ----------------------------
# 5. Predict probabilities for each future match
# ----------------------------
def match_probabilities(home, away):
    # expected goals
    exp_home = np.exp(np.log(league_avg_scored) + np.log(attack[home]) + np.log(defense[away]) + home_advantage)
    exp_away = np.exp(np.log(league_avg_scored) + np.log(attack[away]) + np.log(defense[home]))

    # compute probabilities up to 6 goals
    max_goals = 6
    p_home = poisson.pmf(range(max_goals + 1), exp_home)
    p_away = poisson.pmf(range(max_goals + 1), exp_away)

    # result probabilities
    p_win = 0
    p_draw = 0
    p_loss = 0

    for i in range(max_goals + 1):
        for j in range(max_goals + 1):
            prob = p_home[i] * p_away[j]
            if i > j:
                p_win += prob
            elif i == j:
                p_draw += prob
            else:
                p_loss += prob

    return p_win, p_draw, p_loss


# ----------------------------
# 6. Apply to all fixtures
# ----------------------------
results = []

for _, row in df_future.iterrows():
    home = row["homeTeam"]
    away = row["awayTeam"]

    p_win, p_draw, p_loss = match_probabilities(home, away)

    results.append({
        "utcDate": row["utcDate"],
        "homeTeam": home,
        "awayTeam": away,
        "p_home_win": p_win,
        "p_draw": p_draw,
        "p_away_win": p_loss,
        "odds_home_win": 1 / p_win,
        "odds_draw": 1 / p_draw,
        "odds_away_win": 1 / p_loss
    })

df_odds = pd.DataFrame(results)
df_odds.head()


Unnamed: 0,utcDate,homeTeam,awayTeam,p_home_win,p_draw,p_away_win,odds_home_win,odds_draw,odds_away_win
0,2026-01-18T16:30:00Z,Aston Villa FC,Everton FC,0.500195,0.258623,0.240125,1.999218,3.866633,4.164501
1,2026-01-19T20:00:00Z,Brighton & Hove Albion FC,AFC Bournemouth,0.490516,0.209368,0.292787,2.03867,4.776286,3.415456
2,2026-01-24T12:30:00Z,West Ham United FC,Sunderland AFC,0.301143,0.279975,0.418414,3.320677,3.571754,2.38998
3,2026-01-24T15:00:00Z,Burnley FC,Tottenham Hotspur FC,0.265022,0.215763,0.513925,3.77327,4.634707,1.94581
4,2026-01-24T15:00:00Z,Fulham FC,Brighton & Hove Albion FC,0.406215,0.227506,0.362712,2.461754,4.395482,2.757008


## Compare calculated probabilitie to bookmaker ones

In [17]:
unique_bet_home = betting_odds_avg["home_team"].unique()
unique_model_home = df_odds["homeTeam"].unique()

In [18]:
print(unique_bet_home)
print(unique_model_home)

['Newcastle United' 'Burnley' 'Brighton and Hove Albion'
 'Tottenham Hotspur' 'Crystal Palace' 'Sunderland' 'Arsenal' 'Bournemouth'
 'Aston Villa' 'Brentford' 'Liverpool' 'West Ham United' 'Chelsea'
 'Manchester City' 'Wolverhampton Wanderers' 'Nottingham Forest' 'Fulham'
 'Manchester United' 'Leeds United' 'Everton']
['Aston Villa FC' 'Brighton & Hove Albion FC' 'West Ham United FC'
 'Burnley FC' 'Fulham FC' 'Manchester City FC' 'AFC Bournemouth'
 'Crystal Palace FC' 'Brentford FC' 'Newcastle United FC' 'Arsenal FC'
 'Everton FC' 'Leeds United FC' 'Wolverhampton Wanderers FC' 'Chelsea FC'
 'Liverpool FC' 'Manchester United FC' 'Nottingham Forest FC'
 'Tottenham Hotspur FC' 'Sunderland AFC']


In [19]:
def normalize_team(name):
    name = name.lower()
    name = name.replace(" fc", "")
    name = name.replace(" afc", "")
    name = name.replace("&", "and")
    name = name.replace("afc ", "")   # <--- this removes AFC from start
    name = name.strip()
    return name


In [20]:
df_odds["home_norm"] = df_odds["homeTeam"].apply(normalize_team)
df_odds["away_norm"] = df_odds["awayTeam"].apply(normalize_team)

betting_odds_avg["home_norm"] = betting_odds_avg["home_team"].apply(normalize_team)
betting_odds_avg["away_norm"] = betting_odds_avg["away_team"].apply(normalize_team)


In [21]:
unique_model_norm = df_odds["home_norm"].unique()
unique_bet_norm = betting_odds_avg["home_norm"].unique()

print(unique_model_norm)
print(unique_bet_norm)

set(unique_model_norm) == set(unique_bet_norm)

['aston villa' 'brighton and hove albion' 'west ham united' 'burnley'
 'fulham' 'manchester city' 'bournemouth' 'crystal palace' 'brentford'
 'newcastle united' 'arsenal' 'everton' 'leeds united'
 'wolverhampton wanderers' 'chelsea' 'liverpool' 'manchester united'
 'nottingham forest' 'tottenham hotspur' 'sunderland']
['newcastle united' 'burnley' 'brighton and hove albion'
 'tottenham hotspur' 'crystal palace' 'sunderland' 'arsenal' 'bournemouth'
 'aston villa' 'brentford' 'liverpool' 'west ham united' 'chelsea'
 'manchester city' 'wolverhampton wanderers' 'nottingham forest' 'fulham'
 'manchester united' 'leeds united' 'everton']


True

In [22]:
df_compare = df_odds.merge(
    betting_odds_avg,
    left_on=["home_norm", "away_norm"],
    right_on=["home_norm", "away_norm"],
    how="inner"
)

print("Matched rows:", len(df_compare))
df_compare.head()


Matched rows: 22


Unnamed: 0,utcDate,homeTeam,awayTeam,p_home_win,p_draw,p_away_win,odds_home_win,odds_draw,odds_away_win,home_norm,away_norm,home_team,away_team,p_home_book,p_draw_book,p_away_book,home_odds,draw_odds,away_odds
0,2026-01-18T16:30:00Z,Aston Villa FC,Everton FC,0.500195,0.258623,0.240125,1.999218,3.866633,4.164501,aston villa,everton,Aston Villa,Everton,0.596194,0.239652,0.164154,1.587778,3.95,5.766667
1,2026-01-19T20:00:00Z,Brighton & Hove Albion FC,AFC Bournemouth,0.490516,0.209368,0.292787,2.03867,4.776286,3.415456,brighton and hove albion,bournemouth,Brighton and Hove Albion,Bournemouth,0.509686,0.241723,0.248591,1.852105,3.905263,3.797368
2,2026-01-24T12:30:00Z,West Ham United FC,Sunderland AFC,0.301143,0.279975,0.418414,3.320677,3.571754,2.38998,west ham united,sunderland,West Ham United,Sunderland,0.393449,0.287222,0.319329,2.383889,3.265556,2.937222
3,2026-01-24T15:00:00Z,Burnley FC,Tottenham Hotspur FC,0.265022,0.215763,0.513925,3.77327,4.634707,1.94581,burnley,tottenham hotspur,Burnley,Tottenham Hotspur,0.247464,0.274706,0.47783,3.794737,3.418421,1.965263
4,2026-01-24T15:00:00Z,Fulham FC,Brighton & Hove Albion FC,0.406215,0.227506,0.362712,2.461754,4.395482,2.757008,fulham,brighton and hove albion,Fulham,Brighton and Hove Albion,0.372469,0.280321,0.34721,2.510526,3.335789,2.693158


In [23]:
df_compare["diff_home"] = df_compare["p_home_win"] - df_compare["p_home_book"]
df_compare["diff_draw"] = df_compare["p_draw"] - df_compare["p_draw_book"]
df_compare["diff_away"] = df_compare["p_away_win"] - df_compare["p_away_book"]

df_compare[["homeTeam", "awayTeam", "diff_home", "diff_draw", "diff_away"]].head()

Unnamed: 0,homeTeam,awayTeam,diff_home,diff_draw,diff_away
0,Aston Villa FC,Everton FC,-0.095999,0.018971,0.07597
1,Brighton & Hove Albion FC,AFC Bournemouth,-0.01917,-0.032355,0.044196
2,West Ham United FC,Sunderland AFC,-0.092306,-0.007247,0.099085
3,Burnley FC,Tottenham Hotspur FC,0.017558,-0.058943,0.036095
4,Fulham FC,Brighton & Hove Albion FC,0.033746,-0.052815,0.015502


In [24]:
import numpy as np

rmse_home = np.sqrt(np.mean((df_compare["p_home_win"] - df_compare["p_home_book"])**2))
rmse_draw = np.sqrt(np.mean((df_compare["p_draw"] - df_compare["p_draw_book"])**2))
rmse_away = np.sqrt(np.mean((df_compare["p_away_win"] - df_compare["p_away_book"])**2))

rmse_home, rmse_draw, rmse_away


(0.05450330982624227, 0.040819901302238594, 0.04698286372342343)

In [25]:
rmse_total = np.sqrt(np.mean((
    df_compare["p_home_win"] - df_compare["p_home_book"]
)**2 + (
    df_compare["p_draw"] - df_compare["p_draw_book"]
)**2 + (
    df_compare["p_away_win"] - df_compare["p_away_book"]
)**2 ))

rmse_total


0.08273007076023586

In [26]:
df_compare["abs_diff"] = (
    abs(df_compare["diff_home"]) +
    abs(df_compare["diff_draw"]) +
    abs(df_compare["diff_away"])
)

df_compare.sort_values("abs_diff", ascending=False).head(10)[
    ["homeTeam", "awayTeam", "diff_home", "diff_draw", "diff_away"]
]


Unnamed: 0,homeTeam,awayTeam,diff_home,diff_draw,diff_away
10,Arsenal FC,Manchester United FC,0.097549,-0.05562,-0.049486
2,West Ham United FC,Sunderland AFC,-0.092306,-0.007247,0.099085
11,Everton FC,Leeds United FC,0.097855,-0.049009,-0.050261
18,Manchester United FC,Fulham FC,-0.071719,-0.027841,0.097013
0,Aston Villa FC,Everton FC,-0.095999,0.018971,0.07597
8,Brentford FC,Nottingham Forest FC,0.083177,-0.061951,-0.025365
13,Leeds United FC,Arsenal FC,-0.01845,-0.059743,0.072176
21,Sunderland AFC,Burnley FC,0.06618,-0.044892,-0.022987
15,Chelsea FC,West Ham United FC,0.059234,-0.041944,-0.030081
3,Burnley FC,Tottenham Hotspur FC,0.017558,-0.058943,0.036095


## Replace my estimates probabilities with the ones I have from odds

In [71]:
df_odds.head(2)

Unnamed: 0,utcDate,homeTeam,awayTeam,p_home_win,p_draw,p_away_win,odds_home_win,odds_draw,odds_away_win,home_norm,away_norm
0,2026-01-18T16:30:00Z,Aston Villa FC,Everton FC,0.500195,0.258623,0.240125,1.999218,3.866633,4.164501,aston villa,everton
1,2026-01-19T20:00:00Z,Brighton & Hove Albion FC,AFC Bournemouth,0.490516,0.209368,0.292787,2.03867,4.776286,3.415456,brighton and hove albion,bournemouth


In [72]:
betting_odds_avg.head(2)

Unnamed: 0,home_team,away_team,p_home_book,p_draw_book,p_away_book,home_odds,draw_odds,away_odds,home_norm,away_norm
0,Newcastle United,Aston Villa,0.465882,0.260257,0.273861,2.017647,3.611765,3.432353,newcastle united,aston villa
1,Burnley,Tottenham Hotspur,0.247464,0.274706,0.47783,3.794737,3.418421,1.965263,burnley,tottenham hotspur


In [89]:
df_final_probabilities = df_odds.merge(
    betting_odds_avg,
    left_on=["home_norm", "away_norm"],
    right_on=["home_norm", "away_norm"],
    how="left"
)

In [90]:
df_final_probabilities = df_final_probabilities[[
    "utcDate",
    "homeTeam",
    "awayTeam",
    "p_home_win",
    "p_draw",
    "p_away_win",
    "p_home_book",
    "p_draw_book",
    "p_away_book",
]]

df_final_probabilities

Unnamed: 0,utcDate,homeTeam,awayTeam,p_home_win,p_draw,p_away_win,p_home_book,p_draw_book,p_away_book
0,2026-01-18T16:30:00Z,Aston Villa FC,Everton FC,0.500195,0.258623,0.240125,0.596194,0.239652,0.164154
1,2026-01-19T20:00:00Z,Brighton & Hove Albion FC,AFC Bournemouth,0.490516,0.209368,0.292787,0.509686,0.241723,0.248591
2,2026-01-24T12:30:00Z,West Ham United FC,Sunderland AFC,0.301143,0.279975,0.418414,0.393449,0.287222,0.319329
3,2026-01-24T15:00:00Z,Burnley FC,Tottenham Hotspur FC,0.265022,0.215763,0.513925,0.247464,0.274706,0.477830
4,2026-01-24T15:00:00Z,Fulham FC,Brighton & Hove Albion FC,0.406215,0.227506,0.362712,0.372469,0.280321,0.347210
...,...,...,...,...,...,...,...,...,...
157,2026-05-24T15:00:00Z,Liverpool FC,Brentford FC,0.565462,0.197218,0.227511,,,
158,2026-05-24T15:00:00Z,Manchester City FC,Aston Villa FC,0.578684,0.210358,0.205576,,,
159,2026-05-24T15:00:00Z,Nottingham Forest FC,AFC Bournemouth,0.412937,0.236402,0.348225,,,
160,2026-05-24T15:00:00Z,Tottenham Hotspur FC,Everton FC,0.428585,0.255595,0.314685,,,


In [91]:
import numpy as np

df_final_probabilities["p_home_final"] = np.where(
    df_final_probabilities["p_home_book"].notna(),
    df_final_probabilities["p_home_book"],
    df_final_probabilities["p_home_win"]
)

df_final_probabilities["p_draw_final"] = np.where(
    df_final_probabilities["p_draw_book"].notna(),
    df_final_probabilities["p_draw_book"],
    df_final_probabilities["p_draw"]
)

df_final_probabilities["p_away_final"] = np.where(
    df_final_probabilities["p_away_book"].notna(),
    df_final_probabilities["p_away_book"],
    df_final_probabilities["p_away_win"]
)

In [92]:
print("Used betting odds:", df_final_probabilities["p_home_book"].notna().sum())
print("Used model:", df_final_probabilities["p_home_book"].isna().sum())


Used betting odds: 22
Used model: 140


In [93]:
df_final_probabilities = df_final_probabilities[[
    "utcDate",
    "homeTeam",
    "awayTeam",
    "p_home_final",
    "p_draw_final",
    "p_away_final"
]]

In [94]:
df_final_probabilities

Unnamed: 0,utcDate,homeTeam,awayTeam,p_home_final,p_draw_final,p_away_final
0,2026-01-18T16:30:00Z,Aston Villa FC,Everton FC,0.596194,0.239652,0.164154
1,2026-01-19T20:00:00Z,Brighton & Hove Albion FC,AFC Bournemouth,0.509686,0.241723,0.248591
2,2026-01-24T12:30:00Z,West Ham United FC,Sunderland AFC,0.393449,0.287222,0.319329
3,2026-01-24T15:00:00Z,Burnley FC,Tottenham Hotspur FC,0.247464,0.274706,0.477830
4,2026-01-24T15:00:00Z,Fulham FC,Brighton & Hove Albion FC,0.372469,0.280321,0.347210
...,...,...,...,...,...,...
157,2026-05-24T15:00:00Z,Liverpool FC,Brentford FC,0.565462,0.197218,0.227511
158,2026-05-24T15:00:00Z,Manchester City FC,Aston Villa FC,0.578684,0.210358,0.205576
159,2026-05-24T15:00:00Z,Nottingham Forest FC,AFC Bournemouth,0.412937,0.236402,0.348225
160,2026-05-24T15:00:00Z,Tottenham Hotspur FC,Everton FC,0.428585,0.255595,0.314685


In [96]:
df_final_probabilities["homeTeam"].unique()

array(['Aston Villa FC', 'Brighton & Hove Albion FC',
       'West Ham United FC', 'Burnley FC', 'Fulham FC',
       'Manchester City FC', 'AFC Bournemouth', 'Crystal Palace FC',
       'Brentford FC', 'Newcastle United FC', 'Arsenal FC', 'Everton FC',
       'Leeds United FC', 'Wolverhampton Wanderers FC', 'Chelsea FC',
       'Liverpool FC', 'Manchester United FC', 'Nottingham Forest FC',
       'Tottenham Hotspur FC', 'Sunderland AFC'], dtype=object)

In [98]:
name_map = {
    "Aston Villa FC": "Aston Villa",
    "Brighton & Hove Albion FC": "Brighton & Hove Albion",
    "AFC Bournemouth": "AFC Bournemouth",   # keep as is
    "Bournemouth": "AFC Bournemouth",
    "Sunderland AFC": "Sunderland",
    "Newcastle United FC": "Newcastle United",
    "Manchester City FC": "Manchester City",
    "Manchester United FC": "Manchester United",
    "West Ham United FC": "West Ham United",
    "Wolverhampton Wanderers FC": "Wolverhampton Wanderers",
    "Tottenham Hotspur FC": "Tottenham Hotspur",
    "Crystal Palace FC": "Crystal Palace",
    "Brentford FC": "Brentford",
    "Everton FC": "Everton",
    "Leeds United FC": "Leeds United",
    "Chelsea FC": "Chelsea",
    "Liverpool FC": "Liverpool",
    "Nottingham Forest FC": "Nottingham Forest",
    "Burnley FC": "Burnley",
    "Fulham FC": "Fulham",
    "Arsenal FC": "Arsenal"
}

df_final_probabilities["home_team_norm"] = df_final_probabilities["homeTeam"].replace(name_map)
df_final_probabilities["away_team_norm"] = df_final_probabilities["awayTeam"].replace(name_map)

premierleague["team_norm"] = premierleague["team"].replace({
    "Brighton & Hove Albion": "Brighton & Hove Albion",
    "AFC Bournemouth": "AFC Bournemouth"
})


In [99]:
set(df_final_probabilities["home_team_norm"].unique()) - set(premierleague["team_norm"].unique())


set()

In [110]:
df_simulation = df_final_probabilities.copy()

In [111]:
# Drop rows with NaN probs
df_simulation = df_final_probabilities.dropna(
    subset=["home_team_norm", "away_team_norm", "p_home_final", "p_draw_final", "p_away_final"]
).copy()

# Normalize probabilities so they sum to 1
prob_cols = ["p_home_final", "p_draw_final", "p_away_final"]
df_simulation[prob_cols] = df_simulation[prob_cols].div(df_simulation[prob_cols].sum(axis=1), axis=0)


In [145]:
import numpy as np
import pandas as pd

def simulate_once(fixtures, table):
    table_sim = table.copy()

    # Use normalized team name column
    points = dict(zip(table_sim["team_norm"], table_sim["pts"]))

    for _, row in fixtures.iterrows():
        home = row["home_team_norm"]
        away = row["away_team_norm"]

        # choose outcome
        probs = [row["p_home_final"], row["p_draw_final"], row["p_away_final"]]
        outcome = np.random.choice(["H", "D", "A"], p=probs)

        if outcome == "H":
            points[home] += 3
        elif outcome == "D":
            points[home] += 1
            points[away] += 1
        else:
            points[away] += 3

    result_df = table_sim.copy()
    result_df["pts"] = result_df["team_norm"].map(points)

    # sort by points and goal difference
    result_df = result_df.sort_values(["pts", "gd"], ascending=[False, False])
    result_df["position"] = np.arange(1, len(result_df)+1)

    return result_df


def run_simulations(fixtures, table, n_sim=10000):
    position_counts = {team: np.zeros(len(table)) for team in table["team_norm"]}

    for _ in range(n_sim):
        final_table = simulate_once(fixtures, table)

        for _, row in final_table.iterrows():
            position_counts[row["team_norm"]][row["position"]-1] += 1

    pos_df = pd.DataFrame(position_counts, index=np.arange(1, len(table)+1))
    pos_df.index.name = "position"
    return pos_df



# RUN
position_distribution = run_simulations(df_simulation, premierleague, n_sim=20000)


In [149]:
position_distribution_t = position_distribution.T

In [150]:
position_distribution_pct = position_distribution_t.div(
    position_distribution_t.sum(axis=1),
    axis=0
) * 100


In [148]:
vmax = 40

position_distribution_pct.style \
    .background_gradient(
        cmap=green_cmap,
        vmin=0,
        vmax=vmax
    ) \
    .applymap(lambda x: "background-color: #ffdddd" if x == 0 else "") \
    .format("{:.2f}")


position,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Arsenal,88.38,10.22,1.27,0.12,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Manchester City,8.97,58.3,24.64,6.13,1.44,0.36,0.1,0.03,0.01,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aston Villa,2.4,24.95,44.52,17.72,6.53,2.43,0.89,0.29,0.18,0.06,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Liverpool,0.2,4.68,17.04,33.7,19.6,10.88,6.26,3.4,2.03,1.14,0.62,0.29,0.09,0.06,0.01,0.0,0.0,0.0,0.0,0.0
Manchester United,0.0,0.09,0.96,4.53,9.04,13.29,13.94,13.05,12.16,9.96,7.78,5.83,3.96,2.81,1.65,0.77,0.19,0.0,0.0,0.0
Chelsea,0.05,1.27,7.52,19.66,24.98,17.01,11.24,7.23,4.61,2.77,1.86,0.98,0.43,0.23,0.1,0.01,0.02,0.0,0.0,0.0
Brentford,0.0,0.18,1.37,5.68,11.51,14.99,15.47,13.35,11.49,8.48,6.38,4.5,3.12,1.93,0.95,0.46,0.12,0.01,0.0,0.0
Newcastle United,0.0,0.21,1.92,7.56,13.43,16.43,15.29,13.28,10.2,7.52,5.58,3.71,2.41,1.42,0.73,0.3,0.04,0.0,0.0,0.0
Sunderland,0.0,0.03,0.3,1.65,4.3,6.96,10.14,12.13,12.89,12.9,11.53,9.34,7.34,5.13,3.08,1.71,0.52,0.01,0.0,0.0
Fulham,0.0,0.0,0.08,0.65,1.85,3.79,5.82,8.29,10.24,12.33,12.65,12.37,10.92,9.0,6.6,3.92,1.41,0.09,0.0,0.0
