In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

# Load dataset.
# The full-time score columns hold AFL "goals.behinds" values (e.g. 12.10 = 12 goals,
# 10 behinds). Read them as text: parsed as floats, 12.10 collapses to 12.1 and would
# later be scored as 12 goals 1 behind.
# NOTE(review): this only helps if the CSV itself still carries the trailing zero - confirm.
df = pd.read_csv(
    "/content/cleaned_games_2018_2022 (1).csv",
    dtype={'hometeamscoreft': str, 'awayteamscoreft': str},
)

# Function to compute final AFL score
def compute_final_score(score):
    """Convert an AFL "goals.behinds" score into total points.

    A goal is worth 6 points and a behind 1, so "17.19" -> 17 * 6 + 19 = 121.

    Args:
        score: The score as text ("12.10") or a number (15.5). A value with no
            behinds part ("10", 10) is treated as zero behinds.

    Returns:
        int: Total points.

    Raises:
        ValueError: If the value cannot be read as "goals.behinds" (e.g. NaN).

    Note:
        A float cannot tell 12.1 from 12.10 - both become "12.1". Pass the
        score as text whenever the trailing zero matters.
    """
    text = str(score).strip()
    goals_text, _, behinds_text = text.partition('.')
    try:
        goals = int(goals_text)
        # No behinds part ("10" or "10.") means no behinds were recorded.
        behinds = int(behinds_text) if behinds_text else 0
    except ValueError:
        raise ValueError(
            f"Cannot parse AFL score {score!r}; expected 'goals.behinds'"
        ) from None
    return (goals * 6) + behinds

# Total points for each side, parsed from the "goals.behinds" full-time columns
df['home_score'] = df['hometeamscoreft'].apply(compute_final_score)
df['away_score'] = df['awayteamscoreft'].apply(compute_final_score)

# Encode categorical variables as integer codes (this overwrites the text columns)
label_encoders = {}
categorical_columns = ['hometeam', 'awayteam', 'venue', 'round']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoders for later use

# Select features and target (both scores are fitted as one two-column target)
features = ['year', 'round', 'hometeam', 'awayteam', 'venue', 'maxtemp', 'mintemp']
target = ['home_score', 'away_score']

X = df[features]
y = df[target]

# Split data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost Model
model = XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Convert predictions to integers (rounding)
y_pred_int = np.round(y_pred)

# Calculate evaluation metrics.
# All three use the rounded predictions so that they describe the same numbers
# and can be compared directly.
mae = mean_absolute_error(y_test, y_pred_int)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_int))
r2 = r2_score(y_test, y_pred_int)

# print(f"Mean Absolute Error (MAE): {mae:.2f}")
# print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
# print(f"R² Score: {r2:.4f}")

# Prediction function
def predict_match(home_team, away_team, match_round, venue, year=2024, maxtemp=20, mintemp=10):
    """Predict, print and return the final scores for a single match.

    Uses the module-level `label_encoders` and `model`.

    Args:
        home_team, away_team, venue, match_round: Labels exactly as they appear
            in the training data; they are converted with the saved encoders.
        year, maxtemp, mintemp: Numeric match metadata.

    Returns:
        tuple[int, int]: (home_score, away_score), rounded to whole points.

    Raises:
        ValueError: Raised by LabelEncoder.transform when a label was not
            present in the data the encoders were fitted on.
    """
    # Encode categorical values using saved LabelEncoders
    home_team_encoded = label_encoders['hometeam'].transform([home_team])[0]
    away_team_encoded = label_encoders['awayteam'].transform([away_team])[0]
    venue_encoded = label_encoders['venue'].transform([venue])[0]
    round_encoded = label_encoders['round'].transform([match_round])[0]

    # Create input array; the column order must match the training feature list
    match_features = np.array([[year, round_encoded, home_team_encoded, away_team_encoded, venue_encoded, maxtemp, mintemp]])

    # Predict scores: one row, two outputs (home, away)
    predicted_scores = model.predict(match_features)

    # Round to nearest integer
    home_score = int(round(predicted_scores[0][0]))
    away_score = int(round(predicted_scores[0][1]))

    print(f"Predicted Score for {home_team} vs {away_team}:")
    print(f"{home_team}: {home_score}")
    print(f"{away_team}: {away_score}")

    # Hand the numbers back as well so callers can use them programmatically
    return home_score, away_score




In [None]:
# Example prediction: Richmond (home) vs Carlton (away), round "R1" at the M.C.G.
predict_match(
    home_team="Richmond",
    away_team="Carlton",
    match_round="R1",
    venue="M.C.G.",
    year=2024,
    maxtemp=28.7,
    mintemp=14,
)

Predicted Score for Richmond vs Carlton:
Richmond: 116
Carlton: 86


CatBoost
✅ Stage 1: Player Stats Prediction
What it does:
Before predicting scores, your model first estimates team-level player statistics for the match (such as total disposals, goals, tackles, etc. for both home and away teams).

How it works:

For each player stat (like Disposals, Goals, etc.), you trained a separate CatBoost regression model.

These models take match metadata as input:

Home team

Away team

Round

Venue

Year

Temperature (max and min)

Purpose:
This mimics the real-world situation where you won’t know player performance before the game — so you predict it first.

✅ Stage 2: Final Score Prediction
What it does:
Using the predicted team-level player stats + pre-match metadata, your second stage predicts the final home and away scores.

How it works:

You trained two CatBoost models:

One for predicting the home team score.

One for predicting the away team score.

Inputs:

Match metadata (teams, round, venue, year, temperatures)

Predicted player aggregate stats from Stage 1

Outputs:

Predicted home team score

Predicted away team score

In [None]:
!pip install --upgrade --force-reinstall numpy==1.23.5 catboost


Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting matplotlib (from catboost)
  Downloading matplotlib-3.10.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pandas>=0.24 (from catboost)
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy (from catboost)
  Downloading scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00

In [None]:
import pandas as pd

# Load player data
player_df = pd.read_csv("/content/stats.csv")

# Per-player counting stats that are summed into one row per (game, team)
stat_columns = [
    'Disposals', 'Kicks', 'Marks', 'Handballs', 'Goals', 'Behinds',
    'HitOuts', 'Tackles', 'Inside50s', 'Clearances', 'Rebounds',
    # Add more as needed!
]

# Aggregate player statistics by GameId and Team
agg_player_stats = (
    player_df
    .groupby(['GameId', 'Team'])
    .agg({column: 'sum' for column in stat_columns})
    .reset_index()
)

# Map team names to their full names
team_name_mapping = {
    "Richmond": "Richmond Tigers",
    "Essendon": "Essendon Bombers",
    "St Kilda": "St Kilda Saints",
    "Port Adelaide": "Port Adelaide Power",
    "Gold Coast": "Gold Coast Suns",
    "Hawthorn": "Hawthorn Hawks",
    "Greater Western Sydney": "GWS Giants",
    "Melbourne": "Melbourne Demons",
    "West Coast": "West Coast Eagles",
    "Adelaide": "Adelaide Crows",
    "North Melbourne": "North Melbourne Kangaroos",
    "Carlton": "Carlton Blues",
    "Collingwood": "Collingwood Magpies",
    "Brisbane Lions": "Brisbane Lions",
    "Fremantle": "Fremantle Dockers",
    "Western Bulldogs": "Western Bulldogs",
    "Sydney": "Sydney Swans",
    "Geelong": "Geelong Cats"
}

# Update team names in the DataFrame; names missing from the mapping are kept as they are
full_team_names = agg_player_stats['Team'].map(team_name_mapping)
agg_player_stats['Team'] = full_team_names.fillna(agg_player_stats['Team'])

print(agg_player_stats.head())

     GameId                       Team  Disposals  Kicks  Marks  Handballs  \
0  2012EF01          Fremantle Dockers        325    211     94        114   
1  2012EF01               Geelong Cats        321    182     52        139   
2  2012EF02  North Melbourne Kangaroos        289    170     57        119   
3  2012EF02          West Coast Eagles        358    231    112        127   
4  2012GF01             Hawthorn Hawks        336    194     56        142   

   Goals  Behinds  HitOuts  Tackles  Inside50s  Clearances  Rebounds  
0     14        8       44       77         47          34        35  
1     11       11       31       86         54          38        25  
2      9       10       29       36         43          42        41  
3     24       13       62       38         71          43        28  
4     11       13       60       84         61          58        26  


In [None]:
# Load the match-level data.
# The full-time score columns hold AFL "goals.behinds" values (e.g. 12.10 = 12 goals,
# 10 behinds). Read them as text: parsed as floats, 12.10 collapses to 12.1 and would
# later be scored as 12 goals 1 behind.
# NOTE(review): this only helps if the CSV itself still carries the trailing zero - confirm.
df_matches = pd.read_csv(
    "/content/cleaned_games_2018_2022 (1).csv",
    dtype={'hometeamscoreft': str, 'awayteamscoreft': str},
)

# The joins below require `gameid` in the match file to use the same identifiers
# as `GameId` in the aggregated player stats.

# Merge home team stats.
# The home stat columns keep their plain names ('Disposals', ...) - presumably because
# the match file has no columns of the same name, so the '_home' suffix is never applied.
df_matches = df_matches.merge(
    agg_player_stats,
    left_on=['gameid', 'hometeam'],
    right_on=['GameId', 'Team'],
    how='left',
    suffixes=('', '_home')
)

# Merge away team stats.
# Join on the match file's own `gameid`, not on the `GameId` column produced by the
# home merge: that column is empty whenever the home join found no stats, which
# would also discard the away team's stats. The stat columns of this second join
# get the '_away' suffix (and the join adds a 'GameId_away' key column).
df_matches = df_matches.merge(
    agg_player_stats,
    left_on=['gameid', 'awayteam'],
    right_on=['GameId', 'Team'],
    how='left',
    suffixes=('', '_away')
)

print(df_matches.head())


      gameid  year round        date             hometeam  \
0  2018R0101  2018    R1  2018-03-22      Richmond Tigers   
1  2018R0102  2018    R1  2018-03-23     Essendon Bombers   
2  2018R0103  2018    R1  2018-03-24      St Kilda Saints   
3  2018R0104  2018    R1  2018-03-24  Port Adelaide Power   
4  2018R0105  2018    R1  2018-03-24      Gold Coast Suns   

                    awayteam  hometeamscoreft  awayteamscoreft  \
0              Carlton Blues            17.19            15.50   
1             Adelaide Crows            14.15            12.15   
2             Brisbane Lions            16.11            12.10   
3          Fremantle Dockers            16.14             9.60   
4  North Melbourne Kangaroos             7.13             5.90   

              venue  attendance  ...  Kicks_away  Marks_away  Handballs_away  \
0            M.C.G.       90151  ...         207          88             169   
1         Docklands       43016  ...         204          83             183

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder


print(df_matches.head())
categorical_columns = ['hometeam', 'awayteam', 'venue', 'round']

# ✅ Step 1: Encode categorical variables (before training!)
# The integer codes overwrite the original columns. Re-running this cell on the
# already-encoded frame would fit the encoders on "0", "1", ... instead of the real
# names, and name lookups at prediction time would then fail. The original text is
# therefore kept in `<col>_raw` and the encoding always starts from that copy.
label_encoders = {}
for col in categorical_columns:
    raw_col = f"{col}_raw"
    if raw_col not in df_matches.columns:
        # First run on this frame: remember the labels as strings
        df_matches[raw_col] = df_matches[col].astype(str)
    le = LabelEncoder()
    df_matches[col] = le.fit_transform(df_matches[raw_col])
    label_encoders[col] = le

# ✅ Step 2: Compute final scores
def compute_final_score(score):
    """Convert an AFL "goals.behinds" score into total points.

    A goal is worth 6 points and a behind 1, so "17.19" -> 17 * 6 + 19 = 121.

    Args:
        score: The score as text ("12.10") or a number (15.5). A value with no
            behinds part ("10", 10) is treated as zero behinds.

    Returns:
        int: Total points.

    Raises:
        ValueError: If the value cannot be read as "goals.behinds" (e.g. NaN).

    Note:
        A float cannot tell 12.1 from 12.10 - both become "12.1". Pass the
        score as text whenever the trailing zero matters.
    """
    text = str(score).strip()
    goals_text, _, behinds_text = text.partition('.')
    try:
        goals = int(goals_text)
        # No behinds part ("10" or "10.") means no behinds were recorded.
        behinds = int(behinds_text) if behinds_text else 0
    except ValueError:
        raise ValueError(
            f"Cannot parse AFL score {score!r}; expected 'goals.behinds'"
        ) from None
    return (goals * 6) + behinds

# Total points per side, derived from the "goals.behinds" full-time columns
df_matches['home_score'] = df_matches['hometeamscoreft'].map(compute_final_score)
df_matches['away_score'] = df_matches['awayteamscoreft'].map(compute_final_score)

# ✅ Step 3: Define features
metadata_features = ['year', 'round', 'hometeam', 'awayteam', 'venue', 'maxtemp', 'mintemp']

# Home-side totals carry the plain stat name, away-side totals the '_away' suffix
_stat_names = ['Disposals', 'Kicks', 'Marks', 'Handballs', 'Goals', 'Behinds', 'HitOuts',
               'Tackles', 'Inside50s', 'Clearances', 'Rebounds']
player_stats = _stat_names + [f'{stat_name}_away' for stat_name in _stat_names]

X_metadata = df_matches.loc[:, metadata_features]

# ✅ Step 4: Train models for player stats
# One regressor per stat, each fitted on the match metadata alone.
# NOTE(review): these are fitted on every row of df_matches, including rows that a
# later train/test split holds out - keep that in mind when reading test metrics.
stat_cat_features = [metadata_features.index(col) for col in categorical_columns]
stat_models = {}
for stat in player_stats:
    y_stat = df_matches[stat]
    model = CatBoostRegressor(iterations=300, learning_rate=0.1, depth=6, random_seed=42, verbose=0)
    model.fit(X_metadata, y_stat, cat_features=stat_cat_features)
    stat_models[stat] = model

print("✅ Player stat models trained.")

# ✅ Step 5: Predict team stats for a new match
def predict_team_stats(home_team_name, away_team_name, round_name, venue_name, year=2024, maxtemp=20, mintemp=10):
    """Estimate every team-level stat for one fixture from its metadata.

    The four text inputs are converted with the module-level `label_encoders`;
    each model in `stat_models` then predicts its stat for the single-row input.

    Returns:
        dict: stat name -> predicted value, in the iteration order of `stat_models`.
    """
    labels = (
        ('hometeam', home_team_name),
        ('awayteam', away_team_name),
        ('round', round_name),
        ('venue', venue_name),
    )
    codes = {column: label_encoders[column].transform([label])[0] for column, label in labels}

    # Single-row frame with the metadata columns in training order
    input_metadata = pd.DataFrame({
        'year': [year],
        'round': [codes['round']],
        'hometeam': [codes['hometeam']],
        'awayteam': [codes['awayteam']],
        'venue': [codes['venue']],
        'maxtemp': [maxtemp],
        'mintemp': [mintemp],
    })

    return {
        stat: fitted_model.predict(input_metadata)[0]
        for stat, fitted_model in stat_models.items()
    }

# ✅ Step 6: Train final score models
X_full = pd.concat([X_metadata, df_matches[player_stats]], axis="columns")
y_home, y_away = df_matches['home_score'], df_matches['away_score']

# A single call applies one shuffled 80/20 split to the features and both targets
(X_train, X_test,
 y_train_home, y_test_home,
 y_train_away, y_test_away) = train_test_split(X_full, y_home, y_away, test_size=0.2, random_state=42)

# The metadata columns come first in X_full, so their positions in
# `metadata_features` are also their positions in the training frame.
score_cat_features = [metadata_features.index(col) for col in categorical_columns]
score_model_params = dict(iterations=500, learning_rate=0.1, depth=6, random_seed=42, verbose=0)

model_home = CatBoostRegressor(**score_model_params)
model_away = CatBoostRegressor(**score_model_params)

model_home.fit(X_train, y_train_home, cat_features=score_cat_features)
model_away.fit(X_train, y_train_away, cat_features=score_cat_features)

print("✅ Final score models trained.")

# Persist both score models (home first, then away)
model_home.save_model("/content/model_home.cbm")
model_away.save_model("/content/model_away.cbm")

print("✅ Models saved!")

# ✅ Step 7: Final match prediction function
def predict_match(home_team_name, away_team_name, round_name, venue_name, year=2024, maxtemp=20, mintemp=10):
    """Predict, print and return the final scores for one fixture.

    Stage 1 (`predict_team_stats`) estimates the team stat totals; those estimates
    plus the encoded metadata form the single-row input for `model_home` and
    `model_away`.

    Returns:
        tuple[int, int]: (home_score, away_score), rounded to whole points.
    """
    predicted_stats = predict_team_stats(home_team_name, away_team_name, round_name, venue_name, year, maxtemp, mintemp)

    def _encode(column, label):
        # Look the label up in the encoder fitted for that column
        return label_encoders[column].transform([label])[0]

    home_code = _encode('hometeam', home_team_name)
    away_code = _encode('awayteam', away_team_name)
    round_code = _encode('round', round_name)
    venue_code = _encode('venue', venue_name)

    # Metadata first, then the predicted stats, as one row
    feature_row = {
        'year': [year],
        'round': [round_code],
        'hometeam': [home_code],
        'awayteam': [away_code],
        'venue': [venue_code],
        'maxtemp': [maxtemp],
        'mintemp': [mintemp],
    }
    for stat_name, stat_value in predicted_stats.items():
        feature_row[stat_name] = [stat_value]
    input_full = pd.DataFrame(feature_row)

    home_points = int(round(model_home.predict(input_full)[0]))
    away_points = int(round(model_away.predict(input_full)[0]))

    print(f"Predicted Score:\n{home_team_name}: {home_points}\n{away_team_name}: {away_points}")
    return home_points, away_points




      gameid  year round        date             hometeam  \
0  2018R0101  2018    R1  2018-03-22      Richmond Tigers   
1  2018R0102  2018    R1  2018-03-23     Essendon Bombers   
2  2018R0103  2018    R1  2018-03-24      St Kilda Saints   
3  2018R0104  2018    R1  2018-03-24  Port Adelaide Power   
4  2018R0105  2018    R1  2018-03-24      Gold Coast Suns   

                    awayteam  hometeamscoreft  awayteamscoreft  \
0              Carlton Blues            17.19            15.50   
1             Adelaide Crows            14.15            12.15   
2             Brisbane Lions            16.11            12.10   
3          Fremantle Dockers            16.14             9.60   
4  North Melbourne Kangaroos             7.13             5.90   

              venue  attendance  ...  Kicks_away  Marks_away  Handballs_away  \
0            M.C.G.       90151  ...         207          88             169   
1         Docklands       43016  ...         204          83             183

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder

# Quick look at the first five rows of the merged match table
print(df_matches.head(n=5))




      gameid  year round        date             hometeam  \
0  2018R0101  2018    R1  2018-03-22      Richmond Tigers   
1  2018R0102  2018    R1  2018-03-23     Essendon Bombers   
2  2018R0103  2018    R1  2018-03-24      St Kilda Saints   
3  2018R0104  2018    R1  2018-03-24  Port Adelaide Power   
4  2018R0105  2018    R1  2018-03-24      Gold Coast Suns   

                    awayteam  hometeamscoreft  awayteamscoreft  \
0              Carlton Blues            17.19            15.50   
1             Adelaide Crows            14.15            12.15   
2             Brisbane Lions            16.11            12.10   
3          Fremantle Dockers            16.14             9.60   
4  North Melbourne Kangaroos             7.13             5.90   

              venue  attendance  ...  Kicks_away  Marks_away  Handballs_away  \
0            M.C.G.       90151  ...         207          88             169   
1         Docklands       43016  ...         204          83             183

In [None]:

categorical_columns = ['hometeam', 'awayteam', 'venue', 'round']

# ✅ Step 1: Encode categorical variables (before training!)
# The integer codes overwrite the original columns. Re-running this cell on the
# already-encoded frame would fit the encoders on "0", "1", ... instead of the real
# names, and name lookups at prediction time would then fail. The original text is
# therefore kept in `<col>_raw` and the encoding always starts from that copy.
label_encoders = {}
for col in categorical_columns:
    raw_col = f"{col}_raw"
    if raw_col not in df_matches.columns:
        # First run on this frame: remember the labels as strings
        df_matches[raw_col] = df_matches[col].astype(str)
    le = LabelEncoder()
    df_matches[col] = le.fit_transform(df_matches[raw_col])
    label_encoders[col] = le

# ✅ Step 2: Compute final scores
def compute_final_score(score):
    """Convert an AFL "goals.behinds" score into total points.

    A goal is worth 6 points and a behind 1, so "17.19" -> 17 * 6 + 19 = 121.

    Args:
        score: The score as text ("12.10") or a number (15.5). A value with no
            behinds part ("10", 10) is treated as zero behinds.

    Returns:
        int: Total points.

    Raises:
        ValueError: If the value cannot be read as "goals.behinds" (e.g. NaN).

    Note:
        A float cannot tell 12.1 from 12.10 - both become "12.1". Pass the
        score as text whenever the trailing zero matters.
    """
    text = str(score).strip()
    goals_text, _, behinds_text = text.partition('.')
    try:
        goals = int(goals_text)
        # No behinds part ("10" or "10.") means no behinds were recorded.
        behinds = int(behinds_text) if behinds_text else 0
    except ValueError:
        raise ValueError(
            f"Cannot parse AFL score {score!r}; expected 'goals.behinds'"
        ) from None
    return (goals * 6) + behinds

# Total points per side, derived from the "goals.behinds" full-time columns
df_matches['home_score'] = df_matches['hometeamscoreft'].map(compute_final_score)
df_matches['away_score'] = df_matches['awayteamscoreft'].map(compute_final_score)

# ✅ Step 3: Define features
metadata_features = ['year', 'round', 'hometeam', 'awayteam', 'venue', 'maxtemp', 'mintemp']

# Home-side totals carry the plain stat name, away-side totals the '_away' suffix
_stat_names = ['Disposals', 'Kicks', 'Marks', 'Handballs', 'Goals', 'Behinds', 'HitOuts',
               'Tackles', 'Inside50s', 'Clearances', 'Rebounds']
player_stats = _stat_names + [f'{stat_name}_away' for stat_name in _stat_names]

X_metadata = df_matches.loc[:, metadata_features]

# ✅ Step 4: Train models for player stats
# One regressor per stat, each fitted on the match metadata alone.
# NOTE(review): these are fitted on every row of df_matches, including rows that a
# later train/test split holds out - keep that in mind when reading test metrics.
stat_cat_features = [metadata_features.index(col) for col in categorical_columns]
stat_models = {}
for stat in player_stats:
    y_stat = df_matches[stat]
    model = CatBoostRegressor(iterations=300, learning_rate=0.1, depth=6, random_seed=42, verbose=0)
    model.fit(X_metadata, y_stat, cat_features=stat_cat_features)
    stat_models[stat] = model

print("✅ Player stat models trained.")

# ✅ Step 5: Predict team stats for a new match
def predict_team_stats(home_team_name, away_team_name, round_name, venue_name, year=2024, maxtemp=20, mintemp=10):
    """Estimate every team-level stat for one fixture from its metadata.

    The four text inputs are converted with the module-level `label_encoders`;
    each model in `stat_models` then predicts its stat for the single-row input.

    Returns:
        dict: stat name -> predicted value, in the iteration order of `stat_models`.
    """
    labels = (
        ('hometeam', home_team_name),
        ('awayteam', away_team_name),
        ('round', round_name),
        ('venue', venue_name),
    )
    codes = {column: label_encoders[column].transform([label])[0] for column, label in labels}

    # Single-row frame with the metadata columns in training order
    input_metadata = pd.DataFrame({
        'year': [year],
        'round': [codes['round']],
        'hometeam': [codes['hometeam']],
        'awayteam': [codes['awayteam']],
        'venue': [codes['venue']],
        'maxtemp': [maxtemp],
        'mintemp': [mintemp],
    })

    return {
        stat: fitted_model.predict(input_metadata)[0]
        for stat, fitted_model in stat_models.items()
    }

# ✅ Step 6: Train final score models
X_full = pd.concat([X_metadata, df_matches[player_stats]], axis="columns")
y_home, y_away = df_matches['home_score'], df_matches['away_score']

# A single call applies one shuffled 80/20 split to the features and both targets
(X_train, X_test,
 y_train_home, y_test_home,
 y_train_away, y_test_away) = train_test_split(X_full, y_home, y_away, test_size=0.2, random_state=42)

# The metadata columns come first in X_full, so their positions in
# `metadata_features` are also their positions in the training frame.
score_cat_features = [metadata_features.index(col) for col in categorical_columns]

# Shared settings for both score models
score_model_params = dict(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSEWithUncertainty',  # Required for uncertainty estimates
    random_seed=42,
    verbose=0,
)

model_home = CatBoostRegressor(**score_model_params)
model_away = CatBoostRegressor(**score_model_params)

model_home.fit(X_train, y_train_home, cat_features=score_cat_features)
model_away.fit(X_train, y_train_away, cat_features=score_cat_features)

print("✅ Final score models trained.")

# Persist both score models (home first, then away)
model_home.save_model("/content/model_home.cbm")
model_away.save_model("/content/model_away.cbm")

print("✅ Models saved!")

# ✅ Step 7: Final match prediction function
def predict_match(home_team_name, away_team_name, round_name, venue_name, year=2024, maxtemp=20, mintemp=10):
    """Predict both final scores and a heuristic confidence for one fixture.

    Stage 1 (`predict_team_stats`) estimates the team stat totals; those estimates
    plus the encoded metadata form the single-row input for `model_home` and
    `model_away`, which are queried with prediction_type='RMSEWithUncertainty'.

    Returns:
        dict: "home_score" and "away_score" (rounded predictions) and
        "match_confidence" (a percentage clamped to 0-100).
    """
    predicted_stats = predict_team_stats(home_team_name, away_team_name, round_name, venue_name, year, maxtemp, mintemp)

    def _encode(column, label):
        # Look the label up in the encoder fitted for that column
        return label_encoders[column].transform([label])[0]

    home_code = _encode('hometeam', home_team_name)
    away_code = _encode('awayteam', away_team_name)
    round_code = _encode('round', round_name)
    venue_code = _encode('venue', venue_name)

    # Metadata first, then the predicted stats, as one row
    feature_row = {
        'year': [year],
        'round': [round_code],
        'hometeam': [home_code],
        'awayteam': [away_code],
        'venue': [venue_code],
        'maxtemp': [maxtemp],
        'mintemp': [mintemp],
    }
    for stat_name, stat_value in predicted_stats.items():
        feature_row[stat_name] = [stat_value]
    input_full = pd.DataFrame(feature_row)

    # Predict with uncertainty: the single result row is unpacked as (estimate, spread)
    # NOTE(review): the spread is used as a standard deviation below - confirm in the
    # CatBoost docs whether this prediction type returns a std dev or a variance.
    home_score_mean, home_score_std = model_home.predict(input_full, prediction_type='RMSEWithUncertainty')[0]
    away_score_mean, away_score_std = model_away.predict(input_full, prediction_type='RMSEWithUncertainty')[0]

    MAX_STD = 3.0  # adjust based on your highest observed std dev

    def _confidence_pct(spread):
        # Relative confidence: 100% at zero spread, 0% at MAX_STD or more
        return max(0, min(100, 100 - (spread / MAX_STD) * 100))

    home_confidence_pct = _confidence_pct(home_score_std)
    away_confidence_pct = _confidence_pct(away_score_std)

    # Match-level confidence is the higher of the two team values (not their average)
    match_confidence_pct = max(home_confidence_pct, away_confidence_pct)

    return {
        "home_score": round(home_score_mean),
        "away_score": round(away_score_mean),
        "match_confidence": match_confidence_pct
    }


✅ Player stat models trained.
✅ Final score models trained.
✅ Models saved!


In [None]:
# ✅ Test: Richmond Tigers (home) vs Carlton Blues, R1 at the M.C.G., 2022, default temperatures
predict_match(
    home_team_name='Richmond Tigers',
    away_team_name='Carlton Blues',
    round_name='R1',
    venue_name='M.C.G.',
    year=2022,
)

{'home_score': 87, 'away_score': 81, 'match_confidence': 66.26880925048664}

In [None]:
# Richmond Tigers (home) vs Carlton Blues, R1 at the M.C.G., 2021, default temperatures
predict_match(
    home_team_name='Richmond Tigers',
    away_team_name='Carlton Blues',
    round_name='R1',
    venue_name='M.C.G.',
    year=2021,
)

{'home_score': 83, 'away_score': 80, 'match_confidence': 70.26044187779178}

In [None]:
# Richmond Tigers (home) vs Carlton Blues, R1 at the M.C.G., 2018, default temperatures
predict_match(
    home_team_name='Richmond Tigers',
    away_team_name='Carlton Blues',
    round_name='R1',
    venue_name='M.C.G.',
    year=2018,
)

{'home_score': 91, 'away_score': 74, 'match_confidence': 88.68983647438453}

In [None]:
# Gold Coast Suns (home) vs Fremantle Dockers, R8 at Perth Stadium; year and temperatures use the defaults
predict_match(
    home_team_name='Gold Coast Suns',
    away_team_name='Fremantle Dockers',
    round_name='R8',
    venue_name='Perth Stadium',
)

{'home_score': 68, 'away_score': 74, 'match_confidence': 71.75671882411986}

In [None]:
# Gold Coast Suns (home) vs Sydney Swans, R8 at Perth Stadium; year and temperatures use the defaults
predict_match(
    home_team_name='Gold Coast Suns',
    away_team_name='Sydney Swans',
    round_name='R8',
    venue_name='Perth Stadium',
)

{'home_score': 68, 'away_score': 86, 'match_confidence': 68.17367780942956}

In [None]:
# Gold Coast Suns (home) vs Geelong Cats, R8 at Perth Stadium; year and temperatures use the defaults
predict_match(
    home_team_name='Gold Coast Suns',
    away_team_name='Geelong Cats',
    round_name='R8',
    venue_name='Perth Stadium',
)

{'home_score': 67, 'away_score': 81, 'match_confidence': 78.75111974863937}

In [None]:
# Richmond Tigers (home) vs Carlton Blues, R1 at the M.C.G.; year and temperatures use the defaults
predict_match(
    home_team_name='Richmond Tigers',
    away_team_name='Carlton Blues',
    round_name='R1',
    venue_name='M.C.G.',
)

{'home_score': 87, 'away_score': 81, 'match_confidence': 66.26880925048664}

In [None]:
# Richmond Tigers (home) vs Carlton Blues, R1 at Adelaide Oval; year and temperatures use the defaults
predict_match(
    home_team_name='Richmond Tigers',
    away_team_name='Carlton Blues',
    round_name='R1',
    venue_name='Adelaide Oval',
)

{'home_score': 86, 'away_score': 68, 'match_confidence': 56.18834496069047}

In [None]:
# Port Adelaide Power (home) vs Fremantle Dockers, R23 at the M.C.G.; year and temperatures use the defaults
predict_match(
    home_team_name='Port Adelaide Power',
    away_team_name='Fremantle Dockers',
    round_name='R23',
    venue_name='M.C.G.',
)

{'home_score': 86, 'away_score': 63, 'match_confidence': 54.35258954631658}

In [None]:
# Port Adelaide Power (home) vs Fremantle Dockers, R23 at Perth Stadium; year and temperatures use the defaults
predict_match(
    home_team_name='Port Adelaide Power',
    away_team_name='Fremantle Dockers',
    round_name='R23',
    venue_name='Perth Stadium',
)

{'home_score': 85, 'away_score': 56, 'match_confidence': 32.7499248361391}

In [None]:
# Melbourne Demons (home) vs Geelong Cats, R1 at the M.C.G.; year and temperatures use the defaults
predict_match(
    home_team_name='Melbourne Demons',
    away_team_name='Geelong Cats',
    round_name='R1',
    venue_name='M.C.G.',
)

{'home_score': 83, 'away_score': 83, 'match_confidence': 69.56574689562068}

In [None]:
# Melbourne Demons (home) vs Geelong Cats, R1 at Perth Stadium; year and temperatures use the defaults
predict_match(
    home_team_name='Melbourne Demons',
    away_team_name='Geelong Cats',
    round_name='R1',
    venue_name='Perth Stadium',
)

{'home_score': 76, 'away_score': 70, 'match_confidence': 71.37910817877076}

In [None]:
# Collingwood Magpies (home) vs GWS Giants, R1 at the M.C.G.; year and temperatures use the defaults
predict_match(
    home_team_name='Collingwood Magpies',
    away_team_name='GWS Giants',
    round_name='R1',
    venue_name='M.C.G.',
)

{'home_score': 76, 'away_score': 80, 'match_confidence': 59.25456090499605}

In [None]:
# Collingwood Magpies (home) vs GWS Giants, R1 at Perth Stadium; year and temperatures use the defaults
predict_match(
    home_team_name='Collingwood Magpies',
    away_team_name='GWS Giants',
    round_name='R1',
    venue_name='Perth Stadium',
)

{'home_score': 74, 'away_score': 75, 'match_confidence': 73.96345683448537}

In [None]:
import pandas as pd

# Fixtures to predict: (home team, away team, round, venue, max temp, min temp)
batch_fixtures = [
    ("Brisbane Lions", "Melbourne Demons", "R23", "Gabba", 28.1, 8.4),
    ("GWS Giants", "Fremantle Dockers", "R23", "Manuka Oval", 13.0, 4.4),
    ("North Melbourne Kangaroos", "Gold Coast Suns", "R23", "Docklands", 13.2, 9.7),
    ("Geelong Cats", "West Coast Eagles", "R23", "Kardinia Park", 13.7, 8.9),
    ("Essendon Bombers", "Richmond Tigers", "R23", "M.C.G.", 13.2, 9.7),
    ("Port Adelaide Power", "Adelaide Crows", "R23", "Adelaide Oval", 15.1, 12.4),
    ("Hawthorn Hawks", "Western Bulldogs", "R23", "York Park", 12.1, 1.5),
    ("Carlton Blues", "Collingwood Magpies", "R23", "M.C.G.", 14.2, 6.3),
]
df_batch = pd.DataFrame(
    batch_fixtures,
    columns=["hometeam", "awayteam", "round", "venue", "maxtemp", "mintemp"],
)
df_batch["year"] = 2022  # Add the year

# Predict each fixture in turn, printing a separator line before every result
for fixture in df_batch.to_dict(orient="records"):
    print("=" * 50)
    prediction = predict_match(
        home_team_name=fixture['hometeam'],
        away_team_name=fixture['awayteam'],
        round_name=fixture['round'],
        venue_name=fixture['venue'],
        year=fixture['year'],
        maxtemp=fixture['maxtemp'],
        mintemp=fixture['mintemp'],
    )
    print(prediction)


{'home_score': 86, 'away_score': 88, 'match_confidence': 78.34531327721537}
{'home_score': 81, 'away_score': 67, 'match_confidence': 61.55625145328327}
{'home_score': 80, 'away_score': 92, 'match_confidence': 66.13787539378765}
{'home_score': 111, 'away_score': 62, 'match_confidence': 94.76872809110465}
{'home_score': 66, 'away_score': 95, 'match_confidence': 80.37984064170706}
{'home_score': 101, 'away_score': 74, 'match_confidence': 88.03214912475704}
{'home_score': 72, 'away_score': 87, 'match_confidence': 73.74553625692673}
{'home_score': 74, 'away_score': 87, 'match_confidence': 82.17311031599931}


In [None]:
import pickle

# Persist the stage-1 stat models and the label encoders so predictions can be
# reproduced without retraining. Each object goes to its own pickle file.
pickle_targets = (
    ("/content/player_stat_models.pkl", stat_models),   # player stat models
    ("/content/label_encoders.pkl", label_encoders),    # label encoders
)
for target_path, payload in pickle_targets:
    with open(target_path, "wb") as f:
        pickle.dump(payload, f)

print("✅ Player stat models and encoders saved!")


✅ Player stat models and encoders saved!


In [None]:
# Port Adelaide Power (home) vs Fremantle Dockers, R1 at Adelaide Oval, 2018, with explicit temperatures
predict_match(
    home_team_name="Port Adelaide Power",
    away_team_name="Fremantle Dockers",
    round_name="R1",
    venue_name="Adelaide Oval",
    year=2018,
    maxtemp=28.7,
    mintemp=14,
)

Predicted Score:
Port Adelaide Power: 100
Fremantle Dockers: 73


(100, 73)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def _point_predictions(fitted_model, feature_frame):
    """Return one predicted score per row as a 1-D array."""
    raw = np.asarray(fitted_model.predict(feature_frame))
    # A model trained with loss_function='RMSEWithUncertainty' yields two values per
    # row; predict_match treats the first as the score estimate, so keep that column.
    return raw[:, 0] if raw.ndim == 2 else raw


def _score_metrics(y_true, y_pred_rounded):
    """Return (MAE, RMSE, R²) of already-rounded predictions against the true scores."""
    return (
        mean_absolute_error(y_true, y_pred_rounded),
        np.sqrt(mean_squared_error(y_true, y_pred_rounded)),
        r2_score(y_true, y_pred_rounded),
    )


# Step 1: Prepare test data for evaluation
# (a) Metadata plus the ACTUAL match stats. These stats include Goals and Behinds,
#     which determine the score, and none of them is known before a match - so this
#     variant is only an upper bound on what the score models can achieve.
X_test_full = X_test[metadata_features + player_stats]

# (b) Metadata plus the stats PREDICTED by the stage-1 models, which is what
#     predict_match feeds the score models at prediction time.
#     NOTE: the stage-1 models were fitted on every row of df_matches, test rows
#     included, so even these end-to-end numbers are optimistic.
X_test_meta = X_test[metadata_features]
X_test_e2e = X_test_meta.copy()
for stat in player_stats:
    X_test_e2e[stat] = stat_models[stat].predict(X_test_meta)

# Step 2: Predict on test set
y_pred_home = _point_predictions(model_home, X_test_full)
y_pred_away = _point_predictions(model_away, X_test_full)
y_pred_home_e2e = _point_predictions(model_home, X_test_e2e)
y_pred_away_e2e = _point_predictions(model_away, X_test_e2e)

# Step 3: Round predictions (optional for realism)
y_pred_home_int = np.round(y_pred_home)
y_pred_away_int = np.round(y_pred_away)
y_pred_home_e2e_int = np.round(y_pred_home_e2e)
y_pred_away_e2e_int = np.round(y_pred_away_e2e)

# Step 4: Calculate evaluation metrics (all on the rounded predictions)
mae_home, rmse_home, r2_home = _score_metrics(y_test_home, y_pred_home_int)
mae_away, rmse_away, r2_away = _score_metrics(y_test_away, y_pred_away_int)
mae_home_e2e, rmse_home_e2e, r2_home_e2e = _score_metrics(y_test_home, y_pred_home_e2e_int)
mae_away_e2e, rmse_away_e2e, r2_away_e2e = _score_metrics(y_test_away, y_pred_away_e2e_int)

# Step 5: Print results
print("With actual match stats as inputs (upper bound, not available before a match):")
print(f"🏠 Home Score - MAE: {mae_home:.2f}, RMSE: {rmse_home:.2f}, R²: {r2_home:.4f}")
print(f"🧳 Away Score - MAE: {mae_away:.2f}, RMSE: {rmse_away:.2f}, R²: {r2_away:.4f}")
print("With stage-1 predicted stats as inputs (end-to-end, as used by predict_match):")
print(f"🏠 Home Score - MAE: {mae_home_e2e:.2f}, RMSE: {rmse_home_e2e:.2f}, R²: {r2_home_e2e:.4f}")
print(f"🧳 Away Score - MAE: {mae_away_e2e:.2f}, RMSE: {rmse_away_e2e:.2f}, R²: {r2_away_e2e:.4f}")


🏠 Home Score - MAE: 2.76, RMSE: 3.62, R²: 0.9803
🧳 Away Score - MAE: 2.74, RMSE: 3.80, R²: 0.9786
