In [103]:
import pandas as pd

matches = pd.read_csv("matches.csv", index_col=0)

# Clean column names by stripping whitespace
matches.columns = matches.columns.str.strip()

# Verify and process the 'date' column
if 'date' in matches.columns:
    matches['date'] = pd.to_datetime(matches['date'], errors='coerce')

# Encode categorical variables
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
matches['team'] = matches['team'].str.replace(r'\n2024-2025', '', regex=True)

# Map day of the week to numerical codes
day_mapping = {
    "Mon": 1, "Tue": 2, "Wed": 3, "Thu": 4,
    "Fri": 5, "Sat": 6, "Sun": 7
}
matches['day'] = matches['day'].str.strip().str.title()
matches['day_code'] = matches['day'].map(day_mapping)

# Create target column
matches["target"] = (matches["result"] == "W").astype("int")

# Process the 'round' column
matches['round'] = matches['round'].str.extract('(\d+)').astype(int)
matches = matches.sort_values(by='round').reset_index(drop=True)
matches["team"] = matches["team"].str.strip()

# Group matches by team
grouped_matches = matches.groupby("team")


# Define rolling average function
def rolling_averages(group, cols, new_cols):
    """
    Calculate rolling averages for specified columns within a group, 
    ensuring no rows are dropped by filling NaN values with 0.
    """
    group = group.sort_values("round")  # Ensure rows are sorted by round
    rolling_stats = group[cols].rolling(3, closed="left").mean()  # Calculate 3-period rolling averages
    group[new_cols] = rolling_stats  # Assign rolling averages to new columns
    group[new_cols] = group[new_cols].fillna(0)  # Fill NaN values with 0 to retain all rows
    return group


# Define the columns for rolling averages
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt", "xg", "xga", 'poss']
new_cols = [f"{c}_rolling" for c in cols]

# Apply rolling averages
matches_rolling = matches.groupby("team", group_keys=False).apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling.reset_index(drop=True, inplace=True)


# Add win/loss streaks and interactions
def calculate_streaks_and_interactions(matches_rolling):
    matches_rolling['win_streak'] = 0
    matches_rolling['loss_streak'] = 0
    matches_rolling['win_xg_interaction'] = 0.0
    matches_rolling['loss_xga_interaction'] = 0.0

    for team in matches_rolling['team'].unique():
        team_data = matches_rolling[matches_rolling['team'] == team]
        win_streak, loss_streak = 0, 0

        for index, row in team_data.iterrows():
            if row['target'] == 1:  # Win
                win_streak += 1
                loss_streak = 0
            elif row['target'] == 0:  # Loss
                loss_streak += 1
                win_streak = 0

            matches_rolling.at[index, 'win_streak'] = win_streak
            matches_rolling.at[index, 'loss_streak'] = loss_streak
            matches_rolling.at[index, 'win_xg_interaction'] = win_streak * row.get('xg_rolling', 0)
            matches_rolling.at[index, 'loss_xga_interaction'] = loss_streak * row.get('xga_rolling', 0)

    matches_rolling['win_streak'] = matches_rolling['win_streak'].astype(int)
    matches_rolling['loss_streak'] = matches_rolling['loss_streak'].astype(int)
    matches_rolling['win_xg_interaction'] = matches_rolling['win_xg_interaction'].astype(float)
    matches_rolling['loss_xga_interaction'] = matches_rolling['loss_xga_interaction'].astype(float)
    return matches_rolling


matches_rolling = calculate_streaks_and_interactions(matches_rolling)


# Add advanced features, including Low Goals Conceded, xG Difference, and Combined xG
def add_advanced_features(matches_rolling):
    df = matches_rolling.copy()

    # Low Goals Conceded Rolling Average
    df['low_ga_rolling'] = df['ga_rolling']

    # Rolling xG Difference
    df['xg_diff_rolling'] = abs(df['xg_rolling'] - df['xga_rolling'])

    # Combined xG
    df['combined_xg_rolling'] = df['xg_rolling'] + df['xga_rolling']

    # Recent form
    form = df.groupby('team')['target'].rolling(window=5, closed='left').mean()
    df['recent_form'] = form.reset_index(level=0, drop=True)

    # Goal difference
    df['goal_diff_rolling'] = df['gf_rolling'] - df['ga_rolling']

    # Shot accuracy
    df['shot_accuracy'] = df['sot_rolling'] / df['sh_rolling'].replace(0, 1)

    # Fill NaN values with 0
    df = df.fillna(0)

    return df


# Apply advanced features
matches_rolling = add_advanced_features(matches_rolling)
# Add interaction terms for draw-specific features
# Combined xG
matches_rolling['combined_xg_rolling'] = matches_rolling['xg_rolling'] + matches_rolling['xga_rolling']
matches_rolling['xg_low_ga_interaction'] = matches_rolling['xg_diff_rolling'] * matches_rolling['low_ga_rolling']
matches_rolling['xg_combined_interaction'] = matches_rolling['xg_diff_rolling'] * matches_rolling['combined_xg_rolling']
matches_rolling['low_ga_combined_interaction'] = matches_rolling['low_ga_rolling'] * matches_rolling['combined_xg_rolling']


# Function to calculate H2H statistics
def calculate_h2h_stats(historical_data, team, opponent):
    """
    Calculate head-to-head (H2H) statistics between two teams.
    """
    h2h_matches = historical_data[
        ((historical_data['team'] == team) & (historical_data['opponent'] == opponent)) |
        ((historical_data['team'] == opponent) & (historical_data['opponent'] == team))
    ]
    
    if h2h_matches.empty:
        return 0, 0  # Default values if no H2H matches found

    # Calculate H2H win percentage for the "team"
    team_wins = len(h2h_matches[(h2h_matches['team'] == team) & (h2h_matches['result'] == 'W')])
    total_h2h_matches = len(h2h_matches)
    h2h_win_percentage = team_wins / total_h2h_matches if total_h2h_matches > 0 else 0

    # Calculate H2H average goal difference (from the "team" perspective)
    team_goals = h2h_matches[h2h_matches['team'] == team]['gf'].sum()
    opponent_goals = h2h_matches[h2h_matches['team'] == team]['ga'].sum()
    h2h_avg_goal_diff = (team_goals - opponent_goals) / total_h2h_matches if total_h2h_matches > 0 else 0

    return h2h_win_percentage, h2h_avg_goal_diff

# Function to add H2H features to the dataset
def add_h2h_features(matches, historical_data):
    """
    Add H2H statistics as features to the dataset.
    """
    matches['h2h_win_percentage'] = 0.0
    matches['h2h_avg_goal_diff'] = 0.0

    for i, row in matches.iterrows():
        team = row['team']
        opponent = row['opponent']
        h2h_win_percentage, h2h_avg_goal_diff = calculate_h2h_stats(historical_data, team, opponent)
        matches.at[i, 'h2h_win_percentage'] = h2h_win_percentage
        matches.at[i, 'h2h_avg_goal_diff'] = h2h_avg_goal_diff

    return matches

# Apply H2H features to matches_rolling
matches_rolling = add_h2h_features(matches_rolling, matches_rolling)

# Update feature list to include H2H stats
features = [
    'venue_code', 'opp_code', 'hour', 'day_code',
    'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 
    'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling',
    'xg_rolling', 'xga_rolling', 'recent_form', 'goal_diff_rolling', 
    'shot_accuracy', 'low_ga_rolling','xg_diff_rolling','combined_xg_rolling',
    'xg_low_ga_interaction',
    'xg_combined_interaction','low_ga_combined_interaction','h2h_win_percentage', 'h2h_avg_goal_diff','poss'  # Add H2H features
]
target = 'target'
# Display the updated DataFrame
print(matches_rolling.columns.tolist()
)
print(matches_rolling.head())

['time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga', 'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation', 'opp formation', 'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt', 'season', 'team', 'venue_code', 'opp_code', 'hour', 'day_code', 'target', 'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling', 'xg_rolling', 'xga_rolling', 'poss_rolling', 'win_streak', 'loss_streak', 'win_xg_interaction', 'loss_xga_interaction', 'low_ga_rolling', 'xg_diff_rolling', 'combined_xg_rolling', 'recent_form', 'goal_diff_rolling', 'shot_accuracy', 'xg_low_ga_interaction', 'xg_combined_interaction', 'low_ga_combined_interaction', 'h2h_win_percentage', 'h2h_avg_goal_diff']
    time            comp  round  day venue result   gf   ga         opponent  \
0  16:30  Premier League      1  Sun  Away      L  0.0  1.0        Tottenham   
1  20:00  Premier League      1  Fri  Away      L  0.0  1.0

In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import xgboost as xgb

def create_match_prediction_model(matches_rolling):
    # Define features for the model
    features = [
        'venue_code', 'opp_code', 'hour', 'day_code',
        'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 
        'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling',
        'xg_rolling', 'xga_rolling', 'recent_form', 'goal_diff_rolling', 
        'shot_accuracy','low_ga_rolling','xg_diff_rolling','combined_xg_rolling',
        'xg_low_ga_interaction','xg_combined_interaction','low_ga_combined_interaction',
        'h2h_win_percentage', 'h2h_avg_goal_diff','poss_rolling'
    ]
    
    # Prepare X and y
    X = matches_rolling[features]
    y = matches_rolling['target']
    
    # Initialize time series split
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Initialize scaler and model
    scaler = StandardScaler()
    xgb_model = xgb.XGBClassifier(random_state=42)
    
    xgb_scores = []
    
    # Perform time series cross-validation
    for train_idx, test_idx in tscv.split(X):
        # Split data
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        # Scale features
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Train and evaluate XGBoost
        xgb_model.fit(X_train_scaled, y_train)
        xgb_pred = xgb_model.predict(X_test_scaled)
        xgb_scores.append(accuracy_score(y_test, xgb_pred))
    
    # Print results
    print("XGBoost Average Accuracy:", np.mean(xgb_scores))
    
    # Train final model on all data
    X_scaled = scaler.fit_transform(X)
    final_model = xgb.XGBClassifier(random_state=42)
    final_model.fit(X_scaled, y)
    print("\nFinal Model: XGBoost")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': final_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
    
    return scaler, final_model, feature_importance

# Example usage
# Assuming `matches_rolling` is your dataset
scaler, model, importance = create_match_prediction_model(matches_rolling)


XGBoost Average Accuracy: 0.8573363431151242

Final Model: XGBoost

Top 10 Most Important Features:
               feature  importance
23  h2h_win_percentage    0.754338
0           venue_code    0.049963
10          pk_rolling    0.011292
12          xg_rolling    0.010060
3             day_code    0.009792
15   goal_diff_rolling    0.009672
4           gf_rolling    0.009508
2                 hour    0.009465
8         dist_rolling    0.009332
25        poss_rolling    0.009231


In [14]:
matches_rolling

Unnamed: 0,time,comp,round,day,venue,result,gf,ga,opponent,xg,...,xg_diff_rolling,combined_xg_rolling,recent_form,goal_diff_rolling,shot_accuracy,xg_low_ga_interaction,xg_combined_interaction,low_ga_combined_interaction,h2h_win_percentage,h2h_avg_goal_diff
0,16:30,Premier League,1,Sun,Away,L,0.0,1.0,Tottenham,1.8,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,-1.0
1,20:00,Premier League,1,Fri,Away,L,0.0,1.0,Manchester Utd,0.4,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,-1.0
2,15:00,Premier League,1,Sat,Away,L,2.0,3.0,Watford,1.2,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,-1.0
3,15:00,Premier League,1,Sat,Away,D,1.0,1.0,Nott'ham Forest,1.2,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,15:00,Premier League,1,Sat,Away,L,1.0,3.0,Everton,0.8,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2653,16:30,Premier League,38,Sun,Away,L,0.0,1.0,Brentford,1.6,...,0.533333,2.533333,1.0,1.666667,0.292683,0.177778,1.351111,0.844444,0.0,-1.0
2654,16:00,Premier League,38,Sun,Away,W,2.0,0.0,Brighton,1.3,...,0.566667,2.966667,0.2,-1.666667,0.291667,1.133333,1.681111,5.933333,0.5,0.0
2655,16:30,Premier League,38,Sun,Away,L,1.0,2.0,Manchester Utd,1.8,...,0.033333,3.166667,0.4,1.333333,0.361111,0.055556,0.105556,5.277778,0.0,-1.0
2656,16:00,Premier League,38,Sun,Home,L,2.0,4.0,Fulham,2.0,...,1.366667,2.500000,0.0,-1.000000,0.387097,2.733333,3.416667,5.000000,0.0,-1.5


In [147]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

# Assume matches_rolling is your dataset
# Define features and target
features = [
    'venue_code', 'opp_code', 'hour', 'day_code',
    'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 
    'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling',
    'xg_rolling', 'xga_rolling', 'recent_form', 'goal_diff_rolling', 
    'shot_accuracy','low_ga_rolling','xg_diff_rolling','combined_xg_rolling','xg_low_ga_interaction',
    'xg_combined_interaction','low_ga_combined_interaction','h2h_win_percentage', 'h2h_avg_goal_diff','poss_rolling'
]

# Update target: 0 = Loss, 1 = Draw, 2 = Win
# Ensure your `matches_rolling['result']` has values mapped as follows:
# "L" -> 0, "D" -> 1, "W" -> 2
target = 'target'
matches_rolling['target'] = matches_rolling['result'].map({'L': 0, 'D': 1, 'W': 2})

X = matches_rolling[features]
y = matches_rolling[target]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Define best parameters for multi-class classification
best_params = {
    'colsample_bytree': 0.9,
    'gamma': 0.1,
    'learning_rate': 0.01,
    'min_child_weight': 1,
    'max_depth': 3,  # Reduce depth for simpler trees
    'n_estimators': 500,
    'subsample': 1.0,
    'random_state': 42,
    'objective': 'multi:softprob',  # Multi-class classification
    'num_class': 3  # Number of classes: Loss, Draw, Win
}

# Retrain XGBoost model with best parameters
xgb_model = xgb.XGBClassifier(**best_params)
xgb_model.fit(X_train_scaled, y_train)

# Predict on test set
y_proba = xgb_model.predict_proba(X_test_scaled)  # Probabilities for each class
y_pred = np.argmax(y_proba, axis=1)  # Get class with highest probability

# Evaluate the model
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Display probabilities for the first few samples (example)
print("\nSample Predictions:")
for i in range(5):
    print(f"Match {i+1}: Loss={y_proba[i][0]:.2f}, Draw={y_proba[i][1]:.2f}, Win={y_proba[i][2]:.2f} -> Predicted: {y_pred[i]}")



Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       204
           1       0.88      0.40      0.54       124
           2       0.76      0.94      0.84       204

    accuracy                           0.79       532
   macro avg       0.81      0.74      0.74       532
weighted avg       0.80      0.79      0.77       532


Confusion Matrix:
[[180   4  20]
 [ 34  49  41]
 [ 10   3 191]]

Sample Predictions:
Match 1: Loss=0.01, Draw=0.01, Win=0.99 -> Predicted: 2
Match 2: Loss=0.22, Draw=0.32, Win=0.45 -> Predicted: 2
Match 3: Loss=0.01, Draw=0.01, Win=0.98 -> Predicted: 2
Match 4: Loss=0.01, Draw=0.01, Win=0.99 -> Predicted: 2
Match 5: Loss=0.01, Draw=0.01, Win=0.99 -> Predicted: 2


In [161]:
# Optional: Save the trained model
import joblib
joblib.dump(xgb_model, "xgb_model.joblib")
print("\nModel saved as 'xgb_model.joblib'")


Model saved as 'xgb_model.joblib'


In [167]:
import pandas as pd
import numpy as np
from datetime import datetime

# Create DataFrame for new matches with all games from Jan 14-16
new_matches = pd.DataFrame({
    'date': [
        '2025-01-14', '2025-01-14', '2025-01-14', '2025-01-14',  # Jan 14 matches
        '2025-01-15', '2025-01-15', '2025-01-15', '2025-01-15',  # Jan 15 matches
        '2025-01-16', '2025-01-16'  # Jan 16 matches
    ],
    'time': [
        '19:30', '19:30', '19:30', '20:00',  # Jan 14 times
        '19:30', '19:30', '19:30', '20:00',  # Jan 15 times
        '19:30', '20:00'  # Jan 16 times
    ],
    'team': [
        'Brentford', 'West Ham', 'Chelsea', "Nott'ham Forest",  # Jan 14 home teams
        'Everton', 'Newcastle Utd', 'Leicester City', 'Arsenal',  # Jan 15 home teams
        'Ipswich Town', 'Manchester Utd'  # Jan 16 home teams
    ],
    'opponent': [
        'Manchester City', 'Fulham', 'Bournemouth', 'Liverpool',  # Jan 14 away teams
        'Aston Villa', 'Wolves', 'Crystal Palace', 'Tottenham',  # Jan 15 away teams
        'Brighton', 'Southampton'  # Jan 16 away teams
    ],
    'venue': ['Home'] * 10,  # All matches are home games for the first team
    'day': [
        'Tue', 'Tue', 'Tue', 'Tue',  # Jan 14 days
        'Wed', 'Wed', 'Wed', 'Wed',  # Jan 15 days
        'Thu', 'Thu'  # Jan 16 days
    ]
})

def prepare_new_matches(new_matches, matches_rolling, scaler, features):
    """
    Prepare new matches data using the same preprocessing steps as training data
    """
    # Copy the new matches dataframe
    pred_matches = new_matches.copy()
    
    # Encode categorical variables
    venue_mapping = {'Home': 1, 'Away': 0}
    pred_matches['venue_code'] = pred_matches['venue'].map(venue_mapping)
    
    # Get opponent codes from existing data
    opp_mapping = dict(zip(matches_rolling['opponent'].unique(), 
                          matches_rolling['opp_code'].unique()))
    pred_matches['opp_code'] = pred_matches['opponent'].map(opp_mapping)
    
    # Extract hour from time
    pred_matches['hour'] = pred_matches['time'].str.split(':').str[0].astype(int)
    
    # Map day codes
    day_mapping = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}
    pred_matches['day_code'] = pred_matches['day'].map(day_mapping)
    
    # Get latest rolling averages for each team
    latest_stats = matches_rolling.groupby('team').last().reset_index()
    
    # List of all rolling and derived features we need
    required_features = [
        'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 
        'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling',
        'xg_rolling', 'xga_rolling', 'recent_form', 'goal_diff_rolling', 
        'shot_accuracy', 'low_ga_rolling', 'xg_diff_rolling', 'combined_xg_rolling',
        'xg_low_ga_interaction', 'xg_combined_interaction', 
        'low_ga_combined_interaction', 'h2h_win_percentage', 
        'h2h_avg_goal_diff', 'poss_rolling'
    ]
    
    # Merge rolling statistics
    pred_matches = pred_matches.merge(
        latest_stats[['team'] + required_features], 
        on='team', 
        how='left'
    )
    
    # Fill any missing values with 0
    pred_matches = pred_matches.fillna(0)
    
    # Ensure we have exactly the same features in the same order as training
    X_pred = pred_matches[features].copy()
    
    # Verify we have all required features before scaling
    missing_features = set(features) - set(X_pred.columns)
    if missing_features:
        raise ValueError(f"Missing features: {missing_features}")
    
    # Scale features
    X_pred_scaled = scaler.transform(X_pred[features])
    
    return X_pred_scaled, pred_matches

def predict_matches(X_pred_scaled, pred_matches, xgb_model):
    """
    Make predictions and format results
    """
    # Get probability predictions
    y_proba = xgb_model.predict_proba(X_pred_scaled)
    
    # Create results DataFrame
    results = pd.DataFrame({
        'Date': pd.to_datetime(pred_matches['date']).dt.strftime('%Y-%m-%d'),
        'Time': pred_matches['time'],
        'Home Team': pred_matches['team'],
        'Away Team': pred_matches['opponent'],
        'Venue': pred_matches['venue'],
        'Loss Prob': [f"{prob[0]:.2%}" for prob in y_proba],
        'Draw Prob': [f"{prob[1]:.2%}" for prob in y_proba],
        'Win Prob': [f"{prob[2]:.2%}" for prob in y_proba],
        'Predicted Result': ['Win' if np.argmax(prob) == 2 else 'Draw' if np.argmax(prob) == 1 else 'Loss' 
                           for prob in y_proba]
    })
    
    # Sort by date and time
    results = results.sort_values(['Date', 'Time'])
    
    return results

# Print features to verify
print("Features being used:", features)

# Make predictions
X_pred_scaled, pred_matches = prepare_new_matches(new_matches, matches_rolling, scaler, features)
results = predict_matches(X_pred_scaled, pred_matches, xgb_model)

# Display results with better formatting
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
print("\nMatch Predictions:")
print(results.to_string(index=False))

# Optional: Save predictions to CSV
# results.to_csv('match_predictions.csv', index=False)

Features being used: ['venue_code', 'opp_code', 'hour', 'day_code', 'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling', 'xg_rolling', 'xga_rolling', 'recent_form', 'goal_diff_rolling', 'shot_accuracy', 'low_ga_rolling', 'xg_diff_rolling', 'combined_xg_rolling', 'xg_low_ga_interaction', 'xg_combined_interaction', 'low_ga_combined_interaction', 'h2h_win_percentage', 'h2h_avg_goal_diff', 'poss_rolling']

Match Predictions:
      Date  Time       Home Team       Away Team Venue Loss Prob Draw Prob Win Prob Predicted Result
2025-01-14 19:30       Brentford Manchester City  Home    90.46%     8.23%    1.31%             Loss
2025-01-14 19:30        West Ham          Fulham  Home    67.32%    31.00%    1.68%             Loss
2025-01-14 19:30         Chelsea     Bournemouth  Home    91.71%     7.14%    1.15%             Loss
2025-01-14 20:00 Nott'ham Forest       Liverpool  Home    67.32%    31.00%    1.68%             Loss
2025-

In [169]:
import pandas as pd
import numpy as np

# First, let's check team names in our dataset
print("Unique teams in matches_rolling:")
print(sorted(matches_rolling['team'].unique()))

# Modified prepare_new_matches function with additional checks
def prepare_new_matches(new_matches, matches_rolling, scaler, features):
    """
    Prepare new matches data with additional validation
    """
    pred_matches = new_matches.copy()
    
    # Print team mapping check
    print("\nChecking team mappings...")
    for team in pred_matches['team'].unique():
        matches = matches_rolling[matches_rolling['team'] == team]
        if len(matches) == 0:
            print(f"Warning: No data found for team {team}")
        else:
            print(f"Found {len(matches)} matches for {team}")
    
    # Encode categorical variables
    venue_mapping = {'Home': 1, 'Away': 0}
    pred_matches['venue_code'] = pred_matches['venue'].map(venue_mapping)
    
    # Get opponent codes with verification
    all_teams = set(matches_rolling['team'].unique()) | set(matches_rolling['opponent'].unique())
    opp_mapping = dict(zip(matches_rolling['opponent'].unique(), 
                          matches_rolling['opp_code'].unique()))
    
    # Verify opponent mapping
    for opp in pred_matches['opponent']:
        if opp not in opp_mapping:
            print(f"Warning: No opponent code found for {opp}")
    
    pred_matches['opp_code'] = pred_matches['opponent'].map(opp_mapping)
    
    # Time and day processing
    pred_matches['hour'] = pred_matches['time'].str.split(':').str[0].astype(int)
    day_mapping = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}
    pred_matches['day_code'] = pred_matches['day'].map(day_mapping)
    
    # Get latest statistics with verification
    latest_stats = matches_rolling.groupby('team').last().reset_index()
    
    # Print sample of rolling averages for verification
    print("\nSample of rolling averages:")
    print(latest_stats[['team', 'gf_rolling', 'ga_rolling', 'xg_rolling']].head())
    
    # Merge statistics
    pred_matches = pred_matches.merge(
        latest_stats[['team'] + [col for col in features if col not in ['venue_code', 'opp_code', 'hour', 'day_code']]], 
        on='team', 
        how='left'
    )
    
    # Check for missing values
    missing_cols = pred_matches.columns[pred_matches.isna().any()].tolist()
    if missing_cols:
        print(f"\nWarning: Missing values in columns: {missing_cols}")
    
    pred_matches = pred_matches.fillna(0)
    
    # Prepare features in correct order
    X_pred = pred_matches[features].copy()
    
    # Print feature ranges before and after scaling
    print("\nFeature ranges before scaling:")
    print(X_pred.describe().round(2))
    
    X_pred_scaled = scaler.transform(X_pred)
    
    print("\nFeature ranges after scaling:")
    print(pd.DataFrame(X_pred_scaled, columns=features).describe().round(2))
    
    return X_pred_scaled, pred_matches

# Create new matches DataFrame (same as before)
new_matches = pd.DataFrame({
    'date': [
        '2025-01-14', '2025-01-14', '2025-01-14', '2025-01-14',
        '2025-01-15', '2025-01-15', '2025-01-15', '2025-01-15',
        '2025-01-16', '2025-01-16'
    ],
    'time': [
        '19:30', '19:30', '19:30', '20:00',
        '19:30', '19:30', '19:30', '20:00',
        '19:30', '20:00'
    ],
    'team': [
        'Brentford', 'West Ham', 'Chelsea', "Nott'ham Forest",
        'Everton', 'Newcastle Utd', 'Leicester City', 'Arsenal',
        'Ipswich Town', 'Manchester Utd'
    ],
    'opponent': [
        'Manchester City', 'Fulham', 'Bournemouth', 'Liverpool',
        'Aston Villa', 'Wolves', 'Crystal Palace', 'Tottenham',
        'Brighton', 'Southampton'
    ],
    'venue': ['Home'] * 10,
    'day': [
        'Tue', 'Tue', 'Tue', 'Tue',
        'Wed', 'Wed', 'Wed', 'Wed',
        'Thu', 'Thu'
    ]
})

# Make predictions with enhanced debugging
X_pred_scaled, pred_matches = prepare_new_matches(new_matches, matches_rolling, scaler, features)
results = predict_matches(X_pred_scaled, pred_matches, xgb_model)

print("\nFinal Predictions:")
print(results.to_string(index=False))

Unique teams in matches_rolling:
['2021-2022 Arsenal', '2021-2022 Aston Villa', '2021-2022 Brentford', '2021-2022 Brighton & Hove Albion', '2021-2022 Burnley', '2021-2022 Chelsea', '2021-2022 Crystal Palace', '2021-2022 Everton', '2021-2022 Leeds United', '2021-2022 Leicester City', '2021-2022 Liverpool', '2021-2022 Manchester City', '2021-2022 Manchester United', '2021-2022 Newcastle United', '2021-2022 Norwich City', '2021-2022 Southampton', '2021-2022 Tottenham Hotspur', '2021-2022 Watford', '2021-2022 West Ham United', '2021-2022 Wolverhampton Wanderers', '2022-2023 Arsenal', '2022-2023 Aston Villa', '2022-2023 Bournemouth', '2022-2023 Brentford', '2022-2023 Brighton & Hove Albion', '2022-2023 Chelsea', '2022-2023 Crystal Palace', '2022-2023 Everton', '2022-2023 Fulham', '2022-2023 Leeds United', '2022-2023 Leicester City', '2022-2023 Liverpool', '2022-2023 Manchester City', '2022-2023 Manchester United', '2022-2023 Newcastle United', '2022-2023 Nottingham Forest', '2022-2023 South

In [171]:
# Check some training predictions
train_pred = xgb_model.predict_proba(X_train_scaled[:5])
print("\nSample training predictions:")
for i in range(5):
    print(f"Sample {i+1}: Loss={train_pred[i][0]:.2%}, Draw={train_pred[i][1]:.2%}, Win={train_pred[i][2]:.2%}")


Sample training predictions:
Sample 1: Loss=44.13%, Draw=14.77%, Win=41.09%
Sample 2: Loss=80.94%, Draw=17.37%, Win=1.69%
Sample 3: Loss=94.58%, Draw=4.59%, Win=0.83%
Sample 4: Loss=71.35%, Draw=26.02%, Win=2.63%
Sample 5: Loss=94.11%, Draw=5.01%, Win=0.88%


In [173]:
import pandas as pd
import numpy as np

# Create mapping dictionary for team names
team_name_mapping = {
    'West Ham': '2023-2024 West Ham United',
    'Newcastle Utd': '2023-2024 Newcastle United',
    'Manchester Utd': '2023-2024 Manchester United',
    "Nott'ham Forest": '2023-2024 Nottingham Forest',
    'Brentford': '2023-2024 Brentford',
    'Chelsea': '2023-2024 Chelsea',
    'Everton': '2023-2024 Everton',
    'Leicester City': '2023-2024 Leicester City',
    'Arsenal': '2023-2024 Arsenal',
    'Ipswich Town': '2023-2024 Ipswich Town'
}

# Update new_matches DataFrame
new_matches = pd.DataFrame({
    'date': [
        '2025-01-14', '2025-01-14', '2025-01-14', '2025-01-14',
        '2025-01-15', '2025-01-15', '2025-01-15', '2025-01-15',
        '2025-01-16', '2025-01-16'
    ],
    'time': [
        '19:30', '19:30', '19:30', '20:00',
        '19:30', '19:30', '19:30', '20:00',
        '19:30', '20:00'
    ],
    'team': [
        'Brentford', 'West Ham', 'Chelsea', "Nott'ham Forest",
        'Everton', 'Newcastle Utd', 'Leicester City', 'Arsenal',
        'Ipswich Town', 'Manchester Utd'
    ],
    'opponent': [
        'Manchester City', 'Fulham', 'Bournemouth', 'Liverpool',
        'Aston Villa', 'Wolves', 'Crystal Palace', 'Tottenham',
        'Brighton', 'Southampton'
    ],
    'venue': ['Home'] * 10,
    'day': [
        'Tue', 'Tue', 'Tue', 'Tue',
        'Wed', 'Wed', 'Wed', 'Wed',
        'Thu', 'Thu'
    ]
})

def prepare_new_matches(new_matches, matches_rolling, scaler, features):
    """
    Prepare new matches data with season-aware team names
    """
    pred_matches = new_matches.copy()
    
    # Map team names to include season
    pred_matches['team'] = pred_matches['team'].map(team_name_mapping)
    
    # Map opponent names similarly
    opponent_mapping = {name: f'2023-2024 {name}' for name in pred_matches['opponent'].unique()}
    pred_matches['opponent'] = pred_matches['opponent'].map(opponent_mapping)
    
    # Standard feature preparation
    pred_matches['venue_code'] = 1  # All home games
    
    # Get opponent codes from most recent season only
    recent_matches = matches_rolling[matches_rolling['team'].str.startswith('2023-2024')]
    opp_mapping = dict(zip(recent_matches['opponent'].unique(), 
                          recent_matches['opp_code'].unique()))
    pred_matches['opp_code'] = pred_matches['opponent'].map(opp_mapping)
    
    pred_matches['hour'] = pred_matches['time'].str.split(':').str[0].astype(int)
    day_mapping = {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}
    pred_matches['day_code'] = pred_matches['day'].map(day_mapping)
    
    # Get latest statistics from 2023-2024 season only
    latest_stats = recent_matches.groupby('team').last().reset_index()
    
    # Merge statistics
    pred_matches = pred_matches.merge(
        latest_stats[['team'] + [col for col in features if col not in ['venue_code', 'opp_code', 'hour', 'day_code']]], 
        on='team', 
        how='left'
    )
    
    # Fill any remaining missing values with medians from recent season
    for col in features:
        if col in pred_matches.columns and pred_matches[col].isna().any():
            median_val = recent_matches[col].median()
            pred_matches[col] = pred_matches[col].fillna(median_val)
    
    # Prepare features in correct order
    X_pred = pred_matches[features].copy()
    X_pred_scaled = scaler.transform(X_pred)
    
    return X_pred_scaled, pred_matches

def predict_matches(X_pred_scaled, pred_matches, xgb_model):
    """
    Make predictions with more detailed output
    """
    # Get probability predictions
    y_proba = xgb_model.predict_proba(X_pred_scaled)
    
    # Create results DataFrame
    results = pd.DataFrame({
        'Date': pd.to_datetime(pred_matches['date']).dt.strftime('%Y-%m-%d'),
        'Time': pred_matches['time'],
        'Home Team': pred_matches['team'].str.replace('2023-2024 ', ''),
        'Away Team': pred_matches['opponent'].str.replace('2023-2024 ', ''),
        'Loss Prob': [f"{prob[0]:.2%}" for prob in y_proba],
        'Draw Prob': [f"{prob[1]:.2%}" for prob in y_proba],
        'Win Prob': [f"{prob[2]:.2%}" for prob in y_proba],
        'Predicted Result': ['Win' if np.argmax(prob) == 2 else 'Draw' if np.argmax(prob) == 1 else 'Loss' 
                           for prob in y_proba]
    })
    
    return results.sort_values(['Date', 'Time'])

# Make predictions
X_pred_scaled, pred_matches = prepare_new_matches(new_matches, matches_rolling, scaler, features)
results = predict_matches(X_pred_scaled, pred_matches, xgb_model)

print("\nMatch Predictions:")
print(results.to_string(index=False))


Match Predictions:
      Date  Time         Home Team       Away Team Loss Prob Draw Prob Win Prob Predicted Result
2025-01-14 19:30         Brentford Manchester City    91.75%     7.09%    1.16%             Loss
2025-01-14 19:30   West Ham United          Fulham    91.41%     7.44%    1.15%             Loss
2025-01-14 19:30           Chelsea     Bournemouth    21.64%    41.49%   36.87%             Draw
2025-01-14 20:00 Nottingham Forest       Liverpool    24.34%    32.76%   42.89%              Win
2025-01-15 19:30           Everton     Aston Villa    91.24%     7.59%    1.17%             Loss
2025-01-15 19:30  Newcastle United          Wolves    18.22%    36.16%   45.62%              Win
2025-01-15 19:30    Leicester City  Crystal Palace    64.84%     5.80%   29.36%             Loss
2025-01-15 20:00           Arsenal       Tottenham    18.11%    41.60%   40.29%             Draw
2025-01-16 19:30      Ipswich Town        Brighton    65.90%     5.63%   28.48%             Loss
2025-01-16

In [177]:
import pandas as pd
import numpy as np
from datetime import datetime

# ELO Rating class remains the same
class EloRating:
    def __init__(self):
        self.base_ratings = {
            'Manchester City': 2000,
            'Arsenal': 1950,
            'Liverpool': 1950,
            'Aston Villa': 1900,
            'Tottenham': 1900,
            'Manchester United': 1880,
            'Newcastle United': 1870,
            'Brighton': 1850,
            'West Ham United': 1830,
            'Chelsea': 1820,
            'Brentford': 1800,
            'Wolves': 1780,
            'Crystal Palace': 1770,
            'Fulham': 1760,
            'Nottingham Forest': 1750,
            'Everton': 1740,
            'Bournemouth': 1730,
            'Leicester City': 1700,
            'Ipswich Town': 1650,
            'Southampton': 1650
        }
    
    def get_win_probability(self, team_a, team_b, home_advantage=100):
        rating_a = self.base_ratings.get(team_a, 1500) + home_advantage
        rating_b = self.base_ratings.get(team_b, 1500)
        return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def calculate_form_factor(team, matches_rolling):
    recent_matches = matches_rolling[matches_rolling['team'].str.contains(team, case=False)].tail(5)
    if len(recent_matches) == 0:
        return 1.0
    form_points = sum(recent_matches['target'].map({2: 3, 1: 1, 0: 0}))
    max_points = len(recent_matches) * 3
    return 0.5 + (form_points / max_points)

def prepare_new_matches_enhanced(new_matches, matches_rolling, scaler, features, elo_rater):
    """Enhanced match preparation with fixed column handling"""
    pred_matches = new_matches.copy()
    
    # Store original team names before any modifications
    pred_matches['original_team'] = pred_matches['team']
    pred_matches['original_opponent'] = pred_matches['opponent']
    
    # Map team names
    team_name_mapping = {
        'West Ham': 'West Ham United',
        'Newcastle Utd': 'Newcastle United',
        'Manchester Utd': 'Manchester United',
        "Nott'ham Forest": 'Nottingham Forest'
    }
    
    # Update team names
    pred_matches['team'] = pred_matches['team'].map(
        lambda x: team_name_mapping.get(x, x)
    )
    
    # Calculate ELO probabilities and form factors
    pred_matches['elo_win_prob'] = pred_matches.apply(
        lambda row: elo_rater.get_win_probability(row['team'], row['opponent']),
        axis=1
    )
    
    pred_matches['form_factor'] = pred_matches['team'].apply(
        lambda x: calculate_form_factor(x, matches_rolling)
    )
    
    # Add season prefix for database matching
    pred_matches['team_season'] = '2023-2024 ' + pred_matches['team']
    
    # Basic feature preparation
    pred_matches['venue_code'] = 1
    pred_matches['hour'] = pred_matches['time'].str.split(':').str[0].astype(int)
    pred_matches['day_code'] = pred_matches['day'].map(
        {'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}
    )
    
    # Get recent statistics
    recent_matches = matches_rolling[matches_rolling['team'].str.startswith('2023-2024')]
    latest_stats = recent_matches.groupby('team').last().reset_index()
    
    # Merge statistics
    pred_matches = pred_matches.merge(
        latest_stats[['team'] + [col for col in features if col not in ['venue_code', 'hour', 'day_code']]],
        left_on='team_season',
        right_on='team',
        how='left',
        suffixes=('', '_y')
    )
    
    # Keep original columns
    pred_matches['team'] = pred_matches['original_team']
    pred_matches['opponent'] = pred_matches['original_opponent']
    
    # Fill missing values with medians
    for col in features:
        if col in pred_matches.columns and pred_matches[col].isna().any():
            median_val = recent_matches[col].median()
            pred_matches[col] = pred_matches[col].fillna(median_val)
    
    # Prepare features
    X_pred = pred_matches[features].copy()
    X_pred_scaled = scaler.transform(X_pred)
    
    return X_pred_scaled, pred_matches

def blend_predictions(xgb_proba, elo_prob, form_factor):
    xgb_array = np.array(xgb_proba)
    elo_array = np.array([1 - elo_prob - 0.2, 0.2, elo_prob])
    blended = (0.6 * xgb_array + 0.4 * elo_array) * form_factor
    return blended / blended.sum()

def predict_matches_enhanced(X_pred_scaled, pred_matches, xgb_model):
    """Enhanced prediction function with fixed column references"""
    results = []
    
    for i in range(len(X_pred_scaled)):
        xgb_proba = xgb_model.predict_proba(X_pred_scaled[i:i+1])[0]
        
        final_proba = blend_predictions(
            xgb_proba,
            pred_matches['elo_win_prob'].iloc[i],
            pred_matches['form_factor'].iloc[i]
        )
        
        pred_result = ['Loss', 'Draw', 'Win'][np.argmax(final_proba)]
        
        results.append({
            'Date': pd.to_datetime(pred_matches['date'].iloc[i]).strftime('%Y-%m-%d'),
            'Time': pred_matches['time'].iloc[i],
            'Home Team': pred_matches['original_team'].iloc[i],
            'Away Team': pred_matches['original_opponent'].iloc[i],
            'Loss Prob': f"{final_proba[0]:.2%}",
            'Draw Prob': f"{final_proba[1]:.2%}",
            'Win Prob': f"{final_proba[2]:.2%}",
            'Predicted Result': pred_result,
            'Form Factor': f"{pred_matches['form_factor'].iloc[i]:.2f}",
            'ELO Win Prob': f"{pred_matches['elo_win_prob'].iloc[i]:.2%}"
        })
    
    return pd.DataFrame(results)

# Initialize ELO rater
elo_rater = EloRating()

# Make predictions
X_pred_scaled, pred_matches = prepare_new_matches_enhanced(new_matches, matches_rolling, scaler, features, elo_rater)
results = predict_matches_enhanced(X_pred_scaled, pred_matches, xgb_model)

# Display results
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
print("\nEnhanced Match Predictions:")
print(results.to_string(index=False))


Enhanced Match Predictions:
      Date  Time       Home Team       Away Team Loss Prob Draw Prob Win Prob Predicted Result Form Factor ELO Win Prob
2025-01-14 19:30       Brentford Manchester City    72.65%    12.25%   15.10%             Loss        1.10       35.99%
2025-01-14 19:30        West Ham          Fulham    57.78%    12.46%   29.76%             Loss        0.77       72.68%
2025-01-14 19:30         Chelsea     Bournemouth    14.35%    34.08%   51.57%              Win        1.03       74.91%
2025-01-14 20:00 Nott'ham Forest       Liverpool    32.35%    27.27%   40.38%              Win        1.03       35.99%
2025-01-15 19:30         Everton     Aston Villa    69.65%    12.99%   17.35%             Loss        0.90       41.45%
2025-01-15 19:30   Newcastle Utd          Wolves    12.89%    29.37%   57.74%              Win        1.23       74.91%
2025-01-15 19:30  Leicester City  Crystal Palace    49.18%    11.48%   39.34%             Loss        1.17       54.31%
2025-01-15 