In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, RidgeCV
from scipy import sparse
from datetime import datetime, timedelta

In [2]:
matches = []

# Starting line-ups: every team fields exactly 5 players (1 GK, 2 DF, 1 MF, 1 FW).
# The position keys are inserted in GK, DF, MF, FW order.
teams = {
    'Team A': dict(GK=['Neuer'], DF=['Van Dijk', 'Ramos'], MF=['Modric'], FW=['Messi']),
    'Team B': dict(GK=['Oblak'], DF=['Sergio', 'Varane'], MF=['Kroos'], FW=['Mbappe']),
    'Team C': dict(GK=['Courtois'], DF=['Alaba', 'Silva'], MF=['Pogba'], FW=['Neymar']),
    'Team D': dict(
        GK=['Ter Stegen'],
        DF=['Marquinhos', 'Koulibaly'],
        MF=['Goretzka'],
        FW=['Lewandowski'],
    ),
}

# Bench players: 2 substitutes per team.
# NOTE(review): 'Salah' is listed for both Team B and Team D -- confirm which
# bench he belongs to before substitutes are used for anything.
substitutes = dict([
    ('Team A', ['Ronaldo', 'De Bruyne']),
    ('Team B', ['Kane', 'Salah']),
    ('Team C', ['Haaland', 'Benzema']),
    ('Team D', ['Salah', 'Mané']),
])

# Helper function to get all active players from a team
def get_active_players(team):
    """Return a team's line-up as one flat list.

    `team` maps the position keys 'GK', 'DF', 'MF' and 'FW' to lists of
    player names; the result lists them in that position order. The input
    mapping and its lists are left unmodified.
    """
    lineup = []
    for position in ('GK', 'DF', 'MF', 'FW'):
        lineup.extend(team[position])
    return lineup

# Match dates start six months (180 days) ago and advance in 30-day steps.
# The clock is read once so that the match dates and `days_ago` below are
# measured against the same reference instant.
reference_time = datetime.now()
base_date = reference_time - timedelta(days=180)
match_dates = {
    match_id: base_date + timedelta(days=30 * (match_id - 1))
    for match_id in range(1, 7)
}

# One row per fixture: (match_id, first side, second side, first-side xG, second-side xG).
# Every fixture uses the full starting line-up of both sides for 90 minutes.
fixtures = [
    (1, 'Team A', 'Team B', 1.2, 0.9),
    (2, 'Team A', 'Team C', 1.6, 1.3),
    (3, 'Team B', 'Team C', 0.9, 0.6),
    (4, 'Team A', 'Team D', 1.4, 1.1),
    (5, 'Team B', 'Team D', 1.0, 1.2),
    (6, 'Team C', 'Team D', 0.8, 0.7),
]

for match_id, first_side, second_side, first_xg, second_xg in fixtures:
    matches.append({
        'match_id': match_id,
        'date': match_dates[match_id],
        'teamA': first_side,
        'teamB': second_side,
        'teamA_players': get_active_players(teams[first_side]),
        'teamB_players': get_active_players(teams[second_side]),
        'teamA_xg': first_xg,
        'teamB_xg': second_xg,
        'minutes_played': 90
    })

matches_df = pd.DataFrame(matches)

# Convert date column to datetime if not already
matches_df['date'] = pd.to_datetime(matches_df['date'])

# Whole days elapsed between each match and the reference instant
matches_df['days_ago'] = (reference_time - matches_df['date']).dt.days
matches_df


Unnamed: 0,match_id,date,teamA,teamB,teamA_players,teamB_players,teamA_xg,teamB_xg,minutes_played,days_ago
0,1,2025-04-18 21:02:22.989863,Team A,Team B,"[Neuer, Van Dijk, Ramos, Modric, Messi]","[Oblak, Sergio, Varane, Kroos, Mbappe]",1.2,0.9,90,180
1,2,2025-05-18 21:02:22.989863,Team A,Team C,"[Neuer, Van Dijk, Ramos, Modric, Messi]","[Courtois, Alaba, Silva, Pogba, Neymar]",1.6,1.3,90,150
2,3,2025-06-17 21:02:22.989863,Team B,Team C,"[Oblak, Sergio, Varane, Kroos, Mbappe]","[Courtois, Alaba, Silva, Pogba, Neymar]",0.9,0.6,90,120
3,4,2025-07-17 21:02:22.989863,Team A,Team D,"[Neuer, Van Dijk, Ramos, Modric, Messi]","[Ter Stegen, Marquinhos, Koulibaly, Goretzka, ...",1.4,1.1,90,90
4,5,2025-08-16 21:02:22.989863,Team B,Team D,"[Oblak, Sergio, Varane, Kroos, Mbappe]","[Ter Stegen, Marquinhos, Koulibaly, Goretzka, ...",1.0,1.2,90,60
5,6,2025-09-15 21:02:22.989863,Team C,Team D,"[Courtois, Alaba, Silva, Pogba, Neymar]","[Ter Stegen, Marquinhos, Koulibaly, Goretzka, ...",0.8,0.7,90,30


In [3]:
# Exponential time decay: a match that is `half_life` days older than another
# receives half its weight.
half_life = 180  # days
raw_weight = np.exp(-np.log(2) * matches_df['days_ago'] / half_life)

# Rescale so the weights sum to the number of matches (mean weight of 1).
total_weight = raw_weight.sum()
matches_df['time_weight'] = raw_weight / total_weight * len(matches_df)

# Collect every player that appears in either line-up of any match.
players_set = {
    name
    for lineup_column in ('teamA_players', 'teamB_players')
    for lineup in matches_df[lineup_column]
    for name in lineup
}

# Alphabetical order fixes each player's column position in the design matrix.
players = sorted(players_set)
num_players = len(players)
players_to_index = dict(zip(players, range(num_players)))
print(f"Unique players: {players}")
print(f"Total players: {num_players}")

Unique players: ['Alaba', 'Courtois', 'Goretzka', 'Koulibaly', 'Kroos', 'Lewandowski', 'Marquinhos', 'Mbappe', 'Messi', 'Modric', 'Neuer', 'Neymar', 'Oblak', 'Pogba', 'Ramos', 'Sergio', 'Silva', 'Ter Stegen', 'Van Dijk', 'Varane']
Total players: 20


In [4]:
# Build the design matrix and target variable.
# Each match contributes two observations, one per attacking side. Columns
# [0, num_players) hold offensive indicators (+1 for the attacking side's
# players); columns [num_players, 2 * num_players) hold defensive indicators
# (-1 for the defending side's players).
rows = []
cols = []
data_vals = []
y = []
sample_weights = []
row_num = 0

for _, match in matches_df.iterrows():
    minutes = match['minutes_played']
    if minutes == 0:
        # No playing time: the per-minute target below would divide by zero.
        continue

    # NOTE(review): scikit-learn applies sample_weight to the squared error
    # itself; confirm that taking the square root here is intentional.
    observation_weight = np.sqrt(minutes * match['time_weight'])

    first_side = match['teamA_players']
    second_side = match['teamB_players']
    observations = (
        (first_side, second_side, match['teamA_xg']),
        (second_side, first_side, match['teamB_xg']),
    )

    for attackers, defenders, side_xg in observations:
        rows.extend([row_num] * (len(attackers) + len(defenders)))
        cols.extend(players_to_index[p] for p in attackers)  # Offensive coefficient
        cols.extend(num_players + players_to_index[p] for p in defenders)  # Defensive coefficient
        data_vals.extend([1] * len(attackers) + [-1] * len(defenders))
        y.append(side_xg / minutes)  # xG per minute
        sample_weights.append(observation_weight)
        row_num += 1

# Create sparse matrix
X = sparse.csr_matrix((data_vals, (rows, cols)), shape=(row_num, 2 * num_players))
y_array = np.array(y)
sample_weights_array = np.array(sample_weights)

print(f"Design matrix shape: {X.shape}")
print(f"Target variable shape: {y_array.shape}")

Design matrix shape: (12, 40)
Target variable shape: (12,)


In [5]:
# Center the target on its weighted mean; the mean is added back to the
# fitted intercept below to recover the baseline.
y_mean = np.average(y_array, weights=sample_weights_array)
y_centered = y_array - y_mean

# Pick the L2 penalty by 5-fold cross-validation over a small grid.
alphas_to_try = [0.01, 0.1, 1.0, 10.0, 100.0]
ridge_cv = RidgeCV(alphas=alphas_to_try, fit_intercept=True, cv=5)
ridge_cv.fit(X, y_centered, sample_weight=sample_weights_array)
print(f"Alpha={ridge_cv.alpha_}")

# Fit the final ridge model with the selected penalty.
ridge = Ridge(alpha=ridge_cv.alpha_, fit_intercept=True)
ridge.fit(X, y_centered, sample_weight=sample_weights_array)

# The first num_players coefficients are offensive, the remainder defensive.
intercept_adjusted = ridge.intercept_ + y_mean
offensive_coefs = ridge.coef_[:num_players]
defensive_coefs = ridge.coef_[num_players:]
offensive_ratings = dict(zip(players, offensive_coefs))
defensive_ratings = dict(zip(players, defensive_coefs))

print(f"Baseline xG per 90min: {intercept_adjusted * 90:.4f}")

print("\n=== PLAYER RATINGS (per 90 minutes) ===")
print(f"{'Player':<12} : {'Defensive':>10} | {'Offensive':>10} | {'Net':>10}")
print("-" * 55)

# Net impact is the sum of a player's offensive and defensive coefficients;
# the table lists players from highest to lowest net impact.
net_impact = {name: offensive_ratings[name] + defensive_ratings[name] for name in players}
ranked_players = sorted(players, key=net_impact.get, reverse=True)
for name in ranked_players:
    defensive_per_90 = defensive_ratings[name] * 90
    offensive_per_90 = offensive_ratings[name] * 90
    net_per_90 = net_impact[name] * 90
    print(f"{name:<12} : {defensive_per_90:>+10.4f} | {offensive_per_90:>+10.4f} | {net_per_90:>+10.4f}")



Alpha=100.0
Baseline xG per 90min: 1.0532

=== PLAYER RATINGS (per 90 minutes) ===
Player       :  Defensive |  Offensive |        Net
-------------------------------------------------------
Messi        :    -0.0136 |    +0.0423 |    +0.0287
Modric       :    -0.0136 |    +0.0423 |    +0.0287
Neuer        :    -0.0136 |    +0.0423 |    +0.0287
Ramos        :    -0.0136 |    +0.0423 |    +0.0287
Van Dijk     :    -0.0136 |    +0.0423 |    +0.0287
Goretzka     :    +0.0005 |    -0.0073 |    -0.0068
Koulibaly    :    +0.0005 |    -0.0073 |    -0.0068
Lewandowski  :    +0.0005 |    -0.0073 |    -0.0068
Marquinhos   :    +0.0005 |    -0.0073 |    -0.0068
Ter Stegen   :    +0.0005 |    -0.0073 |    -0.0068
Kroos        :    +0.0078 |    -0.0146 |    -0.0068
Mbappe       :    +0.0078 |    -0.0146 |    -0.0068
Oblak        :    +0.0078 |    -0.0146 |    -0.0068
Sergio       :    +0.0078 |    -0.0146 |    -0.0068
Varane       :    +0.0078 |    -0.0146 |    -0.0068
Alaba        :    +0.0053 |  

In [6]:
def predict_match(team_a_players, team_b_players, offensive_ratings, defensive_ratings, baseline_xg_per_min):
    """
    Predict xG for a match between two teams of players.

    Each side's predicted xG per minute is the baseline plus the sum of its
    own players' offensive ratings minus the sum of the opposing players'
    defensive ratings. A larger defensive sum therefore lowers the opponent's
    predicted xG. Per-minute values are scaled by 90 in the returned dict.

    Parameters:
    - team_a_players: list of player names for team A
    - team_b_players: list of player names for team B
    - offensive_ratings: dict of per-minute offensive ratings keyed by player name
    - defensive_ratings: dict of per-minute defensive ratings keyed by player name
    - baseline_xg_per_min: baseline xG per minute from model intercept

    Returns:
    - dict with keys 'prediction' (per-90 xG for both sides, their difference,
      and win/draw probabilities that are currently always None),
      'team_breakdown' (per-side summed offensive and defensive ratings per 90
      plus per-player contributions), and 'baseline_xg_per_90'.

    Side effects:
    - Prints a warning listing every player who is missing from either rating
      dict; a missing rating is treated as 0. Names are listed once each, in
      order of first appearance (team A first, then team B).
    """

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # warning is deterministic (iterating a set would not be).
    roster = dict.fromkeys([*team_a_players, *team_b_players])
    missing_players = [
        p for p in roster
        if p not in offensive_ratings or p not in defensive_ratings
    ]

    if missing_players:
        print("Warning: The following players are not in the model and will be treated as average (0 impact):")
        for player in missing_players:
            print(f"  - {player}")

    # Summed ratings per side (per minute); unknown players contribute 0.
    team_a_offensive_strength = _rating_total(team_a_players, offensive_ratings)
    team_a_defensive_strength = _rating_total(team_a_players, defensive_ratings)
    team_b_offensive_strength = _rating_total(team_b_players, offensive_ratings)
    team_b_defensive_strength = _rating_total(team_b_players, defensive_ratings)

    # Attack is added to the baseline, the opponent's defence is subtracted.
    team_a_xg_per_min = baseline_xg_per_min + team_a_offensive_strength - team_b_defensive_strength
    team_b_xg_per_min = baseline_xg_per_min + team_b_offensive_strength - team_a_defensive_strength

    # Convert to per 90 minutes
    team_a_xg_per_90 = team_a_xg_per_min * 90
    team_b_xg_per_90 = team_b_xg_per_min * 90

    return {
        'prediction': {
            'team_a_xg_per_90': team_a_xg_per_90,
            'team_b_xg_per_90': team_b_xg_per_90,
            'expected_goal_difference': team_a_xg_per_90 - team_b_xg_per_90,
            'team_a_win_probability': None,  # Could add win probability model later
            'team_b_win_probability': None,
            'draw_probability': None
        },
        'team_breakdown': {
            'team_a': {
                'offensive_strength': team_a_offensive_strength * 90,
                'defensive_strength': team_a_defensive_strength * 90,
                'player_contributions': _player_contributions(
                    team_a_players, offensive_ratings, defensive_ratings
                )
            },
            'team_b': {
                'offensive_strength': team_b_offensive_strength * 90,
                'defensive_strength': team_b_defensive_strength * 90,
                'player_contributions': _player_contributions(
                    team_b_players, offensive_ratings, defensive_ratings
                )
            }
        },
        'baseline_xg_per_90': baseline_xg_per_min * 90
    }


def _rating_total(lineup, ratings):
    """Sum the ratings of the players in `lineup`, counting unknown players as 0."""
    return sum(ratings.get(p, 0) for p in lineup)


def _player_contributions(lineup, offensive_ratings, defensive_ratings):
    """Map each player in `lineup` to their offensive, defensive and net rating per 90 minutes."""
    return {
        p: {
            'offensive_impact_90': offensive_ratings.get(p, 0) * 90,
            'defensive_impact_90': defensive_ratings.get(p, 0) * 90,
            'net_impact_90': (offensive_ratings.get(p, 0) + defensive_ratings.get(p, 0)) * 90
        } for p in lineup
    }

def print_prediction(prediction, team_a_name="Team A", team_b_name="Team B"):
    """Print a formatted summary of a prediction dict.

    Reads `prediction['prediction']`, `prediction['team_breakdown']` and
    `prediction['baseline_xg_per_90']`, and labels the two sides with
    `team_a_name` and `team_b_name`. Returns None.
    """
    forecast = prediction['prediction']
    strengths = prediction['team_breakdown']
    sides = ((team_a_name, 'team_a'), (team_b_name, 'team_b'))
    rule = "=" * 60

    print(rule)
    print(f"MATCH PREDICTION: {team_a_name} vs {team_b_name}")
    print(rule)

    print("\n📊 EXPECTED GOALS (per 90 minutes):")
    for label, side_key in sides:
        print(f"  {label}: {forecast[side_key + '_xg_per_90']:.2f} xG")
    print(f"  Expected Goal Difference: {forecast['expected_goal_difference']:+.2f}")

    print("\n🏆 TEAM STRENGTH BREAKDOWN:")
    for label, side_key in sides:
        side = strengths[side_key]
        print(f"  {label}:")
        print(f"    Offensive: {side['offensive_strength']:+.2f}")
        print(f"    Defensive: {side['defensive_strength']:+.2f}")

    print(f"\n📈 BASELINE (average team): {prediction['baseline_xg_per_90']:.2f} xG per 90min")

# Example usage after training your model:
if __name__ == "__main__":
    # Example line-ups (replace with actual player names from your model)
    team_a = ["Neuer", "Van Dijk", "Ramos", "Modric", "Messi"]
    team_b = ["Oblak", "Sergio", "Varane", "Kroos", "Mbappe"]

    # Make prediction (using the model outputs from your training code)
    result = predict_match(
        team_a_players=team_a,
        team_b_players=team_b,
        offensive_ratings=offensive_ratings,
        defensive_ratings=defensive_ratings,
        baseline_xg_per_min=intercept_adjusted,
    )

    # Print results
    print_prediction(result, "Team A", "Team B")

    def _top_three(lineup, contributions):
        """Return up to three players from `lineup`, highest net impact first."""
        ranked = sorted(
            lineup,
            key=lambda p: contributions[p]['net_impact_90'],
            reverse=True,
        )
        return ranked[:3]

    contributions_a = result['team_breakdown']['team_a']['player_contributions']
    contributions_b = result['team_breakdown']['team_b']['player_contributions']
    team_a_players_sorted = _top_three(team_a, contributions_a)
    team_b_players_sorted = _top_three(team_b, contributions_b)

    # Optional: Print individual player impacts
    print("\n👤 KEY PLAYER IMPACTS:")
    leaderboards = (
        ("Team A", team_a_players_sorted, contributions_a),
        ("Team B", team_b_players_sorted, contributions_b),
    )
    for heading, top_players, contributions in leaderboards:
        print(f"\n{heading} Top Contributors:")
        for player in top_players:
            impact = contributions[player]['net_impact_90']
            print(f"  {player}: {impact:+.2f} net xG impact")

MATCH PREDICTION: Team A vs Team B

📊 EXPECTED GOALS (per 90 minutes):
  Team A: 1.23 xG
  Team B: 1.05 xG
  Expected Goal Difference: +0.18

🏆 TEAM STRENGTH BREAKDOWN:
  Team A:
    Offensive: +0.21
    Defensive: -0.07
  Team B:
    Offensive: -0.07
    Defensive: +0.04

📈 BASELINE (average team): 1.05 xG per 90min

👤 KEY PLAYER IMPACTS:

Team A Top Contributors:
  Neuer: +0.03 net xG impact
  Van Dijk: +0.03 net xG impact
  Ramos: +0.03 net xG impact

Team B Top Contributors:
  Oblak: -0.01 net xG impact
  Sergio: -0.01 net xG impact
  Varane: -0.01 net xG impact
