In [None]:
# Summarise the evaluation outputs stored in the `metrics` dictionary.
print("--- Event Classification Probabilities (P_outcome) ---")
event_probs = metrics['event_probs']
print(f"Shape of Event Probabilities (N, 3): {event_probs.shape}")

# Mean predicted probability per outcome class over all samples.
# Columns 0 / 1 / 2 are reported below as keep / lose / shot respectively.
avg_P_keep, avg_P_lose, avg_P_shot = (
    np.mean(event_probs[:, class_idx]) for class_idx in range(3)
)

print(f"Average Predicted P(Keep Possession): {avg_P_keep:.4f}")
print(f"Average Predicted P(Lose Possession): {avg_P_lose:.4f}")
print(f"Average Predicted P(Shot): {avg_P_shot:.4f}")

print("\n--- Goal Prediction Probabilities (xG) ---")
goal_probs = metrics['goal_probs']
print(f"Number of Shots Evaluated: {goal_probs.shape[0]}")

# Mean and summed predicted goal probability over the evaluated shots.
avg_xg = np.mean(goal_probs)
total_xg = np.sum(goal_probs)
print(f"Average Predicted xG per Shot: {avg_xg:.4f}")
print(f"Total Predicted xG for all Shots: {total_xg:.2f}")

# Sum of the goal labels, printed next to the predicted total for comparison.
true_goals = np.sum(metrics['goal_labels'])
print(f"Actual Goals Scored (True Goals): {true_goals:.2f}")

# AUC is optional in `metrics`; 'N/A' is printed when the key is absent.
print(f"Goal Prediction AUC-ROC Score: {metrics.get('goal_auc', 'N/A')}")

In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from tqdm import tqdm

# Pitch-layer columns excluded from the saved output: per this notebook they
# trigger an error when the DataFrame is written to Parquet.
LAYER_COLUMNS_TO_DROP = ["ball_layer", "teammates_layer", "opponents_layer"]

def predict_and_save_probabilities(
    model, 
    full_dataset, 
    original_df: pd.DataFrame, 
    output_filepath: str,
    device: str = 'cpu',
    batch_size: int = 1024
) -> pd.DataFrame:
    """
    Run the model over `full_dataset`, attach the predicted probabilities to a
    copy of `original_df`, and save the result (without the pitch-layer columns)
    as a Parquet file.

    Predictions are attached positionally: the dataset is iterated without
    shuffling, so sample i must correspond to row i of `original_df`.

    Args:
        model: Network called as `model(X)` and returning
            `(event_logits, goal_logits)`. It is put in eval mode and moved to
            `device`; neither change is undone afterwards.
        full_dataset: Dataset whose items unpack as `(X, _, _)`.
        original_df: DataFrame to enrich. It is not modified.
        output_filepath: Destination path of the Parquet file.
        device: Device used for inference.
        batch_size: Number of samples per inference batch.

    Returns:
        A new DataFrame with the added columns `P_Lose`, `P_Keep`, `P_Shot`
        (softmax over the event logits, class indices 1, 0 and 2) and `xG`
        (sigmoid of the goal logits), minus any `LAYER_COLUMNS_TO_DROP`.

    Raises:
        ValueError: If `full_dataset` is empty or its length differs from the
            number of rows in `original_df`.
    """
    n_samples = len(full_dataset)

    # Fail fast: without these checks the same problems only surface after the
    # full inference pass, as a less specific ValueError from numpy / pandas.
    if n_samples == 0:
        raise ValueError("full_dataset is empty; there is nothing to predict.")
    if n_samples != len(original_df):
        raise ValueError(
            f"full_dataset has {n_samples} samples but original_df has "
            f"{len(original_df)} rows; predictions cannot be aligned row-by-row."
        )

    print(f"Starting prediction on {n_samples} samples...")

    model.eval()
    model.to(device)
    # shuffle=False keeps predictions in dataset order for positional assignment.
    pred_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=False)

    event_probs_list = []
    goal_probs_list = []

    with torch.no_grad():
        for X, _, _ in tqdm(pred_loader, desc="Predicting Probabilities"):
            # If using Contextual Model, adjust the unpacking: for X, ctx, _, _ in ...
            X = X.to(device)
            event_logits, goal_logits = model(X)

            event_probs_list.append(F.softmax(event_logits, dim=1).cpu().numpy())
            goal_probs_list.append(torch.sigmoid(goal_logits).cpu().numpy())

    all_event_probs = np.concatenate(event_probs_list, axis=0)
    all_goal_probs = np.concatenate(goal_probs_list, axis=0).flatten()

    # 1. Attach the predictions; `assign` returns a new frame, so the caller's
    #    DataFrame is left untouched.
    result_df = original_df.assign(
        P_Lose=all_event_probs[:, 1],
        P_Keep=all_event_probs[:, 0],
        P_Shot=all_event_probs[:, 2],
        xG=all_goal_probs,
    )

    # 2. Drop the pitch-layer columns before saving; errors='ignore' tolerates
    #    frames that do not contain (all of) them.
    final_df_to_save = result_df.drop(columns=LAYER_COLUMNS_TO_DROP, errors='ignore')

    # 3. Save the enriched DataFrame to Parquet
    final_df_to_save.to_parquet(output_filepath, index=False)

    print(f"\n✅ Prediction complete. Data saved to: {output_filepath}")
    return final_df_to_save

# --- Example Usage (requires the objects created in the notebook's setup cells) ---
# NOTE: the 'full_dataset' object must be created here before predicting:
full_dataset = PitchDatasetMultiTask(nn_dataset[layer_columns], event_targets, goal_flags) 

# `predict_and_save_probabilities` works on its own copy of `original_df`, so
# `nn_dataset` is passed directly; an extra `.copy()` at the call site would only
# duplicate the whole frame in memory.
final_df_with_probs = predict_and_save_probabilities(
    model=baseline_model,
    full_dataset=full_dataset,
    original_df=nn_dataset,
    output_filepath='baseline_cnn_predictions.parquet',
    device=DEVICE,
    batch_size=1024
)

In [None]:
# Team whose matches should be listed.
TEAM_NAME = "Switzerland Women's"

# Keep only the rows whose `team` column equals TEAM_NAME.
switzerland_matches = df_merged.loc[df_merged['team'] == TEAM_NAME]

# Distinct match IDs among those rows.
unique_match_ids = switzerland_matches['match_id'].unique()

# Show the match IDs.
print(unique_match_ids)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def get_possession_sequence(
    df_merged: pd.DataFrame, 
    match_id: int, 
    possession_id: int
) -> pd.DataFrame:
    """
    Extract the events of one possession from the merged DataFrame.

    Rows are selected where both `match_id` and `possession_id` match. If the
    frame has an `event_sequence_in_possession` column the rows are sorted by
    it; otherwise they keep their order from `df_merged`. A 0-based `seq_index`
    column is then added for use as a plotting X-axis.

    Args:
        df_merged (pd.DataFrame): The pre-loaded DataFrame containing all merged data.
        match_id (int): The match identifier to filter on.
        possession_id (int): The possession identifier to filter on.

    Returns:
        pd.DataFrame: A copy of the matching rows with `seq_index` added, or a
        completely empty DataFrame (no columns) when nothing matches.
    """
    same_match = df_merged['match_id'] == match_id
    same_possession = df_merged['possession_id'] == possession_id
    # Copy so the column added below never touches the caller's frame.
    possession_events = df_merged[same_match & same_possession].copy()

    # Guard clause: nothing to return for an unknown match/possession pair.
    if possession_events.empty:
        print(f"⚠️ Warning: Sequence not found for Match ID {match_id}, Possession ID {possession_id}.")
        return pd.DataFrame()

    # Order the events only when the ordering column is available.
    if 'event_sequence_in_possession' in possession_events.columns:
        possession_events = possession_events.sort_values(by='event_sequence_in_possession')

    # Positional index 0..n-1 in the final row order.
    possession_events['seq_index'] = np.arange(len(possession_events))

    print(f"Sequence extracted with {len(possession_events)} events, ready for plotting.")
    return possession_events

# --- Next Step: Visualization Function ---

def plot_possession_threat_stack(df_sequence: pd.DataFrame, title_suffix: str = ""):
    """
    Draw a stacked area chart of the event probabilities for one possession.

    The `P_Lose`, `P_Keep` and `P_Shot` columns are stacked in that order
    (lose at the bottom) against `seq_index`, and the `xG` column is overlaid
    as a dashed red line. The figure is shown with `plt.show()`; nothing is
    returned. An empty `df_sequence` prints a message and returns early.

    Args:
        df_sequence (pd.DataFrame): Possession events with the columns
            `seq_index`, `P_Lose`, `P_Keep`, `P_Shot`, `xG`, `match_id` and
            `possession_id`.
        title_suffix (str): Extra text appended to the plot title.
    """
    if df_sequence.empty:
        print("Cannot plot: DataFrame is empty.")
        return

    x_positions = df_sequence['seq_index']

    # Bottom-to-top stacking order: lose, keep, shot.
    stacked_layers = [
        df_sequence[column].values for column in ('P_Lose', 'P_Keep', 'P_Shot')
    ]

    fig, ax = plt.subplots(figsize=(12, 6))

    ax.stackplot(
        x_positions,
        *stacked_layers,
        labels=['P(Lose Possession)', 'P(Keep Possession)', 'P(Shot)'],
        colors=['#ff7f0e', '#1f77b4', '#2ca02c'], # Orange, Blue, Green
        alpha=0.8
    )

    # Overlay xG as a dashed line on the same axis for context.
    ax.plot(
        x_positions,
        df_sequence['xG'].values,
        color='red',
        linestyle='--',
        linewidth=2,
        label='xG (P(Goal) | Shot)'
    )

    # The identifiers for the title come from the first row of the sequence.
    match_id = df_sequence['match_id'].iloc[0]
    possession_id = df_sequence['possession_id'].iloc[0]

    ax.set_xlabel(f"Event Index (Relative to Possession Start) | Total Events: {len(df_sequence)}", fontsize=12)
    ax.set_ylabel("Probability / Risk Profile")
    ax.set_title(f"Threat Model Output (P_outcome) for Match {match_id}, Possession {possession_id} {title_suffix}", fontsize=14)

    ax.legend(loc='upper right', frameon=True)
    ax.set_ylim(0, 1.0)
    ax.grid(True, linestyle='--', alpha=0.6)

    # Sequences shorter than 30 events get a tick on every second event;
    # longer ones keep matplotlib's default ticks.
    if len(x_positions) < 30:
        ax.set_xticks(x_positions[::2])

    plt.tight_layout()
    plt.show()

# --- Example Usage (How you would run this in your notebook) ---

# 1. ASSUME df_merged IS AVAILABLE
# 2. Define your target sequence
# TARGET_MATCH = 12345
# TARGET_POSSESSION = 50

# 3. Get the sequence data
# sequence_data = get_possession_sequence(
#     df_merged=df_merged,
#     match_id=TARGET_MATCH,
#     possession_id=TARGET_POSSESSION
# )

# 4. Plot the results
# if not sequence_data.empty:
#     plot_possession_threat_stack(sequence_data)