In [26]:
# 1. Load standard libraries FIRST
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import sys
import importlib
import inspect
import os

# 2. Sanity check: print the pandas version picked up by this kernel.
#    (Only pandas is checked here; the other imports above are not verified.)
print(f"Pandas version: {pd.__version__}")

# --- 1. Path Setup ---
# Try to locate the repository root by searching upward for a 'src' directory (or .git)
def find_repo_root(start_path=None, marker_dirs=('src', '.git')):
    """Return the nearest ancestor directory that contains a marker directory.

    The search starts at ``start_path`` (the current working directory when it
    is falsy) and moves one level up at a time. A directory qualifies as soon
    as any name in ``marker_dirs`` exists inside it *as a directory*.

    Args:
        start_path: Directory to start from; made absolute before searching.
        marker_dirs: Iterable of directory names that identify the root.

    Returns:
        The absolute path of the first matching directory (the start directory
        itself is checked first), or ``None`` when the filesystem root is
        reached without a match.
    """
    current = os.path.abspath(start_path or os.getcwd())
    previous = None
    # os.path.dirname() maps the filesystem root onto itself, so the walk
    # stops once a step upward no longer changes the path.
    while current != previous:
        for marker in marker_dirs:
            if os.path.isdir(os.path.join(current, marker)):
                return current
        previous, current = current, os.path.dirname(current)
    return None

# Search upward from the working directory for the repository root; when no
# marker directory is found, fall back to the hard-coded path that works on Nuvolos.
repo_root = find_repo_root() or "/files/pixlball"

# Put the root at the front of sys.path (only once) so the `src.*` imports below resolve.
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
print(f"Using repo_root: {repo_root}")

import src.data as data
import src.model as model
import src.train as train
import src.config as config
import src.dataset as dataset
import src.evaluate as evaluate
import src.utils as utils

from src.config import DEVICE 


# 4. Reload `data` and `train` in place so the kernel uses their current source.
#    Only these two modules are reloaded; the other `src.*` modules imported
#    above keep whatever version was loaded when they were first imported.
importlib.reload(data)
importlib.reload(train)

# 5. Smoke test: print the signature of `data.prepare_nn_dataset` after the
#    reload so the available parameters and their defaults are visible.
print("Signature check:", inspect.signature(data.prepare_nn_dataset))

Pandas version: 2.3.3
Using repo_root: c:\Users\jonas\Desktop\repos\pixlball
Signature check: (events_df, nn_layers_df, target_cols=['nn_target'], id_col='id', context_cols=False, temporal_context=True, keep_context_ids=False)


In [19]:
# Both raw inputs live in <repo_root>/data and are read with the fastparquet engine.
data_dir = os.path.join(repo_root, "data")
data_events = pd.read_parquet(os.path.join(data_dir, "events_data.parquet"), engine="fastparquet")
data_360 = pd.read_parquet(os.path.join(data_dir, "sb360_data.parquet"), engine="fastparquet")

In [20]:
# Run the event loader from src.data on the raw events, then pass its result
# through add_ball_trajectory_features. The cell output reports the number of
# events and the nn_target class counts.
df_with_targets = data.event_data_loader(data_events)
df_with_targets = data.add_ball_trajectory_features(df_with_targets)

2462 events.
counts of each outcome nn_target
Keep Possession    70920
Lose Possession    27465
Shot                4764
Name: count, dtype: int64


# Prepare 360 Data

In [21]:
# Assign grid cells to the raw 360 data, then aggregate the result into
# `nn_final`, which is later passed to prepare_nn_dataset as its second
# positional argument (nn_layers_df in the printed signature).
df_360 = data.assign_grid_cells(data_360)
nn_final = data.aggregate_nn_layers_vectorized(df_360)

# Finalize the NN DataFrame

In [22]:
# Build the modelling frame from the event targets and the 360 layers.
# Both 'nn_target' and 'goal_flag' are requested as targets, and
# context_cols / keep_context_ids are switched on (both default to False in
# the signature printed by the first cell).
nn_dataset = data.prepare_nn_dataset(df_with_targets, nn_final, target_cols=['nn_target', 'goal_flag'], context_cols = True, keep_context_ids = True ) # adjust cols depending on model
# nn_dataset is rebound by each of the next two helpers.
# NOTE(review): presumably add_target_as_int creates the 'nn_target_int' column
# used for stratification and as the event label further down — confirm.
nn_dataset = data.add_context_cols(nn_dataset)
nn_dataset = data.add_target_as_int(nn_dataset)

# Final Data Preparation for the Neural Network

In [None]:
import pandas as pd


# --- Usage ---
# nn_dataset, vector_names = data.add_ball_coordinates(nn_dataset)

In [None]:
# Rebinds nn_dataset and also yields `vector_names`, which the dataset cells
# further down use to select columns (e.g. train_df[vector_names]).
# NOTE(review): this cell has no execution count in the saved notebook although
# later cells depend on vector_names — confirm it runs under Restart & Run All.
# NOTE(review): nn_dataset is overwritten with its own transform, so running
# this cell twice applies add_ball_coordinates twice — verify that is harmless.
nn_dataset, vector_names = data.add_ball_coordinates(nn_dataset)

# The Goal Multi-Task CNN

In [27]:
# Names of the three pitch-layer columns (the same list is defined again in the split cell).
layer_columns = ["ball_layer", "teammates_layer", "opponents_layer"]
# Loss weights for the event head and the goal head; both are passed to the
# training call later on.
# NOTE(review): the weights are derived from the full nn_dataset, i.e. before
# the train/validation split performed further down — confirm this is intended.
class_weights_event, goal_pos_weight = utils.get_multitask_loss_weights(nn_dataset, DEVICE)

# .item() is called on goal_pos_weight, so it is expected to hold a single value.
print(f"Goal Positive Weight (0/1 ratio): {goal_pos_weight.item():.2f}")

Goal Positive Weight (0/1 ratio): 5.00


# Preparing the Context CNN

In [28]:
import numpy as np
from sklearn.model_selection import train_test_split


# --- Split configuration ---
VALIDATION_SIZE = 0.20
RANDOM_SEED = 42
layer_columns = ["ball_layer", "teammates_layer", "opponents_layer"]

# --- Row-level split ---
# Splitting the DataFrame itself (rather than separate arrays) keeps the layer
# columns, event targets and goal flags of every row together. The split is
# stratified on the integer event target.
train_df, val_df = train_test_split(
    nn_dataset,
    test_size=VALIDATION_SIZE,
    random_state=RANDOM_SEED,
    stratify=nn_dataset['nn_target_int'],
)

# Not referenced anywhere else in this cell; kept because other cells may read it.
context_features = ['under_pressure', 'counterpress', 'dribble_nutmeg']


def _build_multitask_dataset(frame):
    """Wrap one split of the data in a ContextBallVectorPitchDatasetMultiTask.

    The four positional arguments are, in order: the layer columns (passed as
    a DataFrame), the integer event targets, the goal flags, and the columns
    listed in ``vector_names`` (passed as a DataFrame).
    """
    return dataset.ContextBallVectorPitchDatasetMultiTask(
        frame[layer_columns],
        frame['nn_target_int'].values,
        frame['goal_flag'].values,
        frame[vector_names],
    )


# --- Dataset objects for training and validation ---
train_dataset_temporal_context = _build_multitask_dataset(train_df)
validation_dataset_temporal_context = _build_multitask_dataset(val_df)

print(f"Total training samples: {len(train_dataset_temporal_context)}")
print(f"Total validation samples: {len(validation_dataset_temporal_context)}")

Total training samples: 72117
Total validation samples: 18030


In [29]:
# Uses class_weights_event and goal_pos_weight computed in the loss-weights cell above.
# NOTE(review): 8 is hard-coded; presumably it has to match the number of
# columns in vector_names that the dataset passes to the model — confirm
# (e.g. against len(vector_names)).
NUM_CONTEXT_FEATURES = 8 

print("Starting training for Contextual CNN Baseline...")

# Author's note: the loss function used by the training code was modified for
# this model; it needs to be changed back before the baseline model is trained again.

context_baseline_model = train.train_model_context_threat(
    dataset=train_dataset_temporal_context, 
    event_class_weights=class_weights_event, # class weights for the event head
    goal_pos_weight=goal_pos_weight,         # positive-class weight for the goal head
    num_context_features=NUM_CONTEXT_FEATURES
)

print("\nContextual CNN Training complete.")

Starting training for Contextual CNN Baseline...


Context CNN Epoch 1: 100%|██████████| 2254/2254 [00:46<00:00, 48.76it/s, ev_loss=0.3339, loss=1.1469, sh_loss=0.5420] 


Contextual CNN Training complete.





In [30]:
# Evaluate the trained model with evaluate.evaluate_model_context_threat.
# The returned `metrics` is indexed in later cells by 'event_probs',
# 'goal_probs' and 'goal_labels'.

print("\nEvaluating Contextual CNN Model...")

metrics = evaluate.evaluate_model_context_threat(
    model=context_baseline_model, 
    dataset=validation_dataset_temporal_context # the validation split built in the split cell
)


Evaluating Contextual CNN Model...

--- Event Outcome Metrics ---
Event Accuracy: 0.22584581253466446
Event Balanced Accuracy: 0.5220945566421485
Event Confusion Matrix:
 [[   0 9647 2841]
 [   0 3316 1341]
 [   0  129  756]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     12488
           1       0.25      0.71      0.37      4657
           2       0.15      0.85      0.26       885

    accuracy                           0.23     18030
   macro avg       0.14      0.52      0.21     18030
weighted avg       0.07      0.23      0.11     18030


--- Goal Prediction (xG) Metrics ---
Goal Accuracy: 0.5932203389830508
Goal Balanced Accuracy: 0.5402127562015682
Goal AUC-ROC Score: 0.5774909441056445
Goal Confusion Matrix:
 [[472 300]
 [ 60  53]]
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.72       772
         1.0       0.15      0.47      0.23       113

    accuracy               

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [31]:
import numpy as np
# `metrics` is the result of evaluate.evaluate_model_context_threat from the previous cell.
event_probs = metrics['event_probs']

# Header assumes the column order Keep, Lose, Shot — TODO confirm against the label encoding.
print("P(Keep) | P(Lose) | P(Shot)", "-------------------------------", sep="\n")
print(event_probs[:5])

# Mean predicted probability of column index 2 (labelled Shot in the header) over all events.
shot_column = event_probs[:, 2]
avg_p_shot = np.mean(shot_column)
print(f"\nAverage Predicted P(Shot) across all events: {avg_p_shot:.4f}")

P(Keep) | P(Lose) | P(Shot)
-------------------------------
[[0.35842028 0.5013404  0.1402393 ]
 [0.28389135 0.5476516  0.16845714]
 [0.13027388 0.33651263 0.53321356]
 [0.35667905 0.50527805 0.1380429 ]
 [0.15343137 0.23481005 0.6117586 ]]

Average Predicted P(Shot) across all events: 0.3139


In [32]:
import numpy as np
import pandas as pd
# `metrics` is the result of evaluate.evaluate_model_context_threat from the evaluation cell.

print("--- Goal Prediction Probabilities (xG) Analysis ---")

goal_labels = metrics['goal_labels']  # Actual outcome (0=No Goal, 1=Goal)
goal_probs = metrics['goal_probs']

print(f"Number of Shots Evaluated: {len(goal_probs)}")

# --- 1. Aggregate predicted xG versus goals actually scored ---
avg_xg_per_shot = np.mean(goal_probs)
total_true_goals = np.sum(goal_labels)
total_predicted_xg = np.sum(goal_probs)

print(f"\nTotal Predicted xG: {total_predicted_xg:.2f}")
print(f"Total True Goals Scored: {total_true_goals:.2f}")
print(f"Average Predicted xG per Shot: {avg_xg_per_shot:.4f}")

# --- 2. Mean predicted xG, split by the true outcome ---
# One row per evaluated shot: predicted probability next to the true label.
xg_df = pd.DataFrame({'xg': goal_probs, 'goal': goal_labels})

scored = xg_df['goal'] == 1
missed = xg_df['goal'] == 0
avg_xg_goal = xg_df.loc[scored, 'xg'].mean()
avg_xg_miss = xg_df.loc[missed, 'xg'].mean()

print("\n-- Calibration Check --")
print(f"Average xG for True Goals (should be high): {avg_xg_goal:.4f}")
print(f"Average xG for Missed Shots (should be low): {avg_xg_miss:.4f}")

--- Goal Prediction Probabilities (xG) Analysis ---
Number of Shots Evaluated: 885

Total Predicted xG: 422.83
Total True Goals Scored: 113.00
Average Predicted xG per Shot: 0.4778

-- Calibration Check --
Average xG for True Goals (should be high): 0.5005
Average xG for Missed Shots (should be low): 0.4745
