In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import os
import json
import wandb
import glob
import librosa
from moviepy.editor import VideoFileClip

  if event.key is 'enter':



In [None]:
import wandb
# Initialize WandB
wandb.login(key='add api key')

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msupraja2010341[0m ([33mfyproject[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Model Definition
class TemporalEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.conv = nn.Conv1d(input_dim, hidden_dim, kernel_size=3, padding=1)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim // 2, bidirectional=True, batch_first=True)
        self.norm = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (batch, time, dim) -> (batch, dim, time)
        x = self.conv(x).relu()
        x = x.permute(0, 2, 1)  # (batch, time, hidden_dim)
        x, _ = self.lstm(x)
        x = self.norm(x)
        return x

In [4]:
class HighlightModel(nn.Module):
    def __init__(self, video_dim, audio_dim, hidden_dim):
        super().__init__()
        self.video_encoder = TemporalEncoder(video_dim, hidden_dim)
        self.audio_encoder = TemporalEncoder(audio_dim, hidden_dim)
        self.fusion = nn.Linear(hidden_dim * 2, hidden_dim)
        self.scorer = nn.Linear(hidden_dim, 1)

    def forward(self, video, audio, event_timestamps):
        event_timestamps=event_timestamps.long()
        video_features = self.video_encoder(video)  # (batch, time, hidden)
        audio_features = self.audio_encoder(audio)  # (batch, time, hidden)
        fused_features = torch.cat([video_features, audio_features], dim=-1)  # (batch, time, hidden*2)
        fused_features = self.fusion(fused_features).relu()  # (batch, time, hidden)
        event_features = fused_features[torch.arange(fused_features.size(0))[:, None], event_timestamps]
        scores = self.scorer(event_features).sigmoid()  # (batch, num_events, 1)
        return scores.squeeze(-1)  # (batch, num_events)

In [None]:
class SoccerNetDataset:
    def __init__(self, data_dir, split_file, feature_rate=1, max_events=50, device="cuda", top=10):
        self.data_dir = data_dir
        self.feature_rate = feature_rate
        self.max_events = max_events
        self.device = device
        self.game_dirs = self._load_games(split_file)[:top]

    def _load_games(self, split_file):
        with open(split_file) as f:
            game_dirs = json.load(f)
        valid_dirs = [d for d in game_dirs if os.path.exists(d)]
        if len(valid_dirs) < len(game_dirs):
            print(f"Warning: {len(game_dirs) - len(valid_dirs)} game directories not found")
        return valid_dirs

    def _load_video_features(self, game_dir):
        feature_files = sorted(glob.glob(os.path.join(game_dir, "*_ResNET_TF2.npy")))
        if len(feature_files) < 2:
            raise FileNotFoundError(f"Expected 1_ResNET_TF2.npy and 2_ResNET_TF2.npy in {game_dir}, found {len(feature_files)}")
        features1 = np.load(feature_files[0])
        features2 = np.load(feature_files[1])
        video_features = np.concatenate([features1, features2], axis=0)
        return torch.tensor(video_features, dtype=torch.float32).to(self.device)

    def _load_audio_features(self, game_dir, target_length):
        audio_files = sorted(glob.glob(os.path.join(game_dir, "*_224p.wav")))
        if len(audio_files) < 2:
            raise FileNotFoundError(f"Expected 1_224p.wav and 2_224p.wav in {game_dir}, found {len(audio_files)}")
        mfccs = []
        audio_signals = []
        sr = None
        for audio_file in audio_files:
            y, sr = librosa.load(audio_file, sr=None)
            hop_length = int(sr / self.feature_rate)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20, hop_length=hop_length)
            mfccs.append(mfcc.T)
            audio_signals.append(y)
        audio_features = np.concatenate(mfccs, axis=0)
        audio_features = librosa.util.fix_length(audio_features.T, size=target_length, axis=1).T
        audio_signal = np.concatenate(audio_signals)
        return (torch.tensor(audio_features, dtype=torch.float32).to(self.device),
                audio_signal, sr)

    def _parse_game_time(self, game_time):
        half, time_str = game_time.split(" - ")
        minutes, seconds = map(int, time_str.split(":"))
        total_seconds = minutes * 60 + seconds
        return int(half), total_seconds

    def _load_event_timestamps(self, game_dir, video_length, audio_signal, sr):
        annotation_file = os.path.join(game_dir, "Labels-v2.json")
        if not os.path.exists(annotation_file):
            raise FileNotFoundError(f"No Labels-v2.json found in {game_dir}")
        with open(annotation_file) as f:
            data = json.load(f)
        events = data.get("annotations", [])
        half1_duration = self._get_half1_duration(os.path.join(game_dir, "1_224p.mp4"))
        half1_duration_frames = int(half1_duration * self.feature_rate)

        timestamps = []
        audio_scores = []
        event_names = []
        window_seconds = 5
        window_samples = int(window_seconds * sr)

        for event in events:
            half, time_seconds = self._parse_game_time(event["gameTime"])
            timestamp = int(time_seconds * self.feature_rate)
            if half == 2:
                timestamp += half1_duration_frames
            if timestamp < video_length:
                timestamps.append(timestamp)
                event_names.append(event.get("label", "Unknown"))
                audio_time = time_seconds + (half1_duration if half == 2 else 0)
                center_sample = int(audio_time * sr)
                start_sample = max(0, center_sample - window_samples)
                end_sample = min(len(audio_signal), center_sample + window_samples)
                window = audio_signal[start_sample:end_sample]
                rms = librosa.feature.rms(y=window, frame_length=2048, hop_length=512)
                score = np.mean(rms) if rms.size > 0 else 0.0
                audio_scores.append(score)

        if audio_scores:
            audio_scores = np.array(audio_scores)
            min_score, max_score = audio_scores.min(), audio_scores.max()
            if max_score > min_score:
                audio_scores = (audio_scores - min_score) / (max_score - min_score)
            else:
                audio_scores = np.zeros_like(audio_scores)

        num_events = len(timestamps)
        if num_events == 0:
            timestamps = [0]
            audio_scores = [0.0]
            event_names = ["None"]
            num_events = 0
        if num_events > self.max_events:
            timestamps = timestamps[:self.max_events]
            audio_scores = audio_scores[:self.max_events]
            event_names = event_names[:self.max_events]
            num_events = self.max_events
        else:
            timestamps += [0] * (self.max_events - num_events)
            audio_scores += [0.0] * (self.max_events - num_events)
            event_names += ["Padding"] * (self.max_events - num_events)

        return (torch.tensor(timestamps, dtype=torch.long),
                torch.tensor(audio_scores, dtype=torch.float32),
                num_events,
                event_names)

    def _get_half1_duration(self, video_path):
        try:
            video = VideoFileClip(video_path)
            duration = video.duration
            video.close()
            return duration
        except Exception:
            return 2700

    def __len__(self):
        return len(self.game_dirs)

    def __getitem__(self, idx):
        game_dir = self.game_dirs[idx]
        try:
            video_tensor = self._load_video_features(game_dir)
            audio_tensor, audio_signal, sr = self._load_audio_features(game_dir, video_tensor.shape[0])
            timestamps, scores, num_events, event_names = self._load_event_timestamps(game_dir, video_tensor.shape[0], audio_signal, sr)
            return {
                "video": video_tensor,
                "audio": audio_tensor,
                "timestamps": timestamps.to(self.device),
                "scores": scores.to(self.device),
                "num_events": num_events,
                "event_names": event_names
            }
        except Exception as e:
            print(f"Skipping {game_dir} due to error: {e}")
            return None




In [None]:
def custom_collate(batch):
    batch = [b for b in batch if b is not None]
    if not batch:
        return None

    videos = [b["video"].clone().detach() for b in batch]
    audios = [b["audio"].clone().detach() for b in batch]
    timestamps = [b["timestamps"].clone().detach().long() for b in batch]
    scores = [b["scores"].clone().detach() for b in batch]
    num_events = [b["num_events"] for b in batch]
    event_names = [b["event_names"] for b in batch]

    padded_videos = pad_sequence(videos, batch_first=True)
    padded_audios = pad_sequence(audios, batch_first=True)
    padded_timestamps = pad_sequence(timestamps, batch_first=True)
    padded_scores = pad_sequence(scores, batch_first=True)

    return {
        "video": padded_videos,
        "audio": padded_audios,
        "timestamps": padded_timestamps,
        "scores": padded_scores,
        "num_events": num_events,
        "event_names": event_names
    }


In [13]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from collections import defaultdict
import wandb

def test_model(data_dir, test_split, batch_size=4, device="cuda"):
    wandb.init(project="soccernet_highlights-test", config={
        "batch_size": batch_size,
        "video_dim": 2048,
        "audio_dim": 20,
        "hidden_dim": 512,
        "phase": "test"
    })

    test_dataset = SoccerNetDataset(data_dir, test_split, device=device, top=10)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=custom_collate)

    print(f"Test dataset size: {len(test_dataset)} games")

    model = HighlightModel(video_dim=2048, audio_dim=20, hidden_dim=512).to(device)
    model.load_state_dict(torch.load("highlight_model.pth"))
    model.eval()

    criterion = nn.BCELoss()

    test_loss = 0.0
    test_mae = 0.0
    test_count = 0
    test_batches = 0
    test_correct = 0
    test_total = 0

    # Dictionary to store event names and scores grouped by match index
    match_events_scores = defaultdict(list)

    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            if batch is None:
                continue

            video = batch["video"].to(device)
            audio = batch["audio"].to(device)
            timestamps = batch["timestamps"].to(device)
            scores = batch["scores"].to(device)
            num_events = batch["num_events"]
            event_names = batch["event_names"]

            pred_scores = model(video, audio, timestamps)

            loss = torch.tensor(0.0, device=device)
            for i in range(len(num_events)):
                if num_events[i] > 0:
                    valid_scores = pred_scores[i, :num_events[i]]
                    valid_gt = scores[i, :num_events[i]]
                    valid_names = event_names[i][:num_events[i]]
                    loss = loss + criterion(valid_scores, valid_gt)

                    test_mae += torch.mean(torch.abs(valid_scores - valid_gt)).item()
                    test_count += 1

                    pred_labels = (valid_scores >= 0.5).float()
                    test_correct += (pred_labels == valid_gt).float().sum().item()
                    test_total += valid_gt.numel()

                    # Store match-wise event names and predicted scores
                    for name, score in zip(valid_names, valid_scores.cpu().numpy()):
                        match_events_scores[batch_idx * batch_size + i].append((name, score))

            if test_count > 0:
                batch_loss = (loss / len(num_events)).item()
                test_loss += batch_loss
                test_batches += 1

                print(f"Test Batch {batch_idx + 1}/{len(test_loader)}, "
                      f"Test Loss: {batch_loss:.4f}, Test MAE: {test_mae / test_count:.4f}")

    # Print all events and their scores match-wise in decreasing order
    print("\nAll events and their predicted scores (sorted match-wise in decreasing order):")
    for match_id in sorted(match_events_scores.keys()):
        print(f"\nMatch {match_id + 1}:")
        events = sorted(match_events_scores[match_id], key=lambda x: x[1], reverse=True)
        for event, score in events:
            print(f"Event: {event}, Predicted Score: {score:.4f}")

    # Calculate final metrics
    test_loss = test_loss / test_batches if test_batches > 0 else 0.0
    test_mae = test_mae / test_count if test_count > 0 else 0.0
    test_accuracy = test_correct / test_total if test_total > 0 else 0.0

    wandb.log({
        "test_loss": test_loss,
        "test_mae": test_mae,
        "test_accuracy": test_accuracy
    })

    print(f"\nTest Results: Loss: {test_loss:.4f}, MAE: {test_mae:.4f}, Accuracy: {test_accuracy:.4f}")

    wandb.finish()


In [14]:
if __name__ == "__main__":
    data_dir = "/content/drive/MyDrive/soccernet"
    test_split = os.path.join(data_dir, "test.json")
    test_model(data_dir, test_split, batch_size=4, device="cuda")

Test dataset size: 10 games
Test Batch 1/3, Test Loss: 0.6673, Test MAE: 0.1472
Test Batch 2/3, Test Loss: 0.6937, Test MAE: 0.1594
Test Batch 3/3, Test Loss: 0.6939, Test MAE: 0.1525

All events and their predicted scores (sorted match-wise in decreasing order):

Match 1:
Event: Kick-off, Predicted Score: 0.4977
Event: Ball out of play, Predicted Score: 0.4401
Event: Ball out of play, Predicted Score: 0.4400
Event: Offside, Predicted Score: 0.4398
Event: Ball out of play, Predicted Score: 0.4391
Event: Throw-in, Predicted Score: 0.4364
Event: Indirect free-kick, Predicted Score: 0.4355
Event: Ball out of play, Predicted Score: 0.4344
Event: Shots on target, Predicted Score: 0.4340
Event: Clearance, Predicted Score: 0.4332
Event: Foul, Predicted Score: 0.4331
Event: Foul, Predicted Score: 0.4329
Event: Ball out of play, Predicted Score: 0.4326
Event: Clearance, Predicted Score: 0.4321
Event: Ball out of play, Predicted Score: 0.4320
Event: Ball out of play, Predicted Score: 0.4316
Even

0,1
test_accuracy,▁
test_loss,▁
test_mae,▁

0,1
test_accuracy,0.008
test_loss,0.68497
test_mae,0.15252
