# PlayerBERT Inference: Player Embeddings & Similarity

This notebook loads the saved EventEncoder and PlayerBERT weights and runs inference to build player embeddings and look up each player's nearest neighbors.


In [1]:
# Colab only: mount Google Drive at /content/drive so later cells can read files from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Show the contents of the MLSE folder on the mounted Drive.
import os
os.listdir('/content/drive/MyDrive/MLSE')

['events360_v4.jsonl', 'models']

In [3]:
from pathlib import Path
import json
import math
import random
from collections import defaultdict
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

# Input locations on the mounted Drive: the event data (JSONL) and the two model checkpoints.
DATA_PATH = Path('/content/drive/MyDrive/MLSE/events360_v4.jsonl')
EVENT_ENCODER_CKPT = Path('/content/drive/MyDrive/MLSE/models/event_encoder_mam.pt')
PLAYERBERT_CKPT = Path('/content/drive/MyDrive/MLSE/models/playerbert_mam.pt')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)


Device: cuda


In [None]:
# Robust JSONL reader

def iter_json_objects(fp):
    """Yield every JSON value found in an iterable of text lines.

    Each line is stripped and may hold zero, one, or several JSON values,
    optionally separated by whitespace. Blank lines produce nothing.

    Args:
        fp: any iterable of strings (an open text file, a list of lines, ...).

    Yields:
        The decoded Python objects, in the order they appear.

    Raises:
        json.JSONDecodeError: propagated from the decoder when the text at
            the current position is not valid JSON.
    """
    decoder = json.JSONDecoder()
    for raw_line in fp:
        text = raw_line.strip()
        cursor, limit = 0, len(text)
        # An empty (blank) line never enters the loop.
        while cursor < limit:
            value, cursor = decoder.raw_decode(text, cursor)
            yield value
            # Step over whitespace separating back-to-back values.
            while cursor < limit and text[cursor].isspace():
                cursor += 1


In [5]:
# EventEncoder components (must match train_event_encoder)

class PlayerMLP(nn.Module):
    """Two-layer perceptron (Linear -> ReLU -> Linear) applied to feature vectors.

    Args:
        in_dim: size of the last dimension of the input.
        hidden: width of the hidden layer.
        out_dim: size of the last dimension of the output.
    """

    def __init__(self, in_dim=6, hidden=64, out_dim=128):
        super().__init__()
        # The attribute name and layer order fix the state-dict keys
        # (net.0.*, net.2.*), so they are kept exactly as in the checkpoint.
        layers = [
            nn.Linear(in_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the result."""
        projected = self.net(x)
        return projected

class SetEncoder(nn.Module):
    """Encode a freeze frame (a set of players) as the mean of per-player embeddings.

    Every player with a usable ``location`` is turned into six features
    relative to the actor: dx, dy, distance, angle, teammate flag, keeper flag.
    The features go through ``PlayerMLP`` and are averaged over the players.
    Frames with no usable player map to a zero vector of width ``out_dim``.

    Args:
        player_dim: number of per-player input features (the code builds 6).
        hidden: hidden width of the per-player MLP.
        out_dim: width of the frame embedding.
    """

    def __init__(self, player_dim=6, hidden=64, out_dim=128):
        super().__init__()
        # Remembered so the zero fallback always has the same width as the MLP
        # output (it used to be hard-coded to 128). Not part of the state dict.
        self.out_dim = out_dim
        self.player_mlp = PlayerMLP(in_dim=player_dim, hidden=hidden, out_dim=out_dim)

    @staticmethod
    def _player_features(player, ax, ay):
        """Return the six actor-relative features for one player, or None if it has no 2-D location."""
        loc = player.get('location')
        if loc is None or len(loc) < 2:
            return None
        dx = float(loc[0]) - ax
        dy = float(loc[1]) - ay
        return [
            dx,
            dy,
            math.sqrt(dx*dx + dy*dy),
            math.atan2(dy, dx),
            1.0 if player.get('teammate', False) else 0.0,
            1.0 if player.get('keeper', False) else 0.0,
        ]

    def forward(self, freeze_frames, actor_locs, device):
        """Embed a batch of freeze frames.

        Args:
            freeze_frames: one entry per event; each entry is None or an
                iterable of player dicts read via ``location``, ``teammate``
                and ``keeper``.
            actor_locs: one ``(x, y)`` pair per event.
            device: device on which the tensors are created.

        Returns:
            Tensor of shape ``(len(batch), out_dim)``.
        """
        batch_embeds = []
        for ff, (ax, ay) in zip(freeze_frames, actor_locs):
            rows = []
            if ff is not None:
                for p in ff:
                    feats = self._player_features(p, ax, ay)
                    if feats is not None:
                        rows.append(feats)
            if not rows:
                # Missing/empty frame, or no player carried a usable location.
                batch_embeds.append(torch.zeros(self.out_dim, device=device))
                continue
            # One tensor per frame instead of one tiny device tensor per player.
            players = torch.tensor(rows, device=device)
            batch_embeds.append(self.player_mlp(players).mean(dim=0))
        if not batch_embeds:
            # torch.stack rejects an empty list; return an empty batch instead.
            return torch.zeros(0, self.out_dim, device=device)
        return torch.stack(batch_embeds, dim=0)

class EventTransformer(nn.Module):
    """Encode an event's categorical features with a small Transformer encoder.

    Each feature has its own value-embedding table. A learned per-feature
    embedding is added to tell the feature slots apart, the resulting tokens
    go through the encoder, and the outputs are mean-pooled over features.

    Args:
        vocab_sizes: mapping ``feature name -> vocabulary size``; its key
            order defines the column order expected by ``forward``.
        d_model: embedding / model width.
        nhead: number of attention heads.
        num_layers: number of encoder layers.
    """

    def __init__(self, vocab_sizes, d_model=128, nhead=4, num_layers=2):
        super().__init__()
        self.features = list(vocab_sizes.keys())
        # Value tables are registered under positional aliases (f0, f1, ...)
        # rather than the raw feature names; presumably because raw names can
        # contain characters that are not valid in module names — TODO confirm.
        self.safe_names = [f"f{i}" for i in range(len(self.features))]
        self.name_map = dict(zip(self.features, self.safe_names))
        self.value_embeds = nn.ModuleDict({
            self.name_map[f]: nn.Embedding(vocab_sizes[f], d_model) for f in self.features
        })
        self.feature_embeds = nn.Embedding(len(self.features), d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, feat_ids):
        """Return one embedding per event.

        Args:
            feat_ids: integer tensor of shape ``(batch, num_features)``;
                column ``i`` holds ids for ``self.features[i]``.

        Returns:
            Tensor of shape ``(batch, d_model)``.

        Raises:
            ValueError: if ``feat_ids`` is not 2-D.
        """
        if feat_ids.dim() != 2:
            raise ValueError(
                f"feat_ids must be 2-D (batch, num_features), got shape {tuple(feat_ids.shape)}"
            )
        value_tokens = [
            self.value_embeds[self.name_map[f]](feat_ids[:, i])
            for i, f in enumerate(self.features)
        ]
        # Row i of the feature table is the embedding of feature i. Adding the
        # table by broadcasting gives the same values as looking each row up
        # separately, without building an index tensor per feature per call.
        x = torch.stack(value_tokens, dim=1) + self.feature_embeds.weight.unsqueeze(0)
        h = self.encoder(x)
        return h.mean(dim=1)

class EventEncoder(nn.Module):
    """Fuse the categorical event embedding and the freeze-frame embedding with a learned gate.

    The gate sees both embeddings concatenated and outputs a per-dimension
    weight in (0, 1); the result is ``g * event + (1 - g) * frame``.

    Args:
        vocab_sizes: mapping ``feature name -> vocabulary size`` forwarded to
            ``EventTransformer``.
    """

    def __init__(self, vocab_sizes):
        super().__init__()
        self.event_encoder = EventTransformer(vocab_sizes)
        self.frame_encoder = SetEncoder()
        # Both sub-encoders are used with their default width of 128, so the
        # gate maps the 256-wide concatenation back to 128.
        gate_layers = (nn.Linear(128 * 2, 128), nn.Sigmoid())
        self.gate = nn.Sequential(*gate_layers)

    def forward(self, feat_ids, freeze_frames, actor_locs, device):
        """Return the gated mix of the event and frame embeddings for a batch."""
        event_vec = self.event_encoder(feat_ids)
        frame_vec = self.frame_encoder(freeze_frames, actor_locs, device)
        joint = torch.cat([event_vec, frame_vec], dim=-1)
        mix = self.gate(joint)
        return mix * event_vec + (1 - mix) * frame_vec


In [6]:
# PlayerBERT

class PlayerBERT(nn.Module):
    """Transformer encoder over a sequence of event embeddings with learned positions.

    Args:
        embed_dim: width of the incoming embeddings and of the model.
        nhead: number of attention heads.
        num_layers: number of encoder layers.
        max_len: size of the position-embedding table.
    """

    def __init__(self, embed_dim=128, nhead=4, num_layers=2, max_len=256):
        super().__init__()
        self.pos_embed = nn.Embedding(max_len, embed_dim)
        layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers)
        # Not read by forward(); kept so the parameter set matches the checkpoint.
        self.mask_token = nn.Parameter(torch.zeros(embed_dim))

    def forward(self, x, attn_mask):
        """Contextualise a batch of sequences.

        Args:
            x: tensor of shape ``(batch, seq_len, embed_dim)``.
            attn_mask: boolean tensor, True at real positions; it is inverted
                to form the encoder's key-padding mask.

        Returns:
            Encoder output with the same shape as ``x``.
        """
        batch_size, seq_len, _ = x.shape
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0).expand(batch_size, -1)
        with_positions = x + self.pos_embed(positions)
        return self.encoder(with_positions, src_key_padding_mask=~attn_mask)


In [7]:
# Load checkpoints

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
# The event-encoder checkpoint holds both the weights and the per-feature vocabularies.
ckpt = torch.load(EVENT_ENCODER_CKPT, map_location='cpu')
feature_vocab = ckpt['feature_vocab']
# One embedding-table size per feature: the number of entries in that feature's vocabulary.
vocab_sizes = {k: len(v) for k, v in feature_vocab.items()}

# Rebuild the encoder with those sizes, restore its weights and switch to eval mode.
encoder = EventEncoder(vocab_sizes).to(device)
encoder.load_state_dict(ckpt['event_encoder'])
encoder.eval()

# Hyperparameters are spelled out explicitly; presumably they must match the training run — TODO confirm.
playerbert = PlayerBERT(embed_dim=128, nhead=4, num_layers=2, max_len=256).to(device)
playerbert.load_state_dict(torch.load(PLAYERBERT_CKPT, map_location='cpu')['playerbert'])
playerbert.eval()

print('Loaded models')


Loaded models


In [None]:
# Helper: build feature ids for a flattened event

# Token substituted for missing or None feature values before the vocabulary lookup.
UNK_TOKEN = '[UNK]'
# Feature order used for every id vector; follows the key order of the checkpoint vocabulary.
FEATURE_LIST = list(feature_vocab.keys())

def build_feat_ids(ev):
    """Turn a flattened event dict into a vector of vocabulary ids.

    One id is produced per entry of ``FEATURE_LIST``, in that order. Missing
    or None values are looked up as ``UNK_TOKEN``; booleans are looked up by
    their string form. Values absent from the vocabulary, and ids that fall
    outside the vocabulary size, become 0.

    Args:
        ev: mapping from feature name to raw value.

    Returns:
        1-D ``torch.long`` tensor with ``len(FEATURE_LIST)`` entries.
    """
    indices = []
    for name in FEATURE_LIST:
        vocab = feature_vocab[name]
        raw = ev.get(name, UNK_TOKEN)
        if isinstance(raw, bool):
            token = str(raw)
        elif raw is None:
            token = UNK_TOKEN
        else:
            token = raw
        position = vocab.get(token, 0)
        # Guard against stored ids that exceed the vocabulary size.
        indices.append(position if position < len(vocab) else 0)
    return torch.tensor(indices, dtype=torch.long)


In [9]:
# Build per-player, per-match sequences ordered by timestamp

def _event_order(event):
    """Sort key for one event: period, minute, second, timestamp string, then index."""
    return (
        event.get('period', 0),
        event.get('minute', 0),
        event.get('second', 0),
        event.get('timestamp', ''),
        event.get('index', 0),
    )


# Group events by (match, player); events missing either id are dropped.
sequences = defaultdict(list)
with DATA_PATH.open('r', encoding='utf-8') as f:
    for ev in iter_json_objects(f):
        match_id, player_id = ev.get('match_id'), ev.get('player.id')
        if match_id is not None and player_id is not None:
            sequences[(match_id, player_id)].append(ev)

# Put every sequence into chronological order in place.
for events in sequences.values():
    events.sort(key=_event_order)

print('Total sequences:', len(sequences))


Total sequences: 2976


In [16]:
# Load built embeddings (fast search)

# Locations of the saved embedding matrix and of the id -> name lookup.
# NOTE(review): no visible cell writes these files; presumably a separate build step produces them — confirm.
EMB_OUT = Path('/content/drive/MyDrive/MLSE/models/player_embeddings.pt')
NAME_OUT = Path('/content/drive/MyDrive/MLSE/models/player_embeddings_names.json')

# NOTE(review): torch.load unpickles the file, so only load it from a trusted source.
emb_data = torch.load(EMB_OUT, map_location='cpu')
# Row i of emb_matrix is used as the embedding of player_ids[i] by the search below.
player_ids = emb_data['player_ids']
emb_matrix = emb_data['embeddings']

with NAME_OUT.open('r', encoding='utf-8') as f:
    player_names = json.load(f)

print('Loaded embeddings:', emb_matrix.shape)


Loaded embeddings: torch.Size([1090, 128])


In [35]:
# Print every player id with its stored name; the name is looked up by str(id) first, then by the raw id.
for pid in player_ids:
    name = player_names.get(str(pid)) or player_names.get(pid)
    print(pid, name)

11086 Burak Yılmaz
8963 Caglar Söyüncü
8286 Leonardo Spinazzola
8541 Kenan Karaman
6954 Giorgio Chiellini
7036 Gianluigi Donnarumma
7173 Leonardo Bonucci
7037 Lorenzo Insigne
12555 Okay Yokuşlu
7038 Manuel Locatelli
7788 Ciro Immobile
11088 Ozan Tufan
10349 Mehmet Zeki Çelik
23558 Merih Demiral
29989 Yusuf Yazıcı
30357 Uğurcan Çakır
6964 Alessandro Florenzi
7024 Jorge Luiz Frello Filho
7131 Domenico Berardi
31042 Cengiz Umut Meraş
8181 Nicolò Barella
7039 Hakan Çalhanoğlu
11514 Giovanni Di Lorenzo
6971 Cengiz Ünder
12293 Kaan Ayhan
30311 İrfan Can Kahveci
6993 Bryan Cristante
21567 İbrahim Halil Dervişoğlu
7156 Federico Chiesa
7471 Andrea Belotti
7777 Federico Bernardeschi
21261 Jonas Older Wind
3043 Christian Dannemann Eriksen
6797 Daniel Wass
16550 Jere Juhani Uronen
38196 Daniel O''Shaughnessy
5536 Yussuf Yurary Poulsen
9725 Joel Pohjanpalo
3570 Pierre-Emile Højbjerg
24455 Paulus Arajuuri
4447 Martin Braithwaite Christensen
26939 Joona Toivio
9476 Teemu Pukki
5534 Simon Thorup Kjær


In [34]:
# Similarity search interface

# Query settings: the player name to look up (compared for exact equality) and how many neighbours to show.
PLAYER_QUERY = 'Wojciech Szczęsny'
TOP_K = 10

def find_similar_players(player_query, top_k=10):
    """Print the players whose embeddings are most cosine-similar to a named player.

    Reads the module-level ``player_ids``, ``player_names`` and ``emb_matrix``.
    The query is compared to stored names for exact equality, and the first
    matching row is used. That row is left out of the results by its index,
    so other players who share the same name are still listed.

    Args:
        player_query: player name exactly as stored in ``player_names``.
        top_k: maximum number of neighbours to print. Fewer are printed when
            the table is smaller; values <= 0 print only the header.

    Returns:
        None. The ranking, or a "Player not found" message, is printed.
    """
    def _name_of(pid):
        # The name table may be keyed by the string form of the id or by the raw id.
        return player_names.get(str(pid)) or player_names.get(pid)

    # The row index comes straight from the scan, so no separate
    # id -> index dictionary has to be rebuilt on every call.
    qidx = next(
        (i for i, pid in enumerate(player_ids) if _name_of(pid) == player_query),
        None,
    )
    if qidx is None:
        print('Player not found:', player_query)
        return

    # Cosine similarity: L2-normalise both sides, then take dot products.
    # The epsilon keeps an all-zero embedding from dividing by zero.
    # NOTE(review): every neighbour in the recorded outputs scores above 0.99.
    # This suggests the embeddings share a large common component;
    # mean-centring emb_matrix first may separate players better — confirm.
    query_vec = emb_matrix[qidx]
    query_vec = query_vec / (query_vec.norm() + 1e-8)
    embs = emb_matrix / (emb_matrix.norm(dim=1, keepdim=True) + 1e-8)
    sims = torch.mv(embs, query_vec)

    print(f"Top {top_k} similar players to {player_query}:")
    wanted = max(top_k, 0)
    # Request one extra hit because the query row normally ranks first, and
    # clamp to the table size so torch.topk cannot raise on small tables.
    k = min(wanted + 1, sims.numel())
    ranked = torch.topk(sims, k=k).indices.tolist()
    neighbours = [idx for idx in ranked if idx != qidx][:wanted]
    for idx in neighbours:
        print(_name_of(player_ids[idx]), float(sims[idx]))

# Run the search for the configured player.
find_similar_players(PLAYER_QUERY, TOP_K)

Top 10 similar players to Wojciech Szczęsny:
Robin Olsen 0.9992178678512573
Thibaut Courtois 0.999181866645813
Maarten Stekelenburg 0.9991317391395569
Milan Borjan 0.9991282224655151
Martin Dúbravka 0.9990377426147461
Francisco Guillermo Ochoa Magaña 0.9990240931510925
Lukáš Hrádecký 0.9990060329437256
Matvey Safonov 0.9988555312156677
Danny Ward 0.9987244009971619
Unai Simón Mendibil 0.9986366033554077


In [36]:
# Same search for another player; TOP_K is reused from the earlier cell.
PLAYER_QUERY = 'Cristiano Ronaldo dos Santos Aveiro'
find_similar_players(PLAYER_QUERY, TOP_K)

Top 10 similar players to Cristiano Ronaldo dos Santos Aveiro:
Robert Lewandowski 0.9993612766265869
Goran Pandev 0.9988412261009216
Álvaro Borja Morata Martín 0.998831033706665
Roland Sallai 0.9987398982048035
Luka Jović 0.9986279606819153
Michael Gregoritsch 0.9986143708229065
Rifat Zhemaletdinov 0.9985833168029785
Roman Yaremchuk 0.9985340237617493
Ciro Immobile 0.9984487295150757
Burak Yılmaz 0.9984136819839478


In [37]:
# Same search for another player; TOP_K is reused from the earlier cell.
PLAYER_QUERY = 'Lionel Andrés Messi Cuccittini'
find_similar_players(PLAYER_QUERY, TOP_K)

Top 10 similar players to Lionel Andrés Messi Cuccittini:
Victoria Pelova 0.9986688494682312
John McGinn 0.9986059069633484
Marie Therese Höbinger 0.9985845685005188
Matheus Luiz Nunes 0.9984534978866577
Frederico Rodrigues Santos 0.9984248280525208
Hirving Rodrigo Lozano Bahena 0.9983970522880554
Ingrid Filippa Angeldal 0.9983841180801392
Ángel Fabián Di María Hernández 0.9982596635818481
Dries Mertens 0.998258113861084
Ao Tanaka 0.9982374310493469


In [39]:
# The doubled apostrophe is intentional: names are compared for exact equality and the stored
# names use this form (the player listing above shows "Daniel O''Shaughnessy").
PLAYER_QUERY = "N''Golo Kanté"
find_similar_players(PLAYER_QUERY, TOP_K)

Top 10 similar players to N''Golo Kanté:
Aurélien Djani Tchouaméni 0.9990404844284058
Sherida Spitse 0.9988766312599182
Taras Stepanenko 0.9987573027610779
William Silva de Carvalho 0.9987404942512512
Chloe Mccarron 0.9980334043502808
Kacper Kozłowski 0.9980299472808838
Dolores Isabel Jacome Silva 0.9979968070983887
Patrik Hrošovský 0.9977272748947144
Kang-In Lee 0.997724175453186
Stephen Antunes Eustáquio 0.9976423382759094
