In [14]:
import numpy as np
import kmedoids
import random
import pickle
import matplotlib.pyplot as plt
import math
from gensim import models, corpora
from pathlib import Path
import sys

# Project root is the parent of the notebook's working directory. It is
# resolved once and shared by the import path and the data path, so the two
# cannot point at different locations when the working directory is reached
# through a symlink.
PROJECT_ROOT = Path.cwd().resolve().parent
ROOT = PROJECT_ROOT  # kept as an alias for code that still refers to ROOT

# Put the project root first on sys.path so `playstyle_utils` can be imported.
# The guard keeps re-running this cell from stacking duplicate entries.
_root_str = str(PROJECT_ROOT)
if sys.path[:1] != [_root_str]:
    sys.path.insert(0, _root_str)

# Reload so edits made to playstyle_utils.py are picked up without restarting
# the kernel; the `from ... import` below must come after the reload so the
# names bind to the reloaded module's objects.
import importlib
import playstyle_utils
importlib.reload(playstyle_utils)

from playstyle_utils import compute_dtw_distance_matrix, assign_to_nearest_medoids, split, compute_stability_metric, compute_club_topic_distributions, aitchison_similarity

# Directory holding the derived (pickled) inputs read by the cells below.
DATA = PROJECT_ROOT / "data" / "derived"

In [2]:
def _load_pickle(filename):
    """Return the object unpickled from ``DATA / filename``."""
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by this project's own pipeline.
    with open(DATA / filename, "rb") as handle:
        return pickle.load(handle)


# Derived inputs used by the analysis cells below.
sample_movement_chains = _load_pickle("sample_movement_chains.pkl")
movement_chain_coords = _load_pickle("movement_chain_coords.pkl")
movement_chain_clusters = _load_pickle("movement_chain_clusters.pkl")
match_movement_chains_coords = _load_pickle("match_movement_chains_coords.pkl")

In [None]:
# --- Choose the number of clusters by stability ------------------------------
# For every candidate k and every sampling run:
#   1. draw a seeded random sample of movement chains,
#   2. build their DTW distance matrix and run k-medoids (`kmedoids.fastpam1`,
#      BUILD initialisation),
#   3. pass the medoids and `match_movement_chains_coords` to
#      `assign_to_nearest_medoids`,
#   4. score the result with `compute_stability_metric` on `n_splits` seeded
#      splits and average the two returned scores separately.
# The per-k averages end up in `cluster_stability` (first score) and
# `cluster_stability_3` (second score; plotted elsewhere as "Top-3").

# Number of chains sampled per run for the distance matrix.
# NOTE(review): this constant used to be declared as 20000 but was never used;
# the sampling call hard-coded 1000, so 1000 is the value that actually ran.
# It is now the single source of truth -- confirm which size was intended.
sample_size = 1000
cluster_candidates = range(10, 130, 10)
n_samples = 2  # independent sampling runs per candidate k
n_splits = 2   # seeded splits scored per run

cluster_stability = {}
cluster_stability_3 = {}

# The sample depends only on `run` (the RNG is re-seeded with `run` below), so
# the distance matrix is identical for every k. Cache it per run instead of
# recomputing it for each candidate.
# NOTE(review): assumes compute_dtw_distance_matrix is deterministic and that
# fastpam1 does not modify the matrix it is given -- confirm.
distance_matrix_by_run = {}

for k in cluster_candidates:
    print(f"Number of clusters: {k}")

    stability_n_samples = []
    stability_n_samples_3 = []
    for run in range(n_samples):
        print(f"Run: {run}")
        # Re-seeding per run makes every k see the same sample for a given run.
        random.seed(run)
        array_coordinates_list = random.sample(movement_chain_coords, k=sample_size)
        if run not in distance_matrix_by_run:
            distance_matrix_by_run[run] = compute_dtw_distance_matrix(array_coordinates_list)
        distance_matrix = distance_matrix_by_run[run]
        medoid_indices = kmedoids.fastpam1(distance_matrix, k, init="build")

        # `.medoids` holds indices into the sampled list; look up the chains.
        medoid_data = [array_coordinates_list[m] for m in medoid_indices.medoids]

        assignments = assign_to_nearest_medoids(medoid_indices.medoids, medoid_data, match_movement_chains_coords)
        scores = []
        scores_top3 = []
        for t in range(n_splits):
            split_result = split(assignments, seed=t)
            score, score3 = compute_stability_metric(split_result)
            scores.append(score)
            scores_top3.append(score3)
        avg_score = float(np.mean(scores))
        avg_score_3 = float(np.mean(scores_top3))
        stability_n_samples.append(avg_score)
        stability_n_samples_3.append(avg_score_3)

    cluster_stability[k] = np.mean(stability_n_samples)
    cluster_stability_3[k] = np.mean(stability_n_samples_3)

In [None]:
# Pull the per-k scores out in candidate order so they line up with the x-axis.
stabilities1 = list(map(cluster_stability.__getitem__, cluster_candidates))
stabilities3 = list(map(cluster_stability_3.__getitem__, cluster_candidates))

fig, ax = plt.subplots(figsize=(10, 6))

# Both curves share the same marker/line style; only colour and label differ.
ax.plot(
    cluster_candidates, stabilities3,
    marker='o', linestyle='-', color='blue', label='Top-3 Stability',
)
ax.plot(
    cluster_candidates, stabilities1,
    marker='o', linestyle='-', color='red', label='Top-1 Stability',
)

ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Team Stability')
ax.set_xticks(cluster_candidates)  # one tick per evaluated k
ax.grid(True)

# Legend sits outside the axes, to the right, without a frame.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)

fig.tight_layout()
plt.show()

In [None]:
# --- Choose the number of LDA topics -----------------------------------------
# For every candidate topic count, repeat `n_runs` times: split each club's
# matches roughly in half, fit LDA on the training half, build per-club topic
# distributions for both halves with `compute_club_topic_distributions`, and
# score each club by  S_self * (1 - S_others), where S_self is the similarity
# of the club's train distribution to its own test distribution and S_others
# is its mean similarity to every other club's test distribution.

# gensim dictionaries expect string tokens, so stringify every cluster id.
movement_chain_clusters = {
    key: [str(token) for token in doc]
    for key, doc in movement_chain_clusters.items()
}

# Group match keys by club; the club is the second "_"-separated field of the key.
club_matches = {}
for match in movement_chain_clusters:
    club = match.split('_')[1]
    club_matches.setdefault(club, []).append(match)

dictionary = corpora.Dictionary(list(movement_chain_clusters.values()))

topic_candidates = range(2, 11)
n_runs = 10  # random train/test splits (and LDA seeds) per topic count

results = []

for num_topics in topic_candidates:
    scores = []
    for run in range(n_runs):
        print(f"Number of topics: {num_topics}, run: {run}")

        # Dedicated seeded generator: the split no longer depends on whatever
        # state the global `random` module was left in by earlier cells, so a
        # given (num_topics, run) always yields the same train/test split.
        split_rng = random.Random(1000 * num_topics + run)

        # Data splitting: per club, half of the matches go to train; for an odd
        # count the extra match goes to train or test at random.
        # (The cut-off is named `n_train`, not `split`, so it does not
        # overwrite the `split` helper imported from playstyle_utils.)
        train, test = [], []
        for matches in club_matches.values():
            m = matches.copy()
            split_rng.shuffle(m)
            n_train = len(m) // 2
            if len(m) % 2:
                n_train += split_rng.choice([0, 1])
            train += m[:n_train]
            test += m[n_train:]
        train_distr = {m: movement_chain_clusters[m] for m in train}
        test_distr = {m: movement_chain_clusters[m] for m in test}

        # Prepare corpus and train LDA
        corpus_train = [dictionary.doc2bow(doc) for doc in train_distr.values()]
        lda = models.LdaModel(
            corpus=corpus_train,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=run,
            passes=100,
            iterations=100,
            alpha=1/num_topics,
            eta=1/num_topics,
            # NOTE(review): presumably chosen to be at least the number of
            # training documents so each pass is a single chunk -- confirm.
            chunksize=3652
        )

        # Compute composite score
        train_topics = compute_club_topic_distributions(train_distr, dictionary, lda, num_topics)
        test_topics = compute_club_topic_distributions(test_distr, dictionary, lda, num_topics)

        run_scores = []
        for club, v_train in train_topics.items():
            if club in test_topics:
                S_self = aitchison_similarity(v_train, test_topics[club])
                others = [aitchison_similarity(v_train, v)
                          for c, v in test_topics.items() if c != club]
                S_others = np.mean(others) if others else 0
                run_scores.append(S_self * (1 - S_others))

        # Guard the empty case once and reuse the value, so the printed score
        # is always the one that was recorded (no nan / RuntimeWarning).
        run_score = np.mean(run_scores) if run_scores else 0
        scores.append(run_score)
        print(run_score)

    mean_score = np.mean(scores)
    var_score = np.var(scores, ddof=1)  # sample variance across runs
    results.append((num_topics, mean_score, var_score))
    print(num_topics, mean_score, var_score)

# Print the table
for K, m, v in results:
    print(f"{K}  {m}  {v}")