In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import silhouette_score
from IPython.display import clear_output

# Read the tournament deck export into a DataFrame; the path is relative to
# the directory the notebook is run from
file_path = "tournament_decks.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Parse the "cards" column to extract card links and counts as features
def parse_cards(cards):
    """Turn a serialized deck list into a ``{card_link: count}`` mapping.

    The input is a ``|``-separated list of entries, each of which is a
    ``#``-separated record. Field index 2 is read as the number of copies and
    field index 3 as the card link (e.g. ``P#Charmander#3#<link>``).

    Parameters
    ----------
    cards : str
        Raw value of the ``cards`` column. Non-string values (such as the NaN
        pandas produces for an empty CSV cell, or None) are treated as an
        empty deck instead of raising.

    Returns
    -------
    dict
        Card link -> total number of copies. Entries with fewer than four
        fields are skipped. If the same link occurs more than once, its
        counts are summed rather than the last one winning.

    Raises
    ------
    ValueError
        If the count field of an entry is not an integer literal.
    """
    # Missing cells arrive as float NaN / None, which have no .split()
    if not isinstance(cards, str):
        return {}

    card_features = {}
    for card in cards.split('|'):
        parts = card.split('#')
        if len(parts) < 4:
            # Malformed or empty entry (e.g. from an empty string): ignore it
            continue
        card_link = parts[3]  # Use the card link as the feature
        card_count = int(parts[2])  # Use the card count as the value
        # Accumulate so a repeated link does not overwrite its earlier count
        card_features[card_link] = card_features.get(card_link, 0) + card_count
    return card_features

# Keep only decks with rank <= 16. The explicit .copy() makes `data` an
# independent frame, so the column assignments below and in later cells do
# not write into a slice of the originally loaded frame (the situation that
# triggers pandas' SettingWithCopyWarning).
data = data[data['rank'] <= 16].copy()

# Parse each remaining deck's "cards" string into a {card_link: count} dict.
# Done after filtering so discarded rows are never parsed.
data['card_features'] = data['cards'].apply(parse_cards)

# Convert the dictionaries into a sparse feature matrix (one column per card link)
vectorizer = DictVectorizer(sparse=True)
X = vectorizer.fit_transform(data['card_features'])

# Scale each feature to unit variance. with_mean=False skips centering, which
# keeps the matrix sparse.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)

In [None]:
# Find the optimal number of clusters
# NOTE: this fits one KMeans model and computes one silhouette score per
# candidate value, so the cell is slow. The figure is redrawn after every fit
# so progress can be followed while it runs.
silhouette_scores = []
wcss = []
evaluated_clusters = []  # x-axis values, kept in lockstep with the two score lists
cluster_range = range(100, 1000)
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, cluster_labels))
    wcss.append(kmeans.inertia_)
    evaluated_clusters.append(n_clusters)

    # Replace the previous iteration's output instead of stacking figures
    clear_output(wait=True)
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Plot silhouette scores. The x values come from the tracked list, so the
    # plot stays correct if cluster_range's start or step is changed.
    axes[0].plot(evaluated_clusters, silhouette_scores, marker='o')
    axes[0].set_title('Silhouette Scores vs. Number of Clusters')
    axes[0].set_xlabel('Number of Clusters')
    axes[0].set_ylabel('Silhouette Score')

    # Plot WCSS (KMeans inertia)
    axes[1].plot(evaluated_clusters, wcss, marker='o')
    axes[1].set_title('WCSS vs. Number of Clusters')
    axes[1].set_xlabel('Number of Clusters')
    axes[1].set_ylabel('WCSS')

    # Show the plots, then release the figure explicitly so the many
    # iterations cannot accumulate open figures
    plt.show()
    plt.close(fig)

In [5]:
# Fit the final model with the chosen number of clusters and tag every deck
# with the index of the cluster it was assigned to
num_clusters = 400
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(X_scaled)
data['cluster'] = kmeans.labels_

In [6]:
# Per-cluster summary: how many decks fall in each cluster and the mean,
# minimum and maximum rank of those decks
rank_summary_spec = {
    'cluster_size': pd.NamedAgg(column='cluster', aggfunc='size'),
    'mean_rank': pd.NamedAgg(column='rank', aggfunc='mean'),
    'min_rank': pd.NamedAgg(column='rank', aggfunc='min'),
    'max_rank': pd.NamedAgg(column='rank', aggfunc='max'),
}
decks_by_cluster = data.groupby('cluster')
cluster_stats = decks_by_cluster.agg(**rank_summary_spec).reset_index()

# Display the summary table
cluster_stats

Unnamed: 0,cluster,cluster_size,mean_rank,min_rank,max_rank
0,0,2,2.000000,2,2
1,1,271,7.771218,1,16
2,2,213,8.136150,1,16
3,3,1,6.000000,6,6
4,4,4,12.000000,5,16
...,...,...,...,...,...
395,395,1,5.000000,5,5
396,396,1,6.000000,6,6
397,397,3,8.000000,5,11
398,398,16,7.687500,1,14


In [77]:
# Summarize each cluster: mean rank, size, and its most common main and
# secondary Pokemon
TARGET_POKEMON = 'charizard'  # archetype to inspect
MIN_CLUSTER_SIZE = 20         # clusters must have more decks than this to be shown
TOP_N = 20                    # maximum number of clusters to display


def _most_common(values):
    """Return the most frequent value in ``values``, or None if there is none.

    ``Series.mode()`` is evaluated once per group; when several values tie,
    the first entry of the returned Series is used.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


cluster_mean_rank = data.groupby('cluster').agg(
    mean_rank=('rank', 'mean'),  # Mean rank of decks in each cluster
    cluster_size=('cluster', 'size'),  # Number of decks in each cluster
    main_pokemon=('mainpokemon', _most_common),  # Most common main Pokemon
    secondary_pokemon=('secondarypokemon', _most_common),  # Most common secondary Pokemon
).reset_index()

# Sort clusters by mean rank, lowest mean rank first
cluster_mean_rank = cluster_mean_rank.sort_values(by='mean_rank')

# Global display setting: frames shown by this and later cells are not row-truncated
pd.options.display.max_rows = None

# Keep sufficiently large clusters whose typical main or secondary Pokemon is the target
is_large = cluster_mean_rank['cluster_size'] > MIN_CLUSTER_SIZE
features_target = (
    (cluster_mean_rank['main_pokemon'] == TARGET_POKEMON)
    | (cluster_mean_rank['secondary_pokemon'] == TARGET_POKEMON)
)
cluster_mean_rank[is_large & features_target].head(TOP_N)


Unnamed: 0,cluster,mean_rank,cluster_size,main_pokemon,secondary_pokemon
247,247,7.604167,48,charizard,dudunsparce
23,23,7.925,80,noctowl,charizard
38,38,7.968553,159,charizard,noctowl
39,39,8.235294,34,charizard,pidgeot
382,382,8.731707,82,charizard,pidgeot
240,240,8.818182,77,charizard,pidgeot
176,176,8.833333,72,dragapult,charizard
322,322,8.987179,78,noctowl,charizard
399,399,9.30303,33,charizard,pidgeot


In [79]:
# Show the decks assigned to cluster 247, ordered by ascending rank; decks
# with equal rank are ordered by tournament URL, descending
in_cluster_247 = data['cluster'] == 247
data.loc[in_cluster_247].sort_values(by=['rank', 'tournamenturl'], ascending=[True, False])

Unnamed: 0,tournamenturl,playername,mainpokemon,secondarypokemon,deckurl,rank,cards,card_features,cluster
3802,https://limitlesstcg.com/tournaments/jp/2082,はにゅ,charizard,dudunsparce,https://limitlesstcg.com/decks/list/jp/31099,1,P#Charmander#3#https://limitlesstcg.com/cards/...,"{'https://limitlesstcg.com/cards/MEW/4': 3, 'h...",247
7092,https://limitlesstcg.com/tournaments/jp/2291,しげ,charizard,dudunsparce,https://limitlesstcg.com/decks/list/jp/34338,2,P#Charmander#3#https://limitlesstcg.com/cards/...,"{'https://limitlesstcg.com/cards/MEW/4': 3, 'h...",247
6348,https://limitlesstcg.com/tournaments/jp/2242,ソラト,charizard,dudunsparce,https://limitlesstcg.com/decks/list/jp/33604,2,P#Charmander#3#https://limitlesstcg.com/cards/...,"{'https://limitlesstcg.com/cards/MEW/4': 3, 'h...",247
3484,https://limitlesstcg.com/tournaments/jp/2061,マサシ,charizard,dudunsparce,https://limitlesstcg.com/decks/list/jp/30787,2,P#Charmander#4#https://limitlesstcg.com/cards/...,"{'https://limitlesstcg.com/cards/MEW/4': 4, 'h...",247
3308,https://limitlesstcg.com/tournaments/jp/2050,しょ,charizard,pidgeot,https://limitlesstcg.com/decks/list/jp/30613,2,P#Charmander#3#https://limitlesstcg.com/cards/...,"{'https://limitlesstcg.com/cards/MEW/4': 3, 'h...",247
3148,https://limitlesstcg.com/tournaments/jp/2040,ぶっちー,charizard,dudunsparce,https://limitlesstcg.com/decks/list/jp/30455,2,P#Charmander#3#https://limitlesstcg.com/cards/...,"{'https://limitlesstcg.com/cards/MEW/4': 3, 'h...",247
1004,https://limitlesstcg.com/tournaments/jp/1906,すん,dudunsparce,charizard,https://limitlesstcg.com/decks/list/jp/28326,2,P#Charmander#3#https://limitlesstcg.com/cards/...,"{'https://limitlesstcg.com/cards/MEW/4': 3, 'h...",247
972,https://limitlesstcg.com/tournaments/jp/1904,おぐら,charizard,dudunsparce,https://limitlesstcg.com/decks/list/jp/28294,2,P#Dunsparce#3#https://limitlesstcg.com/cards/P...,"{'https://limitlesstcg.com/cards/PAL/156': 3, ...",247
6333,https://limitlesstcg.com/tournaments/jp/2241,しらたま,charizard,pidgeot,https://limitlesstcg.com/decks/list/jp/33589,3,P#Charmander#3#https://limitlesstcg.com/cards/...,"{'https://limitlesstcg.com/cards/MEW/4': 3, 'h...",247
3581,https://limitlesstcg.com/tournaments/jp/2067,りょうま,charizard,dudunsparce,https://limitlesstcg.com/decks/list/jp/30881,3,P#Charmander#2#https://limitlesstcg.com/cards/...,"{'https://limitlesstcg.com/cards/MEW/4': 2, 'h...",247
