<a href="https://colab.research.google.com/github/violinBhoy/Rank-Leagues/blob/main/RankLeagues.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [2]:
#tie the league numbers to a country
leagues = {
    0:"england",
    1:"scotland",
    2: "republic of ireland",
    3: "portugal",
    4: "spain",
    5: "italy",
    6: "switzerland",
    7: "austria",
    8: "germany",
    9: "czech republic",
    10: "poland",
    11: "slovakia",
    12: "hungary",
    13: "slovenia",
    14: "croatia",
    15: "bosnia",
    16: "montenegro",
    17: "albania",
    18: "greece",
    19: "serbia",
    20: "north macedonia",
    21: "bulgaria",
    22: "romania",
    23: "moldova",
    24: "turkey",
    25: "ukraine",
    26: "belarus",
    27: "lithuania",
    28: "latvia",
    29: "estonia",
    30: "norway",
    31: "sweden",
    32: "finland",
    33: "belgium",
    34: "netherlands",
    35: "france",
    36: "denmark",
    37: "israel",
    38: "azerbaijan",
    39: "cyprus",
    40: "iceland",
    41: "kazakhstan",
    42: "armenia",
    43: "wales",
    44: "northern ireland",
    45: "georgia",
    46: "luxembourg",
    47: "MLS"
}

Centroid: the middle of the cluster

In [9]:
# Load the data
df = pd.read_csv("https://raw.githubusercontent.com/violinBhoy/dataForML/refs/heads/main/leagues.csv")

# put the features into an array
features = df[['squad_difference', 'num_winners', 'squad_average', 'internationals', 'points_difference']].values

# scale the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
scaled_features_array = np.array(scaled_features)

# multiply a feature by 10
scaled_features_array[:, 0] *= 1
scaled_features_array[:, 1] *= 1
scaled_features_array[:, 2] *= 1
scaled_features_array[:, 3] *= 1
scaled_features_array[:, 4] *= 1

# Convert back to tensor
X = tf.convert_to_tensor(scaled_features_array, dtype=tf.float32)

# parameters
num_clusters = 4
num_iterations = 100
# Define random initial centroids
#random numbers of num_clusters with a length of scaled_features of type int to create centroids
points_idx = tf.random.uniform([num_clusters], 0, len(scaled_features_array), dtype=tf.int32)
# make the centroid one of the points
centroids = tf.gather(X, points_idx)

# K-means algorithm
for i in range(num_iterations):
    # Expand dimensions to compute distances between each point and centroid
    expanded_X = tf.expand_dims(X, 1)  # Shape becomes (n_samples, 1, n_features)
    expanded_centroids = tf.expand_dims(centroids, 0)  # Shape becomes (1, n_clusters, n_features)

    # Calculate Euclidean distances - distances between points
    distances = tf.reduce_sum(tf.square(expanded_X - expanded_centroids), axis=2)

    # assign the points to a centroid
    assignments = tf.argmin(distances, axis=1)

    # Update centroids based on assignments
    new_centroids = []
    for i in range(num_clusters):
        # Get points assigned to this cluster
        mask = tf.equal(assignments, i) #identify what the goes to what with boolean mask
        cluster_points = tf.boolean_mask(X, mask) #apply the mask to the points

        # If the cluster is not empty, update its centroid
        # find the average of the points, that is the centroid
        if tf.size(cluster_points) > 0:
            new_centroids.append(tf.reduce_mean(cluster_points, axis=0))
        else:
            # Keep the old centroid if no points are assigned to this cluster
            new_centroids.append(centroids[i])

    #put the centroids into one tf stack
    new_centroids = tf.stack(new_centroids)
    #if the centroids haven't changed, we found the right one
    if tf.reduce_all(tf.equal(centroids, new_centroids)):
        break

    centroids = new_centroids
    #num_iterations += 1

# Final cluster assignments
final_assignments = tf.argmin(tf.reduce_sum(tf.square(tf.expand_dims(X, 1) - tf.expand_dims(centroids, 0)), axis=2), axis=1)

print(final_assignments)

# give the data frame the assignments
df['cluster'] = final_assignments.numpy()

# Print some statistics about the clusters
total_size = 0
for i in range(num_clusters):
    cluster_size = np.sum(df['cluster'] == i)
    total_size += cluster_size
    print(f"Cluster {i} contains {cluster_size} samples")

cluster0 = []
cluster1 = []
cluster2 = []
cluster3 = []

for i in range(total_size):
  if final_assignments[i] == 0:
    cluster0.append(leagues[i])
  elif final_assignments[i] == 1:
    cluster1.append(leagues[i])
  elif final_assignments[i] == 2:
    cluster2.append(leagues[i])
  elif final_assignments[i] == 3:
    cluster3.append(leagues[i])

print(cluster0)
print(cluster1)
print(cluster2)
print(cluster3)
print(num_iterations)

tf.Tensor(
[3 2 0 1 3 3 2 2 3 2 0 2 2 0 2 2 0 0 2 2 0 2 0 0 1 2 2 0 2 2 0 0 0 1 1 3 0
 2 2 2 0 0 2 2 2 0 2 1], shape=(48,), dtype=int64)
Cluster 0 contains 16 samples
Cluster 1 contains 5 samples
Cluster 2 contains 22 samples
Cluster 3 contains 5 samples
['republic of ireland', 'poland', 'slovenia', 'montenegro', 'albania', 'north macedonia', 'romania', 'moldova', 'lithuania', 'norway', 'sweden', 'finland', 'denmark', 'iceland', 'kazakhstan', 'georgia']
['portugal', 'turkey', 'belgium', 'netherlands', 'MLS']
['scotland', 'switzerland', 'austria', 'czech republic', 'slovakia', 'hungary', 'croatia', 'bosnia', 'greece', 'serbia', 'bulgaria', 'ukraine', 'belarus', 'latvia', 'estonia', 'israel', 'azerbaijan', 'cyprus', 'armenia', 'wales', 'northern ireland', 'luxembourg']
['england', 'spain', 'italy', 'germany', 'france']
100
