In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import OPTICS
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline
import seaborn as sns

In [2]:
# Read the raw events table from the CSV file on disk
data_path = '../data/events.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Keep only the rows where both shot_place and text are present
has_shot_place = data['shot_place'].notna()
has_text = data['text'].notna()
shots_data = data[has_shot_place & has_text]

In [4]:
# Sample the dataset for quicker execution.
# The request is capped at the number of available rows: sampling without
# replacement raises ValueError when n is larger than the frame.
sample_size = min(10000, len(shots_data))
shots_data = shots_data.sample(n=sample_size, random_state=42)
print("Reduced dataset shape:", shots_data.shape)

Reduced dataset shape: (10000, 22)


In [5]:
# Column groups consumed by the preprocessing cells below:
# one-hot encoded columns, standard-scaled columns, and the free-text column.
categorical_columns = [
    'side',
    'bodypart',
    'assist_method',
    'situation',
    'fast_break',
    'event_team',
    'opponent',
]
numerical_columns = [
    'time',
    'shot_outcome',
    'location',
    'is_goal',
]
text_column = 'text'

In [6]:
# Impute gaps in the numerical columns with each column's own mean
column_means = shots_data[numerical_columns].mean()
shots_data[numerical_columns] = shots_data[numerical_columns].fillna(column_means)
print("Data after filling missing values:")
print(shots_data.head())

Data after filling missing values:
          id_odsp    id_event  sort_order  time  \
763533  MHZSkeoD/  MHZSkeoD88          88    84   
143031  CSIVz9Rt/  CSIVz9Rt60          60    77   
901230  AT8BU3Vt/  AT8BU3Vt97          97    87   
127033  dfsta8kI/  dfsta8kI38          38    39   
659151  Qa1FLFwr/  Qa1FLFwr20          20    29   

                                                     text  event_type  \
763533  Goal!  Nantes 2, Bordeaux 1. Cheick DiabatA© (...           1   
143031  Goal!  FC Nurnberg 1, Bayer Leverkusen 3. Andr...           1   
901230  Attempt blocked. Alexandre Mendy (Guingamp) ri...           1   
127033  Attempt blocked. Alejandro Alfaro (Mallorca) r...           1   
659151  Attempt missed. Ola Toivonen (Rennes) right fo...           1   

        event_type2  side        event_team          opponent  ... player_in  \
763533         12.0     2          Bordeaux            Nantes  ...       NaN   
143031         12.0     2  Bayer Leverkusen          Nurnbe

In [7]:
# Equalise the shot_place classes by random over-sampling, then rebuild the
# frame from the resampled rows (this overwrites shots_data).
# NOTE(review): over-sampling repeats existing rows; exact duplicates sit at
# distance 0 from each other, which may be what triggers the divide warning in
# the OPTICS reachability ratio further down -- confirm that balancing is
# wanted before density-based clustering.
ros = RandomOverSampler(random_state=42)
X_balanced, y_balanced = ros.fit_resample(
    shots_data,
    shots_data['shot_place'],
)
shots_data = pd.DataFrame(X_balanced, columns=shots_data.columns)
print("Balanced dataset shape:", shots_data.shape)

Balanced dataset shape: (30875, 22)


In [8]:
# Turn the free-text column into a dense TF-IDF matrix
# (vocabulary capped at 500 terms, English stop words removed)
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
tfidf_matrix = vectorizer.fit_transform(shots_data[text_column])
text_features = tfidf_matrix.toarray()
print("Text features shape:", text_features.shape)

Text features shape: (30875, 500)


In [9]:
# Column-wise preprocessing: standard-scale the numerical columns, one-hot
# encode the categorical ones, and drop every column not listed in either group.
numeric_step = ('num', StandardScaler(), numerical_columns)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    categorical_columns,
)
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='drop',
)

In [10]:
# Apply preprocessing.
# ColumnTransformer returns a sparse matrix or a dense ndarray depending on
# the density of the stacked output, so densify only when the result actually
# is sparse; calling .toarray() unconditionally fails on an ndarray.
processed_data = preprocessor.fit_transform(shots_data)
if hasattr(processed_data, "toarray"):
    processed_data = processed_data.toarray()
print("Processed numerical and categorical data shape:", processed_data.shape)

Processed numerical and categorical data shape: (30875, 297)


In [11]:
# Place the scaled/encoded columns and the TF-IDF columns side by side
feature_blocks = (processed_data, text_features)
final_data = np.hstack(feature_blocks)
print("Final combined data shape:", final_data.shape)

Final combined data shape: (30875, 797)


In [12]:
# Reduce the combined matrix with PCA; a fractional n_components keeps as many
# components as are needed to reach that share of explained variance (95%).
pca = PCA(n_components=0.95)
reduced_data = pca.fit_transform(X=final_data)
print("PCA reduced data shape:", reduced_data.shape)

PCA reduced data shape: (30875, 243)


In [13]:
# Alternative pipeline: compress only the TF-IDF block with TruncatedSVD
# (200 components) and append it to the unreduced numerical/categorical block.
svd = TruncatedSVD(random_state=42, n_components=200)
text_reduced = svd.fit_transform(X=text_features)
alt_blocks = (processed_data, text_reduced)
alt_data = np.hstack(alt_blocks)
print("Data with SVD-reduced text shape:", alt_data.shape)

Data with SVD-reduced text shape: (30875, 497)


In [14]:
# Function to fit OPTICS and evaluate metrics
def fit_and_evaluate(data, description, silhouette_sample_size=None, random_state=42):
    """Cluster ``data`` with OPTICS and print internal validity metrics.

    OPTICS assigns the label -1 to samples it treats as noise. Noise is not a
    cluster, so it is excluded both from the cluster count and from the rows
    passed to the metrics; otherwise every score would be computed as if the
    noise points formed one more cluster.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
        Feature matrix to cluster. Must support boolean-mask row indexing.
    description : str
        Human-readable name of the dataset, printed ahead of the results.
    silhouette_sample_size : int or None, default=None
        Forwarded to ``silhouette_score`` as ``sample_size``. ``None`` scores
        every clustered point, which needs all pairwise distances and gets
        slow on large inputs; pass an integer to score a random subset.
    random_state : int, default=42
        Seed for the silhouette subset; only used when
        ``silhouette_sample_size`` is not ``None``.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster label per row as produced by OPTICS, noise rows included (-1).
    """
    print(f"\nClustering for: {description}")
    optics_model = OPTICS(min_samples=10, metric='euclidean', cluster_method='xi')
    optics_model.fit(data)
    labels = optics_model.labels_

    # Separate real cluster members from noise (-1) before counting/scoring.
    clustered = labels != -1
    cluster_ids = set(labels[clustered])
    noise_count = int((~clustered).sum())
    # Print a summary rather than the whole label set, which can run to
    # hundreds of entries and floods the cell output.
    print(f"Clusters found: {len(cluster_ids)} (noise points: {noise_count} of {len(labels)})")

    if len(cluster_ids) > 1:
        clustered_data = data[clustered]
        clustered_labels = labels[clustered]
        silhouette = silhouette_score(
            clustered_data,
            clustered_labels,
            metric='euclidean',
            sample_size=silhouette_sample_size,
            random_state=random_state,
        )
        davies_bouldin = davies_bouldin_score(clustered_data, clustered_labels)
        calinski_harabasz = calinski_harabasz_score(clustered_data, clustered_labels)
        print(f"Silhouette Score: {silhouette:.4f}")
        print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
        print(f"Calinski-Harabasz Score: {calinski_harabasz:.4f}")
    else:
        print("Clustering metrics cannot be computed with fewer than 2 clusters.")

    return labels

In [15]:
# Cluster each of the three feature matrices in turn, keeping the labels of each run
labels_pca = fit_and_evaluate(
    data=reduced_data,
    description="PCA-Reduced Data",
)
labels_alt = fit_and_evaluate(
    data=alt_data,
    description="SVD-Reduced Text Data",
)
labels_full = fit_and_evaluate(
    data=final_data,
    description="Full Data (Numerical + Categorical + Text)",
)


Clustering for: PCA-Reduced Data


  ratio = reachability_plot[:-1] / reachability_plot[1:]


Clusters found: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218

KeyboardInterrupt: 

In [None]:
# Project the PCA-reduced matrix onto two components and colour each point by
# its label from the PCA-data clustering run.
pca_2d = PCA(n_components=2)
pca_2d_data = pca_2d.fit_transform(reduced_data)

fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x=pca_2d_data[:, 0],
    y=pca_2d_data[:, 1],
    hue=labels_pca,
    palette='tab20',
    legend='full',  # one legend entry per distinct label value
    ax=ax,
)
ax.set_title('PCA Visualization of Clusters (PCA-Reduced Data)')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
# Anchor the legend outside the axes, to the right of the plot area
ax.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Attach the labels from the PCA-reduced clustering run and write the frame to CSV.
# (No comparison between runs is made here; the PCA-run labels are what gets saved.)
shots_data['cluster'] = labels_pca
shots_data.to_csv(path_or_buf='best_clustered_shots.csv', index=False)

In [None]:
# Row count per shot_place. At this point shots_data is the frame rebuilt from
# the RandomOverSampler output (30875 rows), not the original 10000-row sample,
# so these counts describe the distribution after balancing.
shot_place_counts = shots_data['shot_place'].value_counts()
shot_place_counts