In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import OPTICS
from sklearn.metrics import silhouette_score, adjusted_rand_score, davies_bouldin_score, homogeneity_score
from sklearn.manifold import TSNE
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import re
import nltk

In [34]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stoic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\stoic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
# Load the data
events = pd.read_csv("../data/events.csv")

In [36]:
events

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,1,12.0,2,Hamburg SV,Borussia Dortmund,...,,,6.0,2.0,0,9.0,2.0,1,1.0,0
1,UFot0hit/,UFot0hit2,2,4,"Corner, Borussia Dortmund. Conceded by Dennis...",2,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
2,UFot0hit/,UFot0hit3,3,4,"Corner, Borussia Dortmund. Conceded by Heiko ...",2,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
3,UFot0hit/,UFot0hit4,4,7,Foul by Sven Bender (Borussia Dortmund).,3,,1,Borussia Dortmund,Hamburg SV,...,,,,,0,,,0,,0
4,UFot0hit/,UFot0hit5,5,7,Gokhan Tore (Hamburg) wins a free kick in the ...,8,,2,Hamburg SV,Borussia Dortmund,...,,,,,0,2.0,,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941004,z5L2OT5E/,z5L2OT5E123,123,92,Lucas Torreira (Sampdoria) wins a free kick in...,8,,2,Sampdoria,Atalanta,...,,,,,0,2.0,,0,,0
941005,z5L2OT5E/,z5L2OT5E124,124,93,"Corner, Sampdoria. Conceded by Andrea Masiello.",2,,2,Sampdoria,Atalanta,...,,,,,0,,,0,,0
941006,z5L2OT5E/,z5L2OT5E125,125,93,Attempt missed. Fabio Quagliarella (Sampdoria)...,1,12.0,2,Sampdoria,Atalanta,...,,,8.0,2.0,0,9.0,1.0,1,3.0,0
941007,z5L2OT5E/,z5L2OT5E126,126,94,Alberto Grassi (Atalanta) wins a free kick on ...,8,,1,Atalanta,Sampdoria,...,,,,,0,4.0,,0,,0


In [37]:
# --- Step 1: Filter Rows ---
# Filter rows with at least 1000 occurrences per `event_type`
min_count = 1000
event_type_counts = events['event_type'].value_counts()
valid_event_types = event_type_counts[event_type_counts >= min_count].index
filtered_events = events[events['event_type'].isin(valid_event_types)]

In [38]:
# Select 1000 samples for each event_type
balanced_events = filtered_events.groupby('event_type').sample(n=1000, random_state=42)


In [39]:
# Drop rows with missing values in important columns
important_columns = ['text', 'is_goal', 'location', 'assist_method']
balanced_events = balanced_events.dropna(subset=important_columns)

In [40]:
balanced_events

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_in,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
320920,0G8GNYoU/,0G8GNYoU67,67,68,"Goal! Nice 3, Valenciennes 0. Christian BrA¼l...",1,,1,Nice,Valenciennes,...,,,4.0,1.0,1,13.0,3.0,0,1.0,0
314869,2ZNAciJS/,2ZNAciJS11,11,14,Attempt missed. Pierre-Emerick Aubameyang (Bor...,1,12.0,1,Borussia Dortmund,Hamburg SV,...,,,6.0,2.0,0,15.0,1.0,1,1.0,0
751093,MHXMFuYj/,MHXMFuYj5,5,9,Attempt blocked. Odion Ighalo (Watford) right ...,1,12.0,1,Watford,Manchester City,...,,,2.0,3.0,0,11.0,1.0,1,1.0,0
277386,phvfdYJC/,phvfdYJC99,99,84,Attempt missed. Emanuel Herrera (Montpellier) ...,1,12.0,2,Montpellier,Bordeaux,...,,,8.0,2.0,0,15.0,2.0,1,1.0,0
345400,4CVSeAbj/,4CVSeAbj65,65,61,Attempt saved. Miguel Angel de las Cuevas (Osa...,1,12.0,1,Osasuna,Rayo Vallecano,...,,,3.0,1.0,0,8.0,1.0,1,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678605,fVJ4q2Z4/,fVJ4q2Z41,1,2,Alexander Tettey (Norwich City) wins a free ki...,8,,1,Norwich City,Bournemouth,...,,,,,0,2.0,,0,,0
43597,42Hhbxp0/,42Hhbxp010,10,6,Andreas Ibertsberger (1899 Hoffenheim) wins a ...,8,,1,TSG Hoffenheim,Kaiserslautern,...,,,,,0,1.0,,0,,0
779011,0QHbS7IU/,0QHbS7IU2,2,5,Elseid Hysaj (Napoli) wins a free kick in the ...,8,,2,Napoli,Juventus,...,,,,,0,1.0,,0,,0
594601,4QlUIZ9H/,4QlUIZ9H37,37,43,Jan-Ingwer Callsen-Bracker (FC Augsburg) wins ...,8,,1,FC Augsburg,Bayer Leverkusen,...,,,,,0,2.0,,0,,0


In [41]:
# --- Step 2: Text Preprocessing ---
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [42]:
def preprocess_text(text):
    # Remove special characters and lower the text
    text = re.sub(r'\W', ' ', str(text).lower())
    # Tokenize and remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

In [43]:
balanced_events['processed_text'] = balanced_events['text'].apply(preprocess_text)


In [44]:
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
text_features = tfidf_vectorizer.fit_transform(balanced_events['processed_text'])

In [45]:
location_one_hot = pd.get_dummies(balanced_events['location'], prefix='location')
assist_method_one_hot = pd.get_dummies(balanced_events['assist_method'], prefix='assist_method')
numerical_features = balanced_events[['is_goal']].reset_index(drop=True)
numerical_features = pd.concat([numerical_features, location_one_hot, assist_method_one_hot], axis=1)

In [46]:
numerical_features = numerical_features.reset_index(drop=True)

In [47]:
# Standardize Numerical Features
scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)

In [48]:
# Check Shapes for Alignment
print(f"Text Features Shape: {text_features.shape}")
print(f"Numerical Features Shape: {scaled_numerical_features.shape}")

Text Features Shape: (2000, 500)
Numerical Features Shape: (3996, 24)


In [49]:
# Combine Text and Numerical Features
combined_features = np.hstack([text_features.toarray(), scaled_numerical_features])

# Final Shape Check
print(f"Combined Features Shape: {combined_features.shape}")

ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 2000 and the array at index 1 has size 3996