In [70]:
from gensim.parsing.preprocessing import stem_text
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models 
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
sys.path.append('../')
from utils import word_frequency_per_player
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
import pyLDAvis
from pyLDAvis.gensim_models import prepare



# Load data and convert to stemmed and BERT data 

In [47]:
# load data and drop NA's
df = pd.read_csv('https://github.com/svisel22/SS23-BIPM-Analytics-Lab---Group-4-repository/raw/main/Preprocessing/data_clean/de_clean_2.csv')
df = df.dropna(subset=['data'])

In [48]:
# create two copies 
df_stem = df.copy()

# create stemmed data
df_stem['data'] = df_stem['data'].apply(stem_text)

df_stem

Unnamed: 0,data,player,language,publishedAt
0,trainer xabi alonso mitgereisten fan monaco gl...,exequiel palacios,de,2023-02-24T09:33:31Z
1,sechser droht viereinhalb monaten paus bitter ...,exequiel palacios,de,2023-03-03T21:35:13Z
2,leverkusen sechser erklart budapest spiel jung...,exequiel palacios,de,2023-03-07T11:34:39Z
3,mitchel bakker mittwoch abschluss train leverk...,exequiel palacios,de,2023-03-08T14:25:18Z
5,warum sehe faz net nicht allerd robert andrich...,exequiel palacios,de,2023-03-09T19:53:46Z
...,...,...,...,...
627,bayer leverkusen wehrt europa leagu lang gut s...,piero hincapie,de,2023-05-12T10:02:12Z
628,xabi alonso taktik duell alten lehrmeist jose ...,piero hincapie,de,2023-05-11T21:17:00Z
629,trotz pleit halbfin hinspiel rom gibt leverkus...,piero hincapie,de,2023-05-11T21:09:16Z
630,hitzigen aufeinandertreffen stadio olimpico na...,piero hincapie,de,2023-05-11T20:58:58Z


In [49]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=0.02)
df_tfidf = tfidf.fit_transform(df_stem.data).toarray()

In [50]:
df_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [51]:
#Store the frequency matrix in data_tfidf. 
data_tfidf = pd.DataFrame(df_tfidf, columns=tfidf.vocabulary_.keys())

#data_tfidf['target_names']=sorted_df_ng.target_names.values #we can't use that because otherwise knn wouldn't work
data_tfidf.head()

Unnamed: 0,trainer,xabi,alonso,fan,monaco,exequiel,palacio,tor,euro,geht,...,tschechen,anspruchsvol,gehalten,vergaben,lienhart,kleinen,winter,lautstark,mykhaylo,mudryk
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.188202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
#Store the feature names in a words list.
words = data_tfidf.columns.tolist()[:-2] 
words

['trainer',
 'xabi',
 'alonso',
 'fan',
 'monaco',
 'exequiel',
 'palacio',
 'tor',
 'euro',
 'geht',
 'bundesliga',
 'sonntag',
 'uhr',
 'beim',
 'freiburg',
 'sechser',
 'droht',
 'paus',
 'leverkusen',
 'dafur',
 'schon',
 'lang',
 'zudem',
 'seit',
 'woch',
 'voll',
 'nadiem',
 'amiri',
 'kerem',
 'demirbai',
 'robert',
 'andrich',
 'extrem',
 'fur',
 'zuletzt',
 'zentral',
 'mittelfeld',
 'durft',
 'jahrigen',
 'sech',
 'hertha',
 'bsc',
 'nicht',
 'sicher',
 'mocht',
 'erklart',
 'budapest',
 'spiel',
 'bayer',
 'donnerstag',
 'hinspiel',
 'europa',
 'leagu',
 'sah',
 'defens',
 'mittelfeldspiel',
 'erfolg',
 'dritt',
 'gelb',
 'kart',
 'gesperrt',
 'werkself',
 'ersten',
 'phase',
 'saison',
 'uberzeugt',
 'fussbal',
 'spielt',
 'ware',
 'wahrscheinlich',
 'daher',
 'halbfinal',
 'sorgen',
 'jahrig',
 'mitchel',
 'bakker',
 'abschluss',
 'train',
 'jedoch',
 'startelf',
 'steht',
 'stehen',
 'bereit',
 'jeremi',
 'frimpong',
 'wegen',
 'problem',
 'minuten',
 'platz',
 'mussen',

In [53]:
kmeans = KMeans(n_clusters = 4, max_iter=1000, random_state=42)
kmeans.fit(data_tfidf)



In [54]:
common_words = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]

In [55]:
for num, centroid in enumerate(common_words):
    words_in_centroid = [words[word] for word in centroid if word < len(words)]
    print(f"{num}: {', '.join(words_in_centroid)}")
# By adding the if word < len(words) condition, we ensure that only valid indices are used to access the words list or dictionary.

0: richtig, schiedsricht, dagegen, spielfreud, halben, kassiert, budapest, juventu, folg, fuhrung
1: kollegen, begann, grossen, europa, halben, beiden, viertelfin, belgien, rheinland, uberragenden
2: europa, grund, abseit, kabin, thoma, folg, nehmen, starken, lief, sagt
3: begann, kollegen, halben, europa, dafur, spiel, trafen, sagt, hradecki, kicker


In [56]:
df_stem['cluster'] = kmeans.labels_
df_stem

Unnamed: 0,data,player,language,publishedAt,cluster
0,trainer xabi alonso mitgereisten fan monaco gl...,exequiel palacios,de,2023-02-24T09:33:31Z,1
1,sechser droht viereinhalb monaten paus bitter ...,exequiel palacios,de,2023-03-03T21:35:13Z,3
2,leverkusen sechser erklart budapest spiel jung...,exequiel palacios,de,2023-03-07T11:34:39Z,1
3,mitchel bakker mittwoch abschluss train leverk...,exequiel palacios,de,2023-03-08T14:25:18Z,3
5,warum sehe faz net nicht allerd robert andrich...,exequiel palacios,de,2023-03-09T19:53:46Z,0
...,...,...,...,...,...
627,bayer leverkusen wehrt europa leagu lang gut s...,piero hincapie,de,2023-05-12T10:02:12Z,3
628,xabi alonso taktik duell alten lehrmeist jose ...,piero hincapie,de,2023-05-11T21:17:00Z,3
629,trotz pleit halbfin hinspiel rom gibt leverkus...,piero hincapie,de,2023-05-11T21:09:16Z,3
630,hitzigen aufeinandertreffen stadio olimpico na...,piero hincapie,de,2023-05-11T20:58:58Z,3


In [57]:
clusters = df_stem.groupby(['cluster']).size()
clusters

cluster
0     61
1    242
2     62
3    257
dtype: int64

# LDA Model

In [59]:
# Split the texts into a list of words
texts = [text.split() for text in df_stem['data']]

# Create the Gensim dictionary
dictionary = corpora.Dictionary(texts)

# Remove rare and common words from the dictionary
dictionary.filter_extremes(no_below=118, no_above=0.95)

# Derive the absolute frequency matrix using doc2bow
corpus = [dictionary.doc2bow(text) for text in texts]


In [60]:
lda = models.LdaModel(corpus, num_topics=4, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)


In [61]:
lda.show_topics()

[(0,
  '0.316*"alonso" + 0.171*"xabi" + 0.144*"seit" + 0.103*"beim" + 0.100*"fur" + 0.083*"trainer" + 0.078*"union" + 0.000*"spiel" + 0.000*"uber" + 0.000*"mehr"'),
 (1,
  '0.102*"diabi" + 0.093*"leverkusen" + 0.075*"bayer" + 0.067*"fur" + 0.065*"tor" + 0.061*"frimpong" + 0.060*"moussa" + 0.059*"nicht" + 0.053*"bakker" + 0.052*"uber"'),
 (2,
  '0.189*"leagu" + 0.181*"bayer" + 0.155*"europa" + 0.112*"schon" + 0.102*"leverkusen" + 0.075*"wirtz" + 0.072*"uber" + 0.061*"hinspiel" + 0.050*"florian" + 0.000*"spiel"'),
 (3,
  '0.168*"fur" + 0.156*"nicht" + 0.156*"spiel" + 0.149*"leverkusen" + 0.076*"adli" + 0.067*"bayer" + 0.059*"mehr" + 0.049*"trainer" + 0.040*"bundesliga" + 0.026*"sagt"')]

In [64]:
topics=lda[corpus]
df_stem['lda_predicted_clusters']= [max(topics[i],key=lambda item:item[1])[0] for i in range(len(topics))]

In [65]:
df_stem.head()

Unnamed: 0,data,player,language,publishedAt,cluster,lda_predicted_clusters
0,trainer xabi alonso mitgereisten fan monaco gl...,exequiel palacios,de,2023-02-24T09:33:31Z,1,0
1,sechser droht viereinhalb monaten paus bitter ...,exequiel palacios,de,2023-03-03T21:35:13Z,3,3
2,leverkusen sechser erklart budapest spiel jung...,exequiel palacios,de,2023-03-07T11:34:39Z,1,3
3,mitchel bakker mittwoch abschluss train leverk...,exequiel palacios,de,2023-03-08T14:25:18Z,3,1
5,warum sehe faz net nicht allerd robert andrich...,exequiel palacios,de,2023-03-09T19:53:46Z,0,3


In [71]:
pyLDAvis.enable_notebook()
LDAvis_prepared = prepare(lda, corpus, dictionary)
LDAvis_prepared

## The names of the players take a huge influence on the prediciton therefore we remove them in the next step