In [103]:
from gensim.parsing.preprocessing import stem_text
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models 
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
sys.path.append('../')
from utils import word_frequency_per_player, remove_similar_rows_per_player
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
import pyLDAvis
from pyLDAvis.gensim_models import prepare
import re




# Load data and convert to stemmed and BERT data 

In [104]:
# load data and drop NA's
df = pd.read_csv('https://github.com/svisel22/SS23-BIPM-Analytics-Lab---Group-4-repository/raw/main/Preprocessing/data_clean/de_clean_2.csv')
df = df.dropna(subset=['data'])
df = remove_similar_rows_per_player(df, df['player'])

In [105]:
# create two copies 
df_stem = df.copy()

# create stemmed data
df_stem['data'] = df_stem['data'].apply(stem_text)

df_stem

Unnamed: 0,data,player,language,publishedAt
0,trainer xabi alonso mitgereisten fan monaco gl...,exequiel palacios,de,2023-02-24T09:33:31Z
1,sechser droht viereinhalb monaten paus bitter ...,exequiel palacios,de,2023-03-03T21:35:13Z
2,leverkusen sechser erklart budapest spiel jung...,exequiel palacios,de,2023-03-07T11:34:39Z
3,mitchel bakker mittwoch abschluss train leverk...,exequiel palacios,de,2023-03-08T14:25:18Z
4,warum sehe faz net nicht allerd robert andrich...,exequiel palacios,de,2023-03-09T19:53:46Z
...,...,...,...,...
80,bayer leverkusen wehrt europa leagu lang gut s...,piero hincapie,de,2023-05-12T10:02:12Z
81,xabi alonso taktik duell alten lehrmeist jose ...,piero hincapie,de,2023-05-11T21:17:00Z
82,trotz pleit halbfin hinspiel rom gibt leverkus...,piero hincapie,de,2023-05-11T21:09:16Z
83,hitzigen aufeinandertreffen stadio olimpico na...,piero hincapie,de,2023-05-11T20:58:58Z


In [106]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=0.02)
df_tfidf = tfidf.fit_transform(df_stem.data).toarray()

In [107]:
df_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [108]:
#Store the frequency matrix in data_tfidf. 
data_tfidf = pd.DataFrame(df_tfidf, columns=tfidf.vocabulary_.keys())

#data_tfidf['target_names']=sorted_df_ng.target_names.values #we can't use that because otherwise knn wouldn't work
data_tfidf.head()

Unnamed: 0,trainer,xabi,alonso,fan,monaco,exequiel,palacio,tor,euro,geht,...,freistehend,anzukurbeln,tschechen,anspruchsvol,gehalten,vergaben,lienhart,kleinen,winter,lautstark
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.187187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
#Store the feature names in a words list.
words = data_tfidf.columns.tolist()[:-2] 
words

['trainer',
 'xabi',
 'alonso',
 'fan',
 'monaco',
 'exequiel',
 'palacio',
 'tor',
 'euro',
 'geht',
 'bundesliga',
 'sonntag',
 'uhr',
 'beim',
 'freiburg',
 'sechser',
 'droht',
 'monaten',
 'paus',
 'leverkusen',
 'dafur',
 'schon',
 'lang',
 'zudem',
 'seit',
 'woch',
 'voll',
 'nadiem',
 'amiri',
 'kerem',
 'demirbai',
 'robert',
 'andrich',
 'extrem',
 'fur',
 'zuletzt',
 'zentral',
 'mittelfeld',
 'durft',
 'jahrigen',
 'sech',
 'hertha',
 'bsc',
 'nicht',
 'sicher',
 'mocht',
 'erklart',
 'budapest',
 'spiel',
 'bayer',
 'donnerstag',
 'hinspiel',
 'europa',
 'leagu',
 'sah',
 'defens',
 'mittelfeldspiel',
 'erfolg',
 'dritt',
 'gelb',
 'kart',
 'gesperrt',
 'werkself',
 'ersten',
 'phase',
 'saison',
 'uberzeugt',
 'fussbal',
 'spielt',
 'ware',
 'wahrscheinlich',
 'daher',
 'halbfinal',
 'sorgen',
 'jahrig',
 'mitchel',
 'bakker',
 'abschluss',
 'train',
 'jedoch',
 'startelf',
 'steht',
 'stehen',
 'bereit',
 'jeremi',
 'frimpong',
 'wegen',
 'problem',
 'minuten',
 'platz'

In [110]:
kmeans = KMeans(n_clusters = 4, max_iter=1000, random_state=42)
kmeans.fit(data_tfidf)



In [111]:
common_words = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]

In [112]:
for num, centroid in enumerate(common_words):
    words_in_centroid = [words[word] for word in centroid if word < len(words)]
    print(f"{num}: {', '.join(words_in_centroid)}")
# By adding the if word < len(words) condition, we ensure that only valid indices are used to access the words list or dictionary.

0: leagu, vollig, viert, leistungen, folg, wissen, sagt, heut, nahm, arbeit
1: wohl, arbeit, paus, vollig, leagu, christoph, belgien, vorlag, artikel, weiteren
2: leagu, lief, vorgang, sagt, spiel, ware, folg, kassiert, setzt, ruckspiel
3: vollig, uhr, arbeit, da, piero, sagt, wohl, leagu, zweimal, macht


In [113]:
df_stem['cluster'] = kmeans.labels_
df_stem

Unnamed: 0,data,player,language,publishedAt,cluster
0,trainer xabi alonso mitgereisten fan monaco gl...,exequiel palacios,de,2023-02-24T09:33:31Z,1
1,sechser droht viereinhalb monaten paus bitter ...,exequiel palacios,de,2023-03-03T21:35:13Z,1
2,leverkusen sechser erklart budapest spiel jung...,exequiel palacios,de,2023-03-07T11:34:39Z,1
3,mitchel bakker mittwoch abschluss train leverk...,exequiel palacios,de,2023-03-08T14:25:18Z,1
4,warum sehe faz net nicht allerd robert andrich...,exequiel palacios,de,2023-03-09T19:53:46Z,3
...,...,...,...,...,...
80,bayer leverkusen wehrt europa leagu lang gut s...,piero hincapie,de,2023-05-12T10:02:12Z,0
81,xabi alonso taktik duell alten lehrmeist jose ...,piero hincapie,de,2023-05-11T21:17:00Z,0
82,trotz pleit halbfin hinspiel rom gibt leverkus...,piero hincapie,de,2023-05-11T21:09:16Z,0
83,hitzigen aufeinandertreffen stadio olimpico na...,piero hincapie,de,2023-05-11T20:58:58Z,0


In [114]:
clusters = df_stem.groupby(['cluster']).size()
clusters

cluster
0    12891
1    19479
2     4354
3    23901
dtype: int64

# LDA Model

In [115]:
# Split the texts into a list of words
texts = [text.split() for text in df_stem['data']]

# Create the Gensim dictionary
dictionary = corpora.Dictionary(texts)

# Remove rare and common words from the dictionary
dictionary.filter_extremes(no_below=118, no_above=0.95)

# Derive the absolute frequency matrix using doc2bow
corpus = [dictionary.doc2bow(text) for text in texts]


In [116]:
lda = models.LdaModel(corpus, num_topics=4, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)


In [117]:
lda.show_topics()

[(0,
  '0.024*"leverkusen" + 0.017*"bayer" + 0.013*"uber" + 0.013*"demirbai" + 0.012*"fur" + 0.011*"hincapi" + 0.010*"tor" + 0.010*"piero" + 0.009*"nicht" + 0.009*"ball"'),
 (1,
  '0.027*"bayer" + 0.026*"leverkusen" + 0.025*"leagu" + 0.021*"europa" + 0.021*"union" + 0.018*"wirtz" + 0.016*"nicht" + 0.014*"halbfinal" + 0.014*"saint" + 0.014*"hinspiel"'),
 (2,
  '0.032*"fur" + 0.025*"leverkusen" + 0.023*"nicht" + 0.019*"hincapi" + 0.016*"piero" + 0.015*"bayer" + 0.014*"xabi" + 0.011*"trainer" + 0.010*"alonso" + 0.010*"monaco"'),
 (3,
  '0.025*"hincapi" + 0.020*"fur" + 0.020*"alonso" + 0.017*"piero" + 0.016*"spiel" + 0.010*"leverkusen" + 0.010*"mourinho" + 0.009*"bayer" + 0.007*"xabi" + 0.007*"immer"')]

In [118]:
topics=lda[corpus]
df_stem['lda_predicted_clusters']= [max(topics[i],key=lambda item:item[1])[0] for i in range(len(topics))]

In [119]:
df_stem.head()

Unnamed: 0,data,player,language,publishedAt,cluster,lda_predicted_clusters
0,trainer xabi alonso mitgereisten fan monaco gl...,exequiel palacios,de,2023-02-24T09:33:31Z,1,2
1,sechser droht viereinhalb monaten paus bitter ...,exequiel palacios,de,2023-03-03T21:35:13Z,1,2
2,leverkusen sechser erklart budapest spiel jung...,exequiel palacios,de,2023-03-07T11:34:39Z,1,3
3,mitchel bakker mittwoch abschluss train leverk...,exequiel palacios,de,2023-03-08T14:25:18Z,1,3
4,warum sehe faz net nicht allerd robert andrich...,exequiel palacios,de,2023-03-09T19:53:46Z,3,0


In [120]:
pyLDAvis.enable_notebook()
LDAvis_prepared = prepare(lda, corpus, dictionary)
LDAvis_prepared

## The names of the players take a huge influence on the prediciton therefore we remove them in the next step

In [121]:
df_stem['data']

0     trainer xabi alonso mitgereisten fan monaco gl...
1     sechser droht viereinhalb monaten paus bitter ...
2     leverkusen sechser erklart budapest spiel jung...
3     mitchel bakker mittwoch abschluss train leverk...
4     warum sehe faz net nicht allerd robert andrich...
                            ...                        
80    bayer leverkusen wehrt europa leagu lang gut s...
81    xabi alonso taktik duell alten lehrmeist jose ...
82    trotz pleit halbfin hinspiel rom gibt leverkus...
83    hitzigen aufeinandertreffen stadio olimpico na...
84    hitzigen aufeinandertreffen stadio olimpico na...
Name: data, Length: 60625, dtype: object

In [123]:
# Function to remove specific words from the string
def remove_words(text):
    pattern = r"\b(mitchel|bakker|exequiel|palacios|piero|hincapie|jeremie|frimpong|jonathan|tah|moussa|diaby|mykhaylo|mudryk)\b"
    return re.sub(pattern, "", text)

# Apply the function to the data column
df_stem['data'] = df_stem['data'].apply(lambda x: remove_words(str(x)))

df_stem


Unnamed: 0,data,player,language,publishedAt,cluster,lda_predicted_clusters
0,trainer xabi alonso mitgereisten fan monaco gl...,exequiel palacios,de,2023-02-24T09:33:31Z,1,2
1,sechser droht viereinhalb monaten paus bitter ...,exequiel palacios,de,2023-03-03T21:35:13Z,1,2
2,leverkusen sechser erklart budapest spiel jung...,exequiel palacios,de,2023-03-07T11:34:39Z,1,3
3,mittwoch abschluss train leverkusen nicht au...,exequiel palacios,de,2023-03-08T14:25:18Z,1,3
4,warum sehe faz net nicht allerd robert andrich...,exequiel palacios,de,2023-03-09T19:53:46Z,3,0
...,...,...,...,...,...,...
80,bayer leverkusen wehrt europa leagu lang gut s...,piero hincapie,de,2023-05-12T10:02:12Z,0,3
81,xabi alonso taktik duell alten lehrmeist jose ...,piero hincapie,de,2023-05-11T21:17:00Z,0,3
82,trotz pleit halbfin hinspiel rom gibt leverkus...,piero hincapie,de,2023-05-11T21:09:16Z,0,3
83,hitzigen aufeinandertreffen stadio olimpico na...,piero hincapie,de,2023-05-11T20:58:58Z,0,3


In [125]:
lda_2 = models.LdaModel(corpus, num_topics=10, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)


## Finding the best number of topics

In [146]:
models_arr = {'iteration_' + str(iter): '' for iter in range(2, 3)}
for iter in range(2, 15): 
    models_arr.update({'iteration_'+ str(iter): models.LdaModel(corpus, num_topics=iter, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)})


In [149]:
models_arr['iteration_5'].show_topics()

[(0,
  '0.044*"hincapi" + 0.023*"fur" + 0.020*"leverkusen" + 0.015*"beim" + 0.011*"weigl" + 0.011*"alonso" + 0.010*"adli" + 0.010*"gladbach" + 0.009*"spiel" + 0.009*"xabi"'),
 (1,
  '0.030*"bayer" + 0.029*"leagu" + 0.028*"europa" + 0.027*"wirtz" + 0.024*"diabi" + 0.023*"seit" + 0.022*"leverkusen" + 0.021*"tor" + 0.019*"union" + 0.016*"uber"'),
 (2,
  '0.035*"fur" + 0.024*"leverkusen" + 0.024*"nicht" + 0.016*"alonso" + 0.015*"bayer" + 0.014*"xabi" + 0.013*"trainer" + 0.011*"hincapi" + 0.011*"rolf" + 0.010*"innenverteidig"'),
 (3,
  '0.017*"uber" + 0.015*"bayer" + 0.014*"alonso" + 0.013*"leverkusen" + 0.013*"fur" + 0.011*"demirbai" + 0.010*"mourinho" + 0.009*"ausgleich" + 0.009*"kurz" + 0.008*"traf"'),
 (4,
  '0.035*"leverkusen" + 0.021*"nicht" + 0.019*"bayer" + 0.017*"fur" + 0.017*"leipzig" + 0.013*"trainer" + 0.011*"hincapi" + 0.010*"zwei" + 0.010*"bochum" + 0.009*"sorgt"')]

In [156]:
pyLDAvis.enable_notebook()
LDAvis_prepared = prepare(models_arr['iteration_5'], corpus, dictionary)
LDAvis_prepared

In [154]:
pyLDAvis.enable_notebook()
LDAvis_prepared = prepare(models_arr['iteration_7'], corpus, dictionary)
LDAvis_prepared