In [64]:
#%pip install pyLDAvis
#%pip install utils

In [65]:
from gensim.parsing.preprocessing import stem_text
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models 
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
#sys.path.append('../')

from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
#import pyLDAvis
from pyLDAvis.gensim_models import prepare
import re


#from utils import word_frequency_per_player, remove_similar_rows_per_player
def word_frequency_per_player(df, playerlist):
    #Function which returns the frequncy of words in articles for all players 
    
     # define empty df which will be returned in the end
    df_complete = pd.DataFrame()

    for player in playerlist:

         # create the df for the player
        df_player = df[df["player"] == player]
        df_player = df_player.dropna(subset=['data'])
        df_player = df_player.reset_index(drop=True)

        # create a stemmed data corpus
        df_player['stemmed_data'] = df_player['data'].apply(stem_text)
        data_stem = df_player['data'].apply(stem_text)
        data = data_stem.tolist()

        # create a corpus
        corpus_gen=[doc.split() for doc in data]

        # Assume `corpus` is a preprocessed corpus
        id2word = Dictionary(corpus_gen)

        # Filter out rare and common words
        id2word.filter_extremes(no_below=5, no_above=0.95)

        # Display features and their frequencies
        df_frequencies = pd.DataFrame(columns=['Word', 'Frequency', 'player'])
    
        i = 1
        for feature, frequency in id2word.cfs.items():

            # Append a new row to the DataFrame
            df_frequencies.loc[i]= [id2word[feature],frequency, player]
            i = i+1

        df_frequencies = df_frequencies.sort_values('Frequency', ascending=False)

        df_complete = pd.concat([df_complete, df_frequencies], axis=0)

    return df_complete
from difflib import SequenceMatcher
def remove_similar_rows_per_player(df, playerlist, threshold=0.9):
    # The procedure of deleting similiar articles needs to be done by each player because if an article writes about 
    # e.g. two players we want to keep it for both of the players

    # define empty df which will be returned in the end
    df_complete = pd.DataFrame()

    for player in playerlist:
        
        # create the df for the player
        df_player = df[df["player"] == player]
        df_player = df_player.reset_index(drop=True)
        column_as_df = pd.DataFrame(df_player['data'])


        
        # Compute similarity scores for each pair of rows
        similarity_scores = {}
        for i, row in column_as_df.iterrows():
            for j, other_row in column_as_df.iterrows():
                if i >= j:
                    continue
                score = SequenceMatcher(None, row, other_row).ratio()
                if score >= threshold:
                    similarity_scores[(i, j)] = score
        
        # Identify rows to remove
        rows_to_remove = []
        for (i, j), score in similarity_scores.items():
            if i not in rows_to_remove and j not in rows_to_remove:
                rows_to_remove.append(j if df_player.index[i] < df_player.index[j] else i)
        
        # Remove rows and concatenate df
        df_player = df_player.drop(rows_to_remove)
        df_complete = pd.concat([df_complete, df_player], axis=0)

        #return modified DataFrame
    return df_complete


# Load data and convert to stemmed and BERT data 

In [66]:
# load data and drop NA's
df = pd.read_csv('https://github.com/svisel22/SS23-BIPM-Analytics-Lab---Group-4-repository/raw/main/Preprocessing/data_clean/en_clean_2_1.csv')
df = df.dropna(subset=['data'])
df = remove_similar_rows_per_player(df, df['player'])

# Stemming

In [67]:
# create two copies 
df_stem = df.copy()

# create stemmed data
df_stem['data'] = df_stem['data'].apply(stem_text)

df_stem

Unnamed: 0,data,player,language,publishedAt
0,footbal florian wirtz goal bayer leverkusen eu...,exequiel palacios,en,2023-02-16T23:56:00Z
1,xasocc footbal europa leagu plai second leg mo...,exequiel palacios,en,2023-02-23T20:50:50Z
2,pickworth mailonlin view comment bayer leverku...,exequiel palacios,en,2023-02-23T20:53:59Z
3,bueno air world cup winner argentina celebr fa...,exequiel palacios,en,2023-03-03T16:40:46Z
4,sign sign star edit chang locat copi person no...,exequiel palacios,en,2023-03-03T16:42:19Z
...,...,...,...,...
3,jose mourinho roma reach europa leagu final ho...,piero hincapié,en,2023-05-18T21:09:58Z
0,"futur chelsea rumor beaten ""up dozen"" team win...",piero hincapié,en,2023-04-27T04:57:02Z
1,man city' alex robertson make debut aiden o'ne...,piero hincapié,en,2023-03-24T15:24:08Z
2,exequiel palacio score penalti bayer leverkuse...,piero hincapié,en,2023-03-19T20:03:28Z


## Because the playernames took a huge influence on the clustering they will be removed for each player

In [68]:
#checking
#df_stem['data'].iloc[1]

In [69]:
# Function to remove specific words from the string
def remove_words(text):
    pattern = r"\b(mitchel|bakker|exequiel|palacios|piero|hincapie|jeremie|frimpong|jonathan|tah|moussa|diaby|mykhaylo|mudryk)\b"
    return re.sub(pattern, "", text)

# Apply the function to the data column
df_stem['data'] = df_stem['data'].apply(lambda x: remove_words(str(x)))

df_stem

# for every player remove their names from the texts 
for player in df_stem['player'].unique():
    f_l_name = player.split()

    # Extracting the first name
    first_name = str(f_l_name[0])

    # Extracting the last name
    last_name = str(f_l_name[1])

    updated_pattern = r"\b(" + first_name.lower() + r"|" + last_name.lower() + r")\b|"


    # Apply the function to the data column
    df_stem['data'] = df_stem['data'].apply(lambda x: remove_words(str(x)))
    df_stem.loc[df_stem['player'] == player, 'data'] = df_stem.loc[df_stem['player'] == player, 'data'].apply(lambda x: re.sub(updated_pattern, "", str(x)))


In [70]:
#checking
#df_stem['data'].iloc[1]

# Vectorization

In [71]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=0.02)
df_tfidf = tfidf.fit_transform(df_stem.data).toarray()

In [72]:
df_tfidf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.04024208, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [73]:
#Store the frequency matrix in data_tfidf. 
data_tfidf = pd.DataFrame(df_tfidf, columns=tfidf.vocabulary_.keys())

#data_tfidf['target_names']=sorted_df_ng.target_names.values #we can't use that because otherwise knn wouldn't work
data_tfidf.head()

Unnamed: 0,footbal,florian,wirtz,goal,bayer,leverkusen,europa,leagu,repres,step,...,patino,art,benoit,badiashil,moura,bite,overdr,xarasmu,noni,maduek
0,0.0,0.0,0.0,0.08531,0.0,0.0,0.0,0.053909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.040893,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
#Store the feature names in a words list.
words = data_tfidf.columns.tolist()[:-2] 
words

['footbal',
 'florian',
 'wirtz',
 'goal',
 'bayer',
 'leverkusen',
 'europa',
 'leagu',
 'repres',
 'step',
 'road',
 'recoveri',
 'german',
 'teenag',
 'adam',
 'bayarena',
 'discuss',
 'player',
 'potenti',
 'sport',
 'director',
 'simon',
 'rolf',
 'comment',
 'analysi',
 'fridai',
 'februari',
 'moment',
 'wait',
 'came',
 'special',
 'long',
 'term',
 'knee',
 'injuri',
 'game',
 'monaco',
 'touch',
 'ball',
 'time',
 'score',
 'includ',
 'vollei',
 'team',
 'mate',
 'palacio',
 'build',
 'never',
 'move',
 'boot',
 'finish',
 'show',
 'quick',
 'shift',
 'foot',
 'seen',
 'minut',
 'earlier',
 'previou',
 'diabi',
 'equalis',
 'calm',
 'abil',
 'break',
 'best',
 'possess',
 'felt',
 'winner',
 'late',
 'win',
 'leg',
 'plai',
 'round',
 'blow',
 'support',
 'latest',
 'journei',
 'bigger',
 'stori',
 'speak',
 'offic',
 'stadium',
 'english',
 'know',
 'wonder',
 'talent',
 'tell',
 'sky',
 'sens',
 'right',
 'great',
 'hope',
 'excit',
 'return',
 'fit',
 'extend',
 'far',
 'c

In [75]:
kmeans = KMeans(n_clusters = 4, max_iter=1000, random_state=42)
kmeans.fit(data_tfidf)



In [76]:
common_words = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]

In [77]:
for num, centroid in enumerate(common_words):
    words_in_centroid = [words[word] for word in centroid if word < len(words)]
    print(f"{num}: {', '.join(words_in_centroid)}")
# By adding the if word < len(words) condition, we ensure that only valid indices are used to access the words list or dictionary.

0: creator, pereira, passion, milner, moises, theori, kid, arteta, simakan, palkin
1: squeez, sane, west, nile, adli, munich, hoffenheim, boulder, abdoulay, coast
2: hoffenheim, gameswednesdai, copi, resili, marco, massiv, franca, februaryman, consol, copa
3: canada, youtub, jami, occupi, gunners, massiv, dealsreport, lost, doubt, matchweek


In [78]:
df_stem['cluster'] = kmeans.labels_
df_stem

Unnamed: 0,data,player,language,publishedAt,cluster
0,footbal florian wirtz goal bayer leverkusen eu...,exequiel palacios,en,2023-02-16T23:56:00Z,3
1,xasocc footbal europa leagu plai second leg mo...,exequiel palacios,en,2023-02-23T20:50:50Z,3
2,pickworth mailonlin view comment bayer leverku...,exequiel palacios,en,2023-02-23T20:53:59Z,3
3,bueno air world cup winner argentina celebr fa...,exequiel palacios,en,2023-03-03T16:40:46Z,2
4,sign sign star edit chang locat copi person no...,exequiel palacios,en,2023-03-03T16:42:19Z,3
...,...,...,...,...,...
3,jose mourinho roma reach europa leagu final ho...,piero hincapié,en,2023-05-18T21:09:58Z,3
0,"futur chelsea rumor beaten ""up dozen"" team win...",piero hincapié,en,2023-04-27T04:57:02Z,2
1,man city' alex robertson make debut aiden o'ne...,piero hincapié,en,2023-03-24T15:24:08Z,2
2,palacio score penalti bayer leverkusen beat b...,piero hincapié,en,2023-03-19T20:03:28Z,3


In [79]:
clusters = df_stem.groupby(['kmeans_cluster']).size()
clusters

cluster
0     1360
1     8409
2    16702
3    11893
dtype: int64

# LDA Model

In [80]:
# Split the texts into a list of words
texts = [text.split() for text in df_stem['data']]

# Create the Gensim dictionary
dictionary = corpora.Dictionary(texts)

# Remove rare and common words from the dictionary
dictionary.filter_extremes(no_below=118, no_above=0.95)

# Derive the absolute frequency matrix using doc2bow
corpus = [dictionary.doc2bow(text) for text in texts]


In [81]:
lda = models.LdaModel(corpus, num_topics=4, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)


In [82]:
lda.show_topics()

[(0,
  '0.073*"recap" + 0.066*"replai" + 0.065*"match" + 0.063*"highlight" + 0.036*"usa" + 0.036*"network" + 0.023*"xa-" + 0.019*"player" + 0.018*"man" + 0.018*"rate"'),
 (1,
  '0.012*"club" + 0.011*"season" + 0.010*"chelsea" + 0.008*"game" + 0.007*"player" + 0.007*"time" + 0.007*"leagu" + 0.007*"goal" + 0.007*"team" + 0.006*"year"'),
 (2,
  '0.020*"arsen" + 0.016*"chelsea" + 0.011*"sign" + 0.010*"report" + 0.009*"year" + 0.009*"leagu" + 0.008*"summer" + 0.008*"million" + 0.008*"new" + 0.008*"transfer"'),
 (3,
  '0.032*"leagu" + 0.020*"unit" + 0.015*"leverkusen" + 0.015*"final" + 0.014*"inter" + 0.014*"tottenham" + 0.013*"season" + 0.013*"manchest" + 0.013*"win" + 0.012*"milan"')]

In [83]:
topics=lda[corpus]
df_stem['lda_predicted_clusters']= [max(topics[i],key=lambda item:item[1])[0] for i in range(len(topics))]

In [84]:
df_stem.head()

Unnamed: 0,data,player,language,publishedAt,cluster,lda_predicted_clusters
0,footbal florian wirtz goal bayer leverkusen eu...,exequiel palacios,en,2023-02-16T23:56:00Z,3,1
1,xasocc footbal europa leagu plai second leg mo...,exequiel palacios,en,2023-02-23T20:50:50Z,3,1
2,pickworth mailonlin view comment bayer leverku...,exequiel palacios,en,2023-02-23T20:53:59Z,3,3
3,bueno air world cup winner argentina celebr fa...,exequiel palacios,en,2023-03-03T16:40:46Z,2,2
4,sign sign star edit chang locat copi person no...,exequiel palacios,en,2023-03-03T16:42:19Z,3,1


In [85]:
pyLDAvis.enable_notebook()
LDAvis_prepared = prepare(lda, corpus, dictionary)
LDAvis_prepared

In [88]:
lda_5 = models.LdaModel(corpus, num_topics=5, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)

In [89]:
lda_5.show_topics()

[(0,
  '0.021*"chelsea" + 0.016*"sign" + 0.013*"report" + 0.012*"summer" + 0.011*"year" + 0.011*"million" + 0.010*"transfer" + 0.010*"new" + 0.010*"club" + 0.010*"deal"'),
 (1,
  '0.056*"recap" + 0.055*"match" + 0.050*"replai" + 0.048*"highlight" + 0.030*"usa" + 0.027*"network" + 0.018*"player" + 0.018*"xa-" + 0.017*"premier" + 0.016*"arsen"'),
 (2,
  '0.051*"arsen" + 0.015*"goal" + 0.014*"ball" + 0.012*"time" + 0.012*"improv" + 0.011*"game" + 0.010*"work" + 0.010*"minut" + 0.010*"com" + 0.009*"win"'),
 (3,
  '0.041*"shakhtar" + 0.025*"twitter" + 0.024*"winner" + 0.023*"seri" + 0.022*"premier" + 0.021*"rice" + 0.019*"sport" + 0.018*"midfield" + 0.018*"latest" + 0.017*"thursdai"'),
 (4,
  '0.039*"leagu" + 0.028*"season" + 0.024*"link" + 0.018*"attack" + 0.018*"year" + 0.015*"chelsea" + 0.012*"eurm" + 0.012*"winger" + 0.012*"sai" + 0.012*"champion"')]

In [90]:
LDAvis_prepared_5 = prepare(lda_5, corpus, dictionary)
LDAvis_prepared_5

In [91]:
lda_10 = models.LdaModel(corpus, num_topics=10, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)

In [92]:
lda_10.show_topics()

[(0,
  '0.073*"recap" + 0.072*"match" + 0.063*"highlight" + 0.039*"usa" + 0.035*"arsen" + 0.023*"premier" + 0.023*"xa-" + 0.021*"man" + 0.021*"player" + 0.018*"watch"'),
 (1,
  '0.639*"network" + 0.024*"ben" + 0.019*"ham" + 0.019*"list" + 0.017*"access" + 0.016*"extra" + 0.015*"west" + 0.014*"standard" + 0.014*"score" + 0.012*"william"'),
 (2,
  '0.235*"replai" + 0.046*"season" + 0.032*"deal" + 0.032*"video" + 0.030*"like" + 0.026*"link" + 0.025*"player" + 0.017*"new" + 0.015*"talk" + 0.015*"pic"'),
 (3,
  '0.030*"chelsea" + 0.018*"year" + 0.015*"summer" + 0.014*"transfer" + 0.011*"midfield" + 0.011*"new" + 0.011*"old" + 0.010*"juli" + 0.010*"januari" + 0.009*"jpw"'),
 (4,
  '0.062*"sign" + 0.045*"million" + 0.026*"unit" + 0.025*"manchest" + 0.023*"contract" + 0.018*"star" + 0.014*"com" + 0.013*"open" + 0.011*"tottenham" + 0.011*"right"'),
 (5,
  '0.081*"want" + 0.071*"leagu" + 0.062*"season" + 0.039*"champion" + 0.028*"milan" + 0.028*"year" + 0.024*"inter" + 0.022*"madrid" + 0.021*"tu

In [93]:
LDAvis_prepared_10 = prepare(lda_10, corpus, dictionary)
LDAvis_prepared_10

## again Kmeans 

In [94]:
kmeans_5 = KMeans(n_clusters = 5, max_iter=1000, random_state=42)
kmeans_5.fit(data_tfidf)
common_words_5 = kmeans_5.cluster_centers_.argsort()[:,-1:-11:-1]



In [95]:
for num, centroid in enumerate(common_words_5):
    words_in_centroid = [words[word] for word in centroid if word < len(words)]
    print(f"{num}: {', '.join(words_in_centroid)}")
# By adding the if word < len(words) condition, we ensure that only valid indices are used to access the words list or dictionary.

0: hoffenheim, gameswednesdai, massiv, true, copi, consol, marco, boost, allow, franca
1: squeez, sane, west, nile, adli, munich, hoffenheim, boulder, abdoulay, coast
2: canada, jami, gunners, youtub, massiv, occupi, dealsreport, doubt, lost, matchweek
3: copa, worker, semi, resili, weslei, advantag, repli, franca, copi, februaryman
4: creator, pereira, passion, milner, moises, theori, kid, arteta, simakan, palkin


In [96]:
kmeans_10 = KMeans(n_clusters = 10, max_iter=1000, random_state=42)
kmeans_10.fit(data_tfidf)
common_words_10 = kmeans_10.cluster_centers_.argsort()[:,-1:-11:-1]



In [97]:
for num, centroid in enumerate(common_words_10):
    words_in_centroid = [words[word] for word in centroid if word < len(words)]
    print(f"{num}: {', '.join(words_in_centroid)}")
# By adding the if word < len(words) condition, we ensure that only valid indices are used to access the words list or dictionary.

0: sane, boulder, nile, west, coast, hoffenheim, abdoulay, worker, gunner, matchdai
1: judg, hoffenheim, justarsen, usual, moises, theori, marco, massiv, taylor, nor
2: canada, lost, dealsreport, araujo, english, gunners, ajax, occupi, massiv, strategi
3: copa, semi, weslei, franca, worker, die, repli, resili, mccambridg, februaryman
4: hoffenheim, gameswednesdai, copi, marco, consol, resili, massiv, franca, februaryman, allow
5: creator, pereira, passion, milner, moises, theori, kid, arteta, simakan, palkin
6: squeez, adli, munich, burst, ccoffici, west, hoffenheim, nile, abdoulay, matchdai
7: true, boost, tackl, secret, secur, compromis, bernabeu, youtub, experienc, terribl
8: jami, youtub, novemberbrighton, doubt, line, captain, massiv, jordan, occupi, gunners
9: reserv, leonardo, enquir, fade, worker, resili, tournament, canada, gameswednesdai, scrap


# QUESTION: Make without emojis?

# QUESTION: Wordpairs?