In [61]:
#%pip install pyLDAvis
#%pip install utils

In [62]:
from gensim.parsing.preprocessing import stem_text
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models 
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
#sys.path.append('../')

from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
#import pyLDAvis
from pyLDAvis.gensim_models import prepare
import re


#from utils import word_frequency_per_player, remove_similar_rows_per_player
def word_frequency_per_player(df, playerlist):
    #Function which returns the frequncy of words in articles for all players 
    
     # define empty df which will be returned in the end
    df_complete = pd.DataFrame()

    for player in playerlist:

         # create the df for the player
        df_player = df[df["player"] == player]
        df_player = df_player.dropna(subset=['data'])
        df_player = df_player.reset_index(drop=True)

        # create a stemmed data corpus
        df_player['stemmed_data'] = df_player['data'].apply(stem_text)
        data_stem = df_player['data'].apply(stem_text)
        data = data_stem.tolist()

        # create a corpus
        corpus_gen=[doc.split() for doc in data]

        # Assume `corpus` is a preprocessed corpus
        id2word = Dictionary(corpus_gen)

        # Filter out rare and common words
        id2word.filter_extremes(no_below=5, no_above=0.95)

        # Display features and their frequencies
        df_frequencies = pd.DataFrame(columns=['Word', 'Frequency', 'player'])
    
        i = 1
        for feature, frequency in id2word.cfs.items():

            # Append a new row to the DataFrame
            df_frequencies.loc[i]= [id2word[feature],frequency, player]
            i = i+1

        df_frequencies = df_frequencies.sort_values('Frequency', ascending=False)

        df_complete = pd.concat([df_complete, df_frequencies], axis=0)

    return df_complete
from difflib import SequenceMatcher
def remove_similar_rows_per_player(df, playerlist, threshold=0.9):
    # The procedure of deleting similiar articles needs to be done by each player because if an article writes about 
    # e.g. two players we want to keep it for both of the players

    # define empty df which will be returned in the end
    df_complete = pd.DataFrame()

    for player in playerlist:
        
        # create the df for the player
        df_player = df[df["player"] == player]
        df_player = df_player.reset_index(drop=True)
        column_as_df = pd.DataFrame(df_player['data'])


        
        # Compute similarity scores for each pair of rows
        similarity_scores = {}
        for i, row in column_as_df.iterrows():
            for j, other_row in column_as_df.iterrows():
                if i >= j:
                    continue
                score = SequenceMatcher(None, row, other_row).ratio()
                if score >= threshold:
                    similarity_scores[(i, j)] = score
        
        # Identify rows to remove
        rows_to_remove = []
        for (i, j), score in similarity_scores.items():
            if i not in rows_to_remove and j not in rows_to_remove:
                rows_to_remove.append(j if df_player.index[i] < df_player.index[j] else i)
        
        # Remove rows and concatenate df
        df_player = df_player.drop(rows_to_remove)
        df_complete = pd.concat([df_complete, df_player], axis=0)

        #return modified DataFrame
    return df_complete


# Load data and convert to stemmed and BERT data 

In [63]:
# load data and drop NA's
df = pd.read_csv('https://github.com/svisel22/SS23-BIPM-Analytics-Lab---Group-4-repository/raw/main/Preprocessing/data_clean/en_clean_2_1.csv')
df = df.dropna(subset=['data'])
df = remove_similar_rows_per_player(df, df['player'])

HTTPError: HTTP Error 404: Not Found

# Stemming

In [None]:
# create two copies 
df_stem = df.copy()

# create stemmed data
df_stem['data'] = df_stem['data'].apply(stem_text)

df_stem

Unnamed: 0,data,player,language,publishedAt
0,footballflorian wirtz goal bayer leverkusen eu...,exequiel palacios,en,2023-02-16T23:56:00Z
1,xasocc footbal europa leagu plai second leg mo...,exequiel palacios,en,2023-02-23T20:50:50Z
2,pickworth mailonlineview commentsbay leverkuse...,exequiel palacios,en,2023-02-23T20:53:59Z
3,bueno air world cup winner argentina celebr fa...,exequiel palacios,en,2023-03-03T16:40:46Z
4,sign insign inth star editionchang locationthi...,exequiel palacios,en,2023-03-03T16:42:19Z
...,...,...,...,...
1,man city' alex robertson make debut aiden o'ne...,piero hincapié,en,2023-03-24T15:24:08Z
2,exequiel palacio score penalti bayer leverkuse...,piero hincapié,en,2023-03-19T20:03:28Z
0,"futurechelsea rumor beaten ""up dozen"" team win...",piero hincapié,en,2023-04-27T04:57:02Z
1,man city' alex robertson make debut aiden o'ne...,piero hincapié,en,2023-03-24T15:24:08Z


## Because the playernames took a huge influence on the clustering they will be removed for each player

In [None]:
#checking
#df_stem['data'].iloc[1]

In [None]:
# Function to remove specific words from the string
def remove_words(text):
    pattern = r"\b(mitchel|bakker|exequiel|palacios|piero|hincapie|jeremie|frimpong|jonathan|tah|moussa|diaby|mykhaylo|mudryk)\b"
    return re.sub(pattern, "", text)

# Apply the function to the data column
df_stem['data'] = df_stem['data'].apply(lambda x: remove_words(str(x)))

df_stem

# for every player remove their names from the texts 
for player in df_stem['player'].unique():
    f_l_name = player.split()

    # Extracting the first name
    first_name = str(f_l_name[0])

    # Extracting the last name
    last_name = str(f_l_name[1])

    updated_pattern = r"\b(" + first_name.lower() + r"|" + last_name.lower() + r")\b|"


    # Apply the function to the data column
    df_stem['data'] = df_stem['data'].apply(lambda x: remove_words(str(x)))
    df_stem.loc[df_stem['player'] == player, 'data'] = df_stem.loc[df_stem['player'] == player, 'data'].apply(lambda x: re.sub(updated_pattern, "", str(x)))


In [None]:
#checking
#df_stem['data'].iloc[1]

# Vectorization

In [None]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=0.02)
df_tfidf = tfidf.fit_transform(df_stem.data).toarray()

In [None]:
df_tfidf

array([[0.        , 0.        , 0.08681054, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.04523016, 0.        , 0.09243338, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [None]:
#Store the frequency matrix in data_tfidf. 
data_tfidf = pd.DataFrame(df_tfidf, columns=tfidf.vocabulary_.keys())

#data_tfidf['target_names']=sorted_df_ng.target_names.values #we can't use that because otherwise knn wouldn't work
data_tfidf.head()

Unnamed: 0,wirtz,goal,bayer,leverkusen,europa,leagu,repres,step,road,recoveri,...,xaraheem,sterl,xarasmu,holjund,xaivan,raheem,badiashil,noni,maduek,benoit
0,0.0,0.0,0.086811,0.0,0.0,0.0,0.0,0.057622,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.043401,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Store the feature names in a words list.
words = data_tfidf.columns.tolist()[:-2] 
words

['wirtz',
 'goal',
 'bayer',
 'leverkusen',
 'europa',
 'leagu',
 'repres',
 'step',
 'road',
 'recoveri',
 'german',
 'teenag',
 'adam',
 'bayarena',
 'discuss',
 'player',
 'potenti',
 'sport',
 'director',
 'simon',
 'analysi',
 'februari',
 'moment',
 'florian',
 'wait',
 'came',
 'special',
 'long',
 'term',
 'knee',
 'injuri',
 'game',
 'monaco',
 'touch',
 'ball',
 'time',
 'score',
 'includ',
 'vollei',
 'team',
 'mate',
 'palacio',
 'build',
 'never',
 'move',
 'metr',
 'boot',
 'finish',
 'show',
 'quick',
 'shift',
 'foot',
 'seen',
 'minut',
 'earlier',
 'previou',
 'diabi',
 'equalis',
 'calm',
 'abil',
 'break',
 'stride',
 'best',
 'possess',
 'felt',
 'winner',
 'late',
 'win',
 'leg',
 'plai',
 'round',
 'blow',
 'support',
 'latest',
 'journei',
 'bigger',
 'stori',
 'speak',
 'rolf',
 'offic',
 'stadium',
 'english',
 'know',
 'wonder',
 'talent',
 'tell',
 'sky',
 'sens',
 'right',
 'great',
 'hope',
 'footbal',
 'excit',
 'return',
 'fit',
 'extend',
 'far',
 'club

In [None]:
kmeans = KMeans(n_clusters = 4, max_iter=1000, random_state=42)
kmeans.fit(data_tfidf)



In [None]:
common_words = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]

In [None]:
for num, centroid in enumerate(common_words):
    words_in_centroid = [words[word] for word in centroid if word < len(words)]
    print(f"{num}: {', '.join(words_in_centroid)}")
# By adding the if word < len(words) condition, we ensure that only valid indices are used to access the words list or dictionary.

0: subscriptionjoin, appoint, meet, chilwel, demonstr, keeper, intrigu, hot, entertain, public
1: februaryarsen, pass, mood, plan, didn, backlin, publish, abl, gregor, sow
2: dealt, thiago, xale, xavitor, martial, decemberwest, young, lionel, feet, son
3: mundo, departur, imag, footballarsen, tempt, scoresheet, sardar, declar, individu, breath


In [None]:
df_stem['cluster'] = kmeans.labels_
df_stem

Unnamed: 0,data,player,language,publishedAt,cluster
0,footballflorian wirtz goal bayer leverkusen eu...,exequiel palacios,en,2023-02-16T23:56:00Z,0
1,xasocc footbal europa leagu plai second leg mo...,exequiel palacios,en,2023-02-23T20:50:50Z,0
2,pickworth mailonlineview commentsbay leverkuse...,exequiel palacios,en,2023-02-23T20:53:59Z,0
3,bueno air world cup winner argentina celebr fa...,exequiel palacios,en,2023-03-03T16:40:46Z,2
4,sign insign inth star editionchang locationthi...,exequiel palacios,en,2023-03-03T16:42:19Z,2
...,...,...,...,...,...
1,man city' alex robertson make debut aiden o'ne...,piero hincapié,en,2023-03-24T15:24:08Z,2
2,palacio score penalti bayer leverkusen beat b...,piero hincapié,en,2023-03-19T20:03:28Z,0
0,"futurechelsea rumor beaten ""up dozen"" team win...",piero hincapié,en,2023-04-27T04:57:02Z,2
1,man city' alex robertson make debut aiden o'ne...,piero hincapié,en,2023-03-24T15:24:08Z,2


In [None]:
clusters = df_stem.groupby(['cluster']).size()
clusters

cluster
0    5727
1    4744
2    7586
3     972
dtype: int64

# LDA Model

In [None]:
# Split the texts into a list of words
texts = [text.split() for text in df_stem['data']]

# Create the Gensim dictionary
dictionary = corpora.Dictionary(texts)

# Remove rare and common words from the dictionary
dictionary.filter_extremes(no_below=118, no_above=0.95)

# Derive the absolute frequency matrix using doc2bow
corpus = [dictionary.doc2bow(text) for text in texts]


In [None]:
lda = models.LdaModel(corpus, num_topics=4, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)


In [None]:
lda.show_topics()

[(0,
  '0.037*"extern" + 0.035*"porro" + 0.018*"footbal" + 0.018*"hincapi" + 0.018*"unit" + 0.015*"manchest" + 0.014*"sport" + 0.012*"manag" + 0.012*"time" + 0.011*"final"'),
 (1,
  '0.022*"goal" + 0.022*"ball" + 0.018*"game" + 0.017*"improv" + 0.016*"leagu" + 0.015*"strength" + 0.013*"club" + 0.012*"minut" + 0.012*"season" + 0.011*"pass"'),
 (2,
  '0.018*"chelsea" + 0.011*"sign" + 0.011*"year" + 0.009*"transfer" + 0.009*"club" + 0.008*"arsen" + 0.008*"season" + 0.008*"midfield" + 0.008*"new" + 0.008*"report"'),
 (3,
  '0.077*"recap" + 0.069*"replai" + 0.069*"match" + 0.038*"usa" + 0.038*"network" + 0.024*"xa-" + 0.020*"player" + 0.017*"watch" + 0.016*"leagu" + 0.015*"premier"')]

In [None]:
topics=lda[corpus]
df_stem['lda_predicted_clusters']= [max(topics[i],key=lambda item:item[1])[0] for i in range(len(topics))]

In [None]:
df_stem.head()

Unnamed: 0,data,player,language,publishedAt,cluster,lda_predicted_clusters
0,footballflorian wirtz goal bayer leverkusen eu...,exequiel palacios,en,2023-02-16T23:56:00Z,0,2
1,xasocc footbal europa leagu plai second leg mo...,exequiel palacios,en,2023-02-23T20:50:50Z,0,2
2,pickworth mailonlineview commentsbay leverkuse...,exequiel palacios,en,2023-02-23T20:53:59Z,0,1
3,bueno air world cup winner argentina celebr fa...,exequiel palacios,en,2023-03-03T16:40:46Z,2,2
4,sign insign inth star editionchang locationthi...,exequiel palacios,en,2023-03-03T16:42:19Z,2,2


In [None]:
pyLDAvis.enable_notebook()
LDAvis_prepared = prepare(lda, corpus, dictionary)
LDAvis_prepared

## The names of the players take a huge influence on the prediciton therefore we remove them in the next step

In [None]:
df_stem['data']

0    footballflorian wirtz goal bayer leverkusen eu...
1    xasocc footbal europa leagu plai second leg mo...
2    pickworth mailonlineview commentsbay leverkuse...
3    bueno air world cup winner argentina celebr fa...
4    sign insign inth star editionchang locationthi...
                           ...                        
1    man city' alex robertson make debut aiden o'ne...
2     palacio score penalti bayer leverkusen beat b...
0    futurechelsea rumor beaten "up dozen" team win...
1    man city' alex robertson make debut aiden o'ne...
2     palacio score penalti bayer leverkusen beat b...
Name: data, Length: 19029, dtype: object

In [None]:
df_stem['data'].iloc[1]

"xasocc footbal europa leagu plai second leg monaco bayer leverkusen stade loui monaco franc februari bayer leverkusen player celebr win penalti shoot reuter eric gaillardmonaco feb reuter bayer leverkusen beat monaco penalti win frantic second leg europa leagu playoff thursdai send game extra time team lock aggreg monaco midfield eliot matazo hit bar second round spot kick winger  diabi seal victori german sent leverkusen lost home leg week level tie minut midfield florian wirtz mistak monaco keeper alexand nuebel monaco striker wissam ben yedder equalis minut later spot host earn penalti edmond tapsoba foul monaco midfield eliess ben seghir leverkusen took lead minut midfield  palacio struck edg box follow corner amin adli hour mark fine header monaco level tie breel embolo headerneith team abl score extra time leverkusen held nerv shootout win leg knockout tie major european competit time lose leg home standard thomson reuter trust principl actor adjoa andoh read william shakespear 

In [None]:
lda_5 = models.LdaModel(corpus, num_topics=5, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)

In [None]:
lda_5.show_topics()

[(0,
  '0.037*"porro" + 0.019*"hincapi" + 0.018*"manag" + 0.017*"manchest" + 0.017*"unit" + 0.017*"team" + 0.016*"touch" + 0.016*"time" + 0.015*"final" + 0.013*"leagu"'),
 (1,
  '0.077*"recap" + 0.070*"replai" + 0.069*"match" + 0.039*"usa" + 0.038*"network" + 0.024*"xa-" + 0.020*"player" + 0.017*"watch" + 0.016*"leagu" + 0.015*"premier"'),
 (2,
  '0.020*"arsen" + 0.019*"report" + 0.013*"new" + 0.012*"want" + 0.012*"loan" + 0.012*"month" + 0.010*"unit" + 0.009*"left" + 0.009*"manchest" + 0.009*"open"'),
 (3,
  '0.026*"chelsea" + 0.016*"sign" + 0.016*"club" + 0.016*"year" + 0.014*"season" + 0.014*"transfer" + 0.011*"leagu" + 0.010*"summer" + 0.010*"player" + 0.009*"old"'),
 (4,
  '0.035*"ball" + 0.034*"goal" + 0.024*"game" + 0.023*"improv" + 0.020*"strength" + 0.019*"minut" + 0.017*"pass" + 0.014*"leagu" + 0.014*"shot" + 0.013*"half"')]

In [None]:
LDAvis_prepared_5 = prepare(lda_5, corpus, dictionary)
LDAvis_prepared_5

In [None]:
lda_10 = models.LdaModel(corpus, num_topics=10, id2word=dictionary, chunksize=10, iterations=100, passes=10, random_state=42)

In [None]:
lda_10.show_topics()

[(0,
  '0.067*"premier" + 0.035*"highlight" + 0.031*"season" + 0.019*"wolv" + 0.018*"report" + 0.017*"like" + 0.016*"old" + 0.015*"team" + 0.014*"unit" + 0.014*"game"'),
 (1,
  '0.166*"replai" + 0.059*"player" + 0.047*"leagu" + 0.042*"watch" + 0.034*"rate" + 0.031*"arsen" + 0.024*"palac" + 0.024*"liverpool" + 0.023*"live" + 0.021*"video"'),
 (2,
  '0.128*"premium" + 0.056*"porro" + 0.027*"minut" + 0.026*"score" + 0.024*"time" + 0.023*"subscript" + 0.023*"closer" + 0.023*"antonio" + 0.021*"mount" + 0.018*"pursuit"'),
 (3,
  '0.049*"chelsea" + 0.031*"sign" + 0.031*"club" + 0.031*"year" + 0.025*"transfer" + 0.019*"summer" + 0.018*"midfield" + 0.012*"attack" + 0.010*"star" + 0.010*"contract"'),
 (4,
  '0.056*"link" + 0.052*"ball" + 0.042*"time" + 0.036*"left" + 0.035*"goal" + 0.023*"box" + 0.022*"plu" + 0.021*"finish" + 0.020*"possess" + 0.019*"chelsea\'"'),
 (5,
  '0.000*"porro" + 0.000*"udines" + 0.000*"kendri" + 0.000*"independient" + 0.000*"doherti" + 0.000*"vall" + 0.000*"unsur" + 0.0

In [None]:
LDAvis_prepared_10 = prepare(lda_10, corpus, dictionary)
LDAvis_prepared_10

## now also Kmeans 

In [None]:
kmeans_5 = KMeans(n_clusters = 5, max_iter=1000, random_state=42)
kmeans_5.fit(data_tfidf)
common_words_5 = kmeans_5.cluster_centers_.argsort()[:,-1:-11:-1]



In [None]:
for num, centroid in enumerate(common_words_5):
    words_in_centroid = [words[word] for word in centroid if word < len(words)]
    print(f"{num}: {', '.join(words_in_centroid)}")
# By adding the if word < len(words) condition, we ensure that only valid indices are used to access the words list or dictionary.

0: februaryarsen, pass, mood, plan, didn, backlin, publish, abl, gregor, sow
1: rais, werder, sure, subscriptionjoin, altern, unbeaten, basic, llori, bolasi, aprille
2: dealt, thiago, xale, decemberwest, xavitor, martial, young, public, feet, son
3: mundo, departur, imag, footballarsen, tempt, scoresheet, sardar, declar, individu, breath
4: appoint, keeper, chilwel, subscriptionjoin, demonstr, meet, intrigu, tri, hot, public


In [None]:
kmeans_10 = KMeans(n_clusters = 10, max_iter=1000, random_state=42)
kmeans_10.fit(data_tfidf)
common_words_10 = kmeans_10.cluster_centers_.argsort()[:,-1:-11:-1]



In [None]:
for num, centroid in enumerate(common_words_10):
    words_in_centroid = [words[word] for word in centroid if word < len(words)]
    print(f"{num}: {', '.join(words_in_centroid)}")
# By adding the if word < len(words) condition, we ensure that only valid indices are used to access the words list or dictionary.

0: pass, plan, abl, februaryarsen, mood, backlin, publish, gregor, plu, peacockspur
1: rais, werder, sure, subscriptionjoin, unbeaten, basic, llori, bolasi, aprille, collin
2: mundo, departur, imag, footballarsen, tempt, scoresheet, sardar, declar, individu, breath
3: didn, sow, februaryarsen, backlin, mood, revel, publish, gregor, italophil, mykhailo
4: dealt, graham, xavitor, deadlin, revel, italian, decemberwest, sell, octoberfulham, son
5: martial, rice, diogo, peacockman, guess, highlightswest, comkei, thiago, analysisaston, basic
6: subscriptionjoin, malen, intrigu, quick, meet, chilwel, hot, xather, demonstr, total
7: young, respond, xale, feet, joao, thiago, highlightsbrentford, develop, replay, decemberwest
8: keeper, appoint, demonstr, tri, chilwel, meet, public, wonderkid, entertain, confer
9: lionel, decemberwest, thiago, revel, slice, xavitor, xale, gross, son, deadlin


# QUESTION: Make without emojis?

# QUESTION: Wordpairs?