In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_punctuation, strip_multiple_whitespaces, strip_short

# Load model

In [25]:
# Checkpoint card: https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment
# The identifier is kept in one place so tokenizer and classifier cannot drift apart.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Load data clean

In [26]:
# Read the labeled, cleaned corpora (one frame per language).
# The German and Spanish files are semicolon-separated; the English file is
# read with pandas' default separator.
df_de = pd.read_csv(
    '../Preprocessing/data_clean/labeled-data/labeled-de_clean_1-1.csv',
    sep=';',
)
df_en = pd.read_csv(
    '../Preprocessing/data_clean/labeled-data/labeled-en_clean_1-1_not101010.csv',
)
df_es = pd.read_csv(
    '../Preprocessing/data_clean/labeled-data/labeled-es_clean_1-1.csv',
    sep=';',
)


In [28]:
# Spanish: strip punctuation, blank out every literal 'xa', then collapse
# repeated whitespace.
# NOTE(review): the 'xa' replacement also hits 'xa' inside ordinary words;
# presumably it targets leftovers of a non-breaking-space escape - confirm
# against the raw data.
df_es['data'] = (
    df_es['data']
    .apply(strip_punctuation)
    .str.replace('xa', ' ')
    .apply(strip_multiple_whitespaces)
)

# German: strip punctuation and collapse repeated whitespace.
df_de['data'] = (
    df_de['data']
    .apply(strip_punctuation)
    .apply(strip_multiple_whitespaces)
)

# Sentiment Analysis

In [29]:
def sentiment_score(data):
    """Score the sentiment of a text with the loaded classifier.

    Parameters
    ----------
    data : object
        Text to classify. Any value that is not a ``str`` (for example a NaN
        from an empty CSV cell) is never sent to the model.

    Returns
    -------
    int
        ``argmax(logits) + 1`` for string input (index of the strongest class,
        shifted to start at 1), or ``0`` as a "no text" sentinel for
        non-string input.
    """
    if not isinstance(data, str):
        return 0

    # Truncate on tokens, not characters: a character cap applied by the
    # caller does not bound the token count, and inputs longer than the
    # model's 512 positions make the forward pass fail.
    tokens = tokenizer.encode(
        data,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    # Pure inference: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits)) + 1

In [30]:
# Score every row of the three language frames.
# Strings keep the 512-character cap; any other cell (e.g. NaN from an empty
# CSV field) is passed through unchanged, because slicing a float raises
# TypeError whereas sentiment_score already turns non-strings into 0.
df_de['sentiment'] = df_de['data'].apply(
    lambda x: sentiment_score(x[:512] if isinstance(x, str) else x)
)
df_en['sentiment'] = df_en['data'].apply(
    lambda x: sentiment_score(x[:512] if isinstance(x, str) else x)
)
df_es['sentiment'] = df_es['data'].apply(
    lambda x: sentiment_score(x[:512] if isinstance(x, str) else x)
)

In [31]:
# Mean sentiment score per player, computed separately for each language.
# A score of 0 is the "no text" sentinel returned by sentiment_score, not a
# rating, so those rows are excluded before averaging; otherwise they would
# drag a player's mean down.
mean_sentiment_de = df_de[df_de['sentiment'] != 0].groupby('player')['sentiment'].mean()
mean_sentiment_en = df_en[df_en['sentiment'] != 0].groupby('player')['sentiment'].mean()
mean_sentiment_es = df_es[df_es['sentiment'] != 0].groupby('player')['sentiment'].mean()

print('Sentiment score per player DE')
print(mean_sentiment_de)
print('Sentiment score per player EN')
print(mean_sentiment_en)
print('Sentiment score per player ES')
print(mean_sentiment_es)

Sentiment score per player DE
player
bakker      2.380952
diaby       2.928571
frimpong    2.915663
hincapie    2.745455
mudryk      2.555556
palacios    2.592105
tah         3.416667
Name: sentiment, dtype: float64
Sentiment score per player EN
player
bakker      2.222222
diaby       3.018868
frimpong    2.772727
hincapie    3.000000
mudryk      2.580645
palacios    2.217391
tah         3.142857
Name: sentiment, dtype: float64
Sentiment score per player ES
player
bakker      2.125000
diaby       2.233333
frimpong    2.772727
hincapie    2.973333
mudryk      2.000000
palacios    2.652778
tah         2.666667
Name: sentiment, dtype: float64


### Translate scores to positive, neutral and negative

In [32]:
# Collapse the numeric score into three classes: only the extremes (1 and 5)
# become negative/positive, everything in between is neutral.
sentiment_mapping = {
    1: 'negativ',
    2: 'neutral',
    3: 'neutral',
    4: 'neutral',
    5: 'positiv',
}

# Overwrite the numeric 'sentiment' column with its class label in each frame.
# Values without an entry in the mapping become NaN, so this cell must not be
# run a second time on already-mapped frames.
for scored_frame in (df_de, df_en, df_es):
    scored_frame['sentiment'] = scored_frame['sentiment'].map(sentiment_mapping)

### Drop unlabeled rows

In [34]:
# Keep only annotated rows: remove, in place, every row whose 'Label' is NaN
for labeled_frame in (df_de, df_en, df_es):
    labeled_frame.dropna(subset=['Label'], inplace=True)

# Show what is left of the German frame
print(df_de)

                                                  data    player language  \
14   zweimal verwandelte palacios es wurde ein uber...  palacios       de   
23   schalke reis nimmt den spieler ausdrucklich in...  palacios       de   
31   palacios r verletzte sich leicht im hinspiel g...  palacios       de   
41    auf jeden fall sei das ein ganz grosser tag s...  palacios       de   
58   beim abschlusstraining trug letzterer zumindes...  palacios       de   
63   bayerleverkusen argentinischer weltmeister pal...  palacios       de   
83    es gibt im moment nicht viel besseres in der ...  frimpong       de   
93   der fc bayern munchen interessiert sich nach i...  frimpong       de   
95   frimpong hat noch bis vertrag in bayerleverkus...  frimpong       de   
102  hincapie l und tah bedanken sich fur die fan u...  frimpong       de   
120  diaby wirtz adam hlozek und frimpong teilten s...  frimpong       de   
123   bayerleverkusen torschutze amiri rechts und f...  frimpong       de   

### Accuracy exact match positive, neutral and negative

In [35]:
# Percentage of Spanish rows whose predicted class equals the manual label
accuracy = df_es['sentiment'].eq(df_es['Label']).mean() * 100

# Report with two decimals
print("Spanish Accuracy: {:.2f}%".format(accuracy))

Spanish Accuracy: 43.33%


In [36]:
# Percentage of English rows whose predicted class equals the manual label
accuracy = df_en['sentiment'].eq(df_en['Label']).mean() * 100

# Report with two decimals
print("English Accuracy: {:.2f}%".format(accuracy))

English Accuracy: 50.00%


In [37]:
# Percentage of German rows whose predicted class equals the manual label
accuracy = df_de['sentiment'].eq(df_de['Label']).mean() * 100

# Report with two decimals
print("German Accuracy: {:.2f}%".format(accuracy))

German Accuracy: 56.67%


### Error analysis: count negative-positive misclassifications

In [38]:


# Opposite-polarity errors (Spanish):
#   first count  - labeled 'positiv' but predicted 'negativ'
#   second count - labeled 'negativ' but predicted 'positiv'
positive_negative_count = (
    df_es['Label'].eq('positiv') & df_es['sentiment'].eq('negativ')
).sum()
negative_positive_count = (
    df_es['Label'].eq('negativ') & df_es['sentiment'].eq('positiv')
).sum()

print('Spanish')
print("Positive and Negative combinations count:", positive_negative_count)
print("Negative and Positive combinations count:", negative_positive_count)


Spanish
Positive and Negative combinations count: 5
Negative and Positive combinations count: 0


In [39]:
# Opposite-polarity errors (English):
#   first count  - labeled 'positiv' but predicted 'negativ'
#   second count - labeled 'negativ' but predicted 'positiv'
positive_negative_count = (
    df_en['Label'].eq('positiv') & df_en['sentiment'].eq('negativ')
).sum()
negative_positive_count = (
    df_en['Label'].eq('negativ') & df_en['sentiment'].eq('positiv')
).sum()

print('English')
print("Positive and Negative combinations count:", positive_negative_count)
print("Negative and Positive combinations count:", negative_positive_count)

English
Positive and Negative combinations count: 5
Negative and Positive combinations count: 0


In [40]:
# Opposite-polarity errors (German):
#   first count  - labeled 'positiv' but predicted 'negativ'
#   second count - labeled 'negativ' but predicted 'positiv'
positive_negative_count = (
    df_de['Label'].eq('positiv') & df_de['sentiment'].eq('negativ')
).sum()
negative_positive_count = (
    df_de['Label'].eq('negativ') & df_de['sentiment'].eq('positiv')
).sum()

print('German')
print("Positive and Negative combinations count:", positive_negative_count)
print("Negative and Positive combinations count:", negative_positive_count)

German
Positive and Negative combinations count: 3
Negative and Positive combinations count: 1


# Accuracy for condensed data

In [41]:
# Read the condensed German corpus (semicolon-separated) into its own frame
df_de_1 = pd.read_csv(
    '../Preprocessing/data_clean/labeled-data/labeled-de_clean_condensed.csv',
    sep=';',
)

In [42]:
# Score the condensed German rows. Strings keep the 512-character cap; other
# cells (e.g. NaN) are passed through unchanged, because slicing a float
# raises TypeError whereas sentiment_score already maps non-strings to 0.
df_de_1['sentiment'] = df_de_1['data'].apply(
    lambda x: sentiment_score(x[:512] if isinstance(x, str) else x)
)

In [43]:
# Three-class scheme: 1 -> negative, 5 -> positive, 2-4 -> neutral
sentiment_mapping = {
    1: 'negativ',
    2: 'neutral',
    3: 'neutral',
    4: 'neutral',
    5: 'positiv',
}

# Swap the numeric score for its class label; values without an entry in the
# mapping become NaN
condensed_labels = df_de_1['sentiment'].map(sentiment_mapping)
df_de_1['sentiment'] = condensed_labels

In [44]:
# Keep only annotated rows: remove, in place, every row whose 'Label' is NaN
df_de_1.dropna(axis=0, how='any', subset=['Label'], inplace=True)

In [45]:
# Percentage of condensed German rows whose predicted class equals the manual label
accuracy = df_de_1['sentiment'].eq(df_de_1['Label']).mean() * 100

# Report with two decimals
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 46.67%


In [46]:
# Opposite-polarity errors (condensed German data):
#   first count  - labeled 'positiv' but predicted 'negativ'
#   second count - labeled 'negativ' but predicted 'positiv'
positive_negative_count = (
    df_de_1['Label'].eq('positiv') & df_de_1['sentiment'].eq('negativ')
).sum()
negative_positive_count = (
    df_de_1['Label'].eq('negativ') & df_de_1['sentiment'].eq('positiv')
).sum()

print("Positive and Negative combinations count:", positive_negative_count)
print("Negative and Positive combinations count:", negative_positive_count)

Positive and Negative combinations count: 3
Negative and Positive combinations count: 1
