In [23]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_punctuation, strip_multiple_whitespaces, strip_short
from sklearn.metrics import confusion_matrix

# Load model

In [9]:
# Checkpoint used for both the tokenizer and the classifier; model card:
# https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Load cleaned data

In [10]:
# Read the labeled CSV files (de / en / es) into one DataFrame each.
LABELED_DATA_DIR = '../Preprocessing/data_clean/labeled-data/'

# NOTE(review): the DE and ES files are read with ';' as separator while the EN
# file is read with the pandas default (',') — presumably the files really
# differ; verify against the data.
df_de = pd.read_csv(LABELED_DATA_DIR + 'labeled-de_clean_1-1.csv', sep=';')
df_en = pd.read_csv(LABELED_DATA_DIR + 'labeled-en_clean_1-1_not101010.csv')
df_es = pd.read_csv(LABELED_DATA_DIR + 'labeled-es_clean_1-1.csv', sep=';')


In [11]:
# Normalise the text column of every frame: punctuation is stripped first,
# then runs of whitespace are collapsed.
df_de['data'] = (
    df_de['data']
    .apply(strip_punctuation)
    .apply(strip_multiple_whitespaces)
)

df_en['data'] = (
    df_en['data']
    .apply(strip_punctuation)
    .apply(strip_multiple_whitespaces)
)

# The Spanish frame gets one extra step between the two: every 'xa' substring
# is replaced by a space.
# NOTE(review): presumably this removes the residue of '\xa0' (non-breaking
# space) escapes, but it also hits any word that contains "xa" — confirm
# against the data, ideally by fixing the escape in the preprocessing step.
df_es['data'] = (
    df_es['data']
    .apply(strip_punctuation)
    .str.replace('xa', ' ')
    .apply(strip_multiple_whitespaces)
)

# Sentiment Analysis

In [12]:
def sentiment_score(data, max_length=512):
    """Return the model's predicted class for a text as a 1-based score.

    Parameters
    ----------
    data : object
        Text to classify. Anything that is not a ``str`` (e.g. a NaN cell)
        is not scored.
    max_length : int, optional
        Maximum number of tokens (special tokens included) handed to the
        model. Longer inputs are truncated by the tokenizer.

    Returns
    -------
    int
        Index of the largest logit plus one, or ``0`` when ``data`` is not a
        string.
    """
    if not isinstance(data, str):
        return 0

    # Truncate by tokens rather than relying on callers to pre-slice by
    # characters: a character limit does not bound the token count once the
    # tokenizer adds its special tokens.
    tokens = tokenizer.encode(
        data,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Pure inference: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        result = model(tokens)

    return int(torch.argmax(result.logits)) + 1

In [13]:
# Score every text with the model. Only the first 512 characters are passed on.
# Non-string cells (e.g. NaN) cannot be sliced, so they are handed over
# unchanged; sentiment_score returns 0 for them instead of the cell raising a
# TypeError.
for scored_df in (df_de, df_en, df_es):
    scored_df['sentiment'] = scored_df['data'].apply(
        lambda x: sentiment_score(x[:512] if isinstance(x, str) else x)
    )

### Translate scores to positive, neutral and negative

In [14]:
# Collapse the 1-5 scores into three classes: only the extremes count as
# negative / positive. Scores without an entry (e.g. 0) become NaN.
sentiment_mapping = {1: 'negativ', 2: 'neutral', 3: 'neutral', 4: 'neutral', 5: 'positiv'}

# Replace the numbers with labels using the mapping. Values that already are
# labels are kept as they are, so re-running this cell does not turn the whole
# column into NaN.
for mapped_df in (df_de, df_en, df_es):
    already_labeled = mapped_df['sentiment'].isin(set(sentiment_mapping.values()))
    mapped_df['sentiment'] = (
        mapped_df['sentiment']
        .map(sentiment_mapping)
        .mask(already_labeled, mapped_df['sentiment'])
    )

### Drop unlabeled rows

In [15]:
# Keep only rows whose 'Label' is not NaN (empty strings are not removed by
# dropna).
for labeled_df in (df_de, df_en, df_es):
    labeled_df.dropna(subset=['Label'], inplace=True)

# Show the remaining German rows
print(df_de)

                                                  data    player language  \
14   zweimal verwandelte palacios es wurde ein uber...  palacios       de   
23   schalke reis nimmt den spieler ausdrucklich in...  palacios       de   
31   palacios r verletzte sich leicht im hinspiel g...  palacios       de   
41    auf jeden fall sei das ein ganz grosser tag s...  palacios       de   
58   beim abschlusstraining trug letzterer zumindes...  palacios       de   
63   bayerleverkusen argentinischer weltmeister pal...  palacios       de   
83    es gibt im moment nicht viel besseres in der ...  frimpong       de   
93   der fc bayern munchen interessiert sich nach i...  frimpong       de   
95   frimpong hat noch bis vertrag in bayerleverkus...  frimpong       de   
102  hincapie l und tah bedanken sich fur die fan u...  frimpong       de   
120  diaby wirtz adam hlozek und frimpong teilten s...  frimpong       de   
123   bayerleverkusen torschutze amiri rechts und f...  frimpong       de   

### Exact-match accuracy (positive, neutral and negative)

In [16]:
# Exact-match accuracy per language: share of rows (in percent) whose
# predicted class equals the reference label.
accuracy_de, accuracy_en, accuracy_es = (
    frame['sentiment'].eq(frame['Label']).mean() * 100
    for frame in (df_de, df_en, df_es)
)

# Report the three accuracies with two decimals
print(f"Accuracy DE: {accuracy_de:.2f}%")
print(f"Accuracy EN: {accuracy_en:.2f}%")
print(f"Accuracy ES: {accuracy_es:.2f}%")

Accuracy DE: 56.67%
Accuracy EN: 46.67%
Accuracy ES: 43.33%


In [28]:
def c_matrix(df, labels=None):
    """Build a confusion matrix with per-class TP/TN/FP/FN percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Label' column (reference classes) and a 'sentiment'
        column (predicted classes).
    labels : sequence of str, optional
        Classes, in the order used for rows and columns. Defaults to
        ``['positiv', 'neutral', 'negativ']``.

    Returns
    -------
    pandas.DataFrame
        One row per class. The first ``len(labels)`` columns hold the raw
        counts (rows follow 'Label', columns follow 'sentiment'). The four
        remaining columns give, for each class treated one-vs-rest, the true
        positives, true negatives, false positives and false negatives as a
        percentage of the sum of the matrix. When the matrix is all zeros the
        percentages are NaN.
    """
    if labels is None:
        labels = ['positiv', 'neutral', 'negativ']
    else:
        labels = list(labels)

    # Rows are the reference classes, columns the predicted classes.
    cm = confusion_matrix(df['Label'], df['sentiment'], labels=labels)

    total = np.sum(cm)
    hits = np.diag(cm)
    predicted_per_class = np.sum(cm, axis=0)
    actual_per_class = np.sum(cm, axis=1)

    true_pos = hits
    false_pos = predicted_per_class - hits
    false_neg = actual_per_class - hits
    # Everything that is neither in the class's row nor in its column.
    true_neg = total - (predicted_per_class + actual_per_class - hits)

    # With no counted samples every ratio is 0/0; keep the NaN result but do
    # not emit a RuntimeWarning for it.
    with np.errstate(divide='ignore', invalid='ignore'):
        tp_percent = (true_pos / total) * 100
        tn_percent = (true_neg / total) * 100
        fp_percent = false_pos / total * 100
        fn_percent = false_neg / total * 100

    confusion_df = pd.DataFrame(cm, index=labels, columns=labels)
    confusion_df['True Positives (%)'] = tp_percent
    confusion_df['True Negatives (%)'] = tn_percent
    confusion_df['False Positives (%)'] = fp_percent
    confusion_df['False Negatives (%)'] = fn_percent
    return confusion_df

In [29]:
# Confusion matrix and per-class TP/TN/FP/FN percentages for df_de
de_matrix = c_matrix(df_de)
de_matrix

Unnamed: 0,positiv,neutral,negativ,True Positives (%),True Negatives (%),False Positives (%),False Negatives (%)
positiv,4,3,3,13.333333,60.0,6.666667,20.0
neutral,1,8,1,26.666667,43.333333,23.333333,6.666667
negativ,1,4,5,16.666667,53.333333,13.333333,16.666667


In [30]:
# Confusion matrix and per-class TP/TN/FP/FN percentages for df_en
en_matrix = c_matrix(df_en)
en_matrix

Unnamed: 0,positiv,neutral,negativ,True Positives (%),True Negatives (%),False Positives (%),False Negatives (%)
positiv,2,5,6,6.666667,50.0,6.666667,36.666667
neutral,2,8,3,26.666667,40.0,16.666667,16.666667
negativ,0,0,4,13.333333,56.666667,30.0,0.0


In [31]:
# Confusion matrix and per-class TP/TN/FP/FN percentages for df_es
es_matrix = c_matrix(df_es)
es_matrix

Unnamed: 0,positiv,neutral,negativ,True Positives (%),True Negatives (%),False Positives (%),False Negatives (%)
positiv,0,5,5,0.0,66.666667,0.0,33.333333
neutral,0,8,2,26.666667,33.333333,33.333333,6.666667
negativ,0,5,5,16.666667,43.333333,23.333333,16.666667


# Accuracy for condensed data

In [17]:
# Load the labeled "condensed" German CSV (';'-separated) into a DataFrame
df_de_1 = pd.read_csv('../Preprocessing/data_clean/labeled-data/labeled-de_clean_condensed.csv',sep = ';')

In [18]:
# Score the first 512 characters of every text. Non-string cells (e.g. NaN)
# cannot be sliced, so they are handed over unchanged; sentiment_score returns
# 0 for them instead of the cell raising a TypeError.
df_de_1['sentiment'] = df_de_1['data'].apply(
    lambda x: sentiment_score(x[:512] if isinstance(x, str) else x)
)

In [19]:
# Reuses `sentiment_mapping` (score -> label) defined in the
# "Translate scores" section instead of declaring a second copy.
# Values that already are labels are kept as they are, so re-running this cell
# does not turn the whole column into NaN.
already_labeled = df_de_1['sentiment'].isin(set(sentiment_mapping.values()))
df_de_1['sentiment'] = (
    df_de_1['sentiment']
    .map(sentiment_mapping)
    .mask(already_labeled, df_de_1['sentiment'])
)

In [20]:
# Drop rows whose 'Label' is NaN, in place (dropna does not remove empty strings)
df_de_1.dropna(subset=['Label'], inplace=True)

In [21]:
# Exact-match accuracy on the condensed set: share of rows (in percent) whose
# predicted class equals the reference label.
matches = df_de_1['sentiment'].eq(df_de_1['Label'])
accuracy = matches.mean() * 100

# Report the accuracy with two decimals
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 46.67%
