In [70]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# Load Data

In [71]:
# Load the CSV file into a DataFrame
#ACTION:Why doesn't this work for me? I checked the link so often and the one below works.
df_de = pd.read_csv('../Preprocessing/data_clean/labeled-data/labeled-de_clean_1-1.csv', sep=';')

In [72]:
# Load the CSV file into a DataFrame
df_de_con = pd.read_csv('../Preprocessing/data_clean/labeled-data/labeled-de_clean_condensed.csv', sep=';')

In [73]:
df_de_con.head()

Unnamed: 0,data,player,language,publishedAt,Label
0,trainer alonso mitgereisten fans monaco glucks...,exequiel palacios,de,2023-02-24T09:33:31Z,positiv
1,sechser droht viereinhalb monaten pause bitter...,exequiel palacios,de,2023-03-03T21:35:13Z,
2,bayerleverkusen sechser erklart budapest spiel...,exequiel palacios,de,2023-03-07T11:34:39Z,
3,bakker mittwoch abschluss training bayerleverk...,exequiel palacios,de,2023-03-08T14:25:18Z,
4,allerdings andrich argentinischen weltmeister ...,exequiel palacios,de,2023-03-09T19:53:46Z,


### Fix DE

In [74]:
#for DE datasate drop empty lines
df_de = df_de.dropna(subset=['data']) #Why in data and not in Label column?

# Sentiment analysis full text

In [75]:
def perform_sentiment_analysis(df):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("oliverguhr/german-sentiment-bert")
    model = AutoModelForSequenceClassification.from_pretrained("oliverguhr/german-sentiment-bert")

    # Create an empty list to store the sentiment scores
    sentiment_scores = []

    # Iterate over the 'data' column in the DataFrame
    for text in df['data']:
        # Tokenize the input text
        tokens = tokenizer.encode_plus(text, padding="max_length", truncation=True, max_length=128,
                                       return_tensors="pt")

        # Perform the sentiment analysis
        with torch.no_grad():
            logits = model(**tokens)[0]

        # Convert logits to predicted label (positive/negative)
        predicted_label = torch.argmax(logits, dim=1).item()
        sentiment = "positiv" if predicted_label == 1 else "negativ"

        # Append the sentiment score to the list
        sentiment_scores.append(sentiment)

    # Add the sentiment scores as a new column in the DataFrame
    df['sentiment'] = sentiment_scores

    return df

## d1-1

In [76]:
# Perform sentiment analysis on the DataFrame
df = perform_sentiment_analysis(df_de)

# Print the updated DataFrame
df.head()

Unnamed: 0,data,player,language,publishedAt,Label,sentiment
0,trainer alonso vor den mitgereisten fans in mo...,palacios,de,2023-02-24T09:33:31Z,,negativ
1,zudem ist die konkurrenzsituation auf der dopp...,palacios,de,2023-03-03T21:35:13Z,,negativ
2,wie auch palacios sah der defensive mittelfeld...,palacios,de,2023-03-07T11:34:39Z,,negativ
3,",,er ist eine option"", erklart alonso, der im ...",palacios,de,2023-03-08T14:25:18Z,,negativ
4,allerdings waren in andrich und dem argenti...,palacios,de,2023-03-09T19:53:46Z,,negativ


In [77]:
df[df['sentiment'] == 'negativ'].shape[0]/df.shape[0]

0.9703872437357631

In [78]:
# Drop rows where 'Label' is NaN or empty
df_de.dropna(subset=['Label'], inplace=True)

# Print the updated DataFrame
data = df_de[df_de['Label'] != 'neutral']

df_de

Unnamed: 0,data,player,language,publishedAt,Label,sentiment
14,"zweimal verwandelte palacios, es wurde ein ube...",palacios,de,2023-03-19T20:01:45Z,positiv,negativ
23,schalke: reis nimmt den spieler ausdrucklich i...,palacios,de,2023-04-03T08:12:20Z,neutral,negativ
31,palacios (r.) verletzte sich leicht im hinspie...,palacios,de,2023-04-17T06:50:19Z,neutral,negativ
41,"""auf jeden fall"" sei das ein ganz grosser tag,...",palacios,de,2023-04-20T20:54:03Z,positiv,negativ
58,beim abschlusstraining trug letzterer zumindes...,palacios,de,2023-05-17T11:20:50Z,neutral,negativ
63,bayerleverkusen argentinischer weltmeister pal...,palacios,de,2023-05-14T18:07:21Z,negativ,negativ
83,"""es gibt im moment nicht viel besseres in der ...",frimpong,de,2023-03-31T07:59:57Z,positiv,negativ
93,der fc bayern munchen interessiert sich nach i...,frimpong,de,2023-04-06T16:49:00Z,neutral,negativ
95,frimpong hat noch bis vertrag in bayerleverkus...,frimpong,de,2023-04-07T07:42:27Z,neutral,negativ
102,hincapie (l.) und tah bedanken sich fur die fa...,frimpong,de,2023-04-14T08:16:22Z,positiv,negativ


In [88]:
data.head()

Unnamed: 0,data,player,language,publishedAt,Label,sentiment
14,"zweimal verwandelte palacios, es wurde ein ube...",palacios,de,2023-03-19T20:01:45Z,positiv,negativ
41,"""auf jeden fall"" sei das ein ganz grosser tag,...",palacios,de,2023-04-20T20:54:03Z,positiv,negativ
63,bayerleverkusen argentinischer weltmeister pal...,palacios,de,2023-05-14T18:07:21Z,negativ,negativ
83,"""es gibt im moment nicht viel besseres in der ...",frimpong,de,2023-03-31T07:59:57Z,positiv,negativ
102,hincapie (l.) und tah bedanken sich fur die fa...,frimpong,de,2023-04-14T08:16:22Z,positiv,negativ


In [89]:
# Calculate the accuracy
accuracy = (data['sentiment'] == data['Label']).mean() * 100

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 50.00%


In [91]:
print(data['sentiment'].unique())

['negativ']


In [92]:
# Assuming you have the true labels in 'Label' column and predicted labels in 'sentiment_label' column of the DataFrame
true_labels = data['Label']
predicted_labels = data['sentiment']

# Create the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels)

# Convert the confusion matrix to a DataFrame for better visualization
labels = np.unique(np.concatenate((true_labels, predicted_labels)))
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)

Confusion Matrix:
         negativ  positiv
negativ       10        0
positiv       10        0


In [93]:
# Assuming you have the true labels in 'Label' column and predicted labels in 'sentiment_label' column of the DataFrame
true_labels = data['Label']
predicted_labels = data['sentiment']

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Display the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

     negativ       0.50      1.00      0.67        10
     positiv       0.00      0.00      0.00        10

    accuracy                           0.50        20
   macro avg       0.25      0.50      0.33        20
weighted avg       0.25      0.50      0.33        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## d_con

In [81]:
# Perform sentiment analysis on the DataFrame
df_con = perform_sentiment_analysis(df_de_con)

# Print the updated DataFrame
df_con.head()

Unnamed: 0,data,player,language,publishedAt,Label,sentiment
0,trainer alonso mitgereisten fans monaco glucks...,exequiel palacios,de,2023-02-24T09:33:31Z,positiv,negativ
1,sechser droht viereinhalb monaten pause bitter...,exequiel palacios,de,2023-03-03T21:35:13Z,,negativ
2,bayerleverkusen sechser erklart budapest spiel...,exequiel palacios,de,2023-03-07T11:34:39Z,,positiv
3,bakker mittwoch abschluss training bayerleverk...,exequiel palacios,de,2023-03-08T14:25:18Z,,negativ
4,allerdings andrich argentinischen weltmeister ...,exequiel palacios,de,2023-03-09T19:53:46Z,,negativ


In [82]:
df_con[df_con['sentiment'] == 'negativ'].shape[0]/df_con.shape[0]

0.8901639344262295

In [83]:
# Drop rows where 'Label' is NaN or empty
df_de_con.dropna(subset=['Label'], inplace=True)

# Print the updated DataFrame
data_con = df_de_con[df_de_con['Label'] != 'neutral']

df_de_con

Unnamed: 0,data,player,language,publishedAt,Label,sentiment
0,trainer alonso mitgereisten fans monaco glucks...,exequiel palacios,de,2023-02-24T09:33:31Z,positiv,negativ
9,heisse begegnung bremen trotz fruhen fuhrungst...,exequiel palacios,de,2023-03-12T11:30:02Z,neutral,negativ
29,dazn experte michael ballack gesprach julian n...,exequiel palacios,de,2023-03-20T13:16:11Z,neutral,negativ
72,diesmal andrich entnervt angesprochen standige...,exequiel palacios,de,2023-05-05T18:02:55Z,negativ,negativ
111,torschutzen sich wirtz jubeln tage wirtz darau...,jeremie frimpong,de,2023-04-02T12:05:20Z,positiv,negativ
132,hincapie tah bedanken fan unterstutzung tolle ...,jeremie frimpong,de,2023-04-14T08:16:22Z,positiv,positiv
148,bayerleverkusen vortanzer jubelt uber tor hins...,jeremie frimpong,de,2023-04-20T21:02:06Z,positv,negativ
162,bayerleverkusen torschutze amiri rechts jubeln...,jeremie frimpong,de,2023-04-23T17:33:57Z,positiv,negativ
168,ganz entspannt fruhstuck amsterdam bayerleverk...,jeremie frimpong,de,2023-05-01T16:20:02Z,neutral,negativ
170,bayerleverkusen verteidiger drei spiele gesper...,jeremie frimpong,de,2023-05-23T10:24:42Z,negativ,negativ


In [84]:
data_con.head()

Unnamed: 0,data,player,language,publishedAt,Label,sentiment
0,trainer alonso mitgereisten fans monaco glucks...,exequiel palacios,de,2023-02-24T09:33:31Z,positiv,negativ
72,diesmal andrich entnervt angesprochen standige...,exequiel palacios,de,2023-05-05T18:02:55Z,negativ,negativ
111,torschutzen sich wirtz jubeln tage wirtz darau...,jeremie frimpong,de,2023-04-02T12:05:20Z,positiv,negativ
132,hincapie tah bedanken fan unterstutzung tolle ...,jeremie frimpong,de,2023-04-14T08:16:22Z,positiv,positiv
148,bayerleverkusen vortanzer jubelt uber tor hins...,jeremie frimpong,de,2023-04-20T21:02:06Z,positv,negativ


In [85]:
# Calculate the accuracy
accuracy_con = (data_con['sentiment'] == data_con['Label']).mean() * 100

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy_con))

Accuracy: 42.86%


In [94]:
print(data_con['sentiment'].unique())

['negativ' 'positiv']


In [97]:
# Assuming you have the true labels in 'Label' column and predicted labels in 'sentiment_label' column of the DataFrame
true_labels = data_con['Label']
predicted_labels = data_con['sentiment']

# Create the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels)

# Convert the confusion matrix to a DataFrame for better visualization
labels = np.unique(np.concatenate((true_labels, predicted_labels)))
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)

Confusion Matrix:
         negativ  neutal  positiv  positv
negativ        8       0        2       0
neutal         1       0        0       0
positiv        8       0        1       0
positv         1       0        0       0


In [98]:
# Assuming you have the true labels in 'Label' column and predicted labels in 'sentiment_label' column of the DataFrame
true_labels = data_con['Label']
predicted_labels = data_con['sentiment']

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Display the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

     negativ       0.44      0.80      0.57        10
      neutal       0.00      0.00      0.00         1
     positiv       0.33      0.11      0.17         9
      positv       0.00      0.00      0.00         1

    accuracy                           0.43        21
   macro avg       0.19      0.23      0.18        21
weighted avg       0.35      0.43      0.34        21



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
