In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd


# Load Data

In [30]:
# Load the CSV file into a DataFrame
df_de = pd.read_csv('../Preprocessing/data_clean/de_clean_1.csv')

### Fix DE

In [31]:
#for DE datasate drop empty lines
df_de = df_de.dropna(subset=['data'])

# Sentiment analysis full text

In [32]:
def perform_sentiment_analysis(df):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("oliverguhr/german-sentiment-bert")
    model = AutoModelForSequenceClassification.from_pretrained("oliverguhr/german-sentiment-bert")

    # Create an empty list to store the sentiment scores
    sentiment_scores = []

    # Iterate over the 'data' column in the DataFrame
    for text in df['data']:
        # Tokenize the input text
        tokens = tokenizer.encode_plus(text, padding="max_length", truncation=True, max_length=128,
                                       return_tensors="pt")

        # Perform the sentiment analysis
        with torch.no_grad():
            logits = model(**tokens)[0]

        # Convert logits to predicted label (positive/negative)
        predicted_label = torch.argmax(logits, dim=1).item()
        sentiment = "positive" if predicted_label == 1 else "negative"

        # Append the sentiment score to the list
        sentiment_scores.append(sentiment)

    # Add the sentiment scores as a new column in the DataFrame
    df['sentiment'] = sentiment_scores

    return df


# Perform sentiment analysis on the DataFrame
df = perform_sentiment_analysis(df_de)

# Print the updated DataFrame
print(df)

                                                  data             player  \
0    trainer xabi alonso mitgereisten fans monaco f...  exequiel palacios   
1    sechser droht viereinhalb monaten pause bitter...  exequiel palacios   
2    leverkusens sechser erklart budapest spiel jun...  exequiel palacios   
3    mitchel bakker mittwoch abschluss training lev...  exequiel palacios   
5    warum sehe faz net nicht permalink aktuelle na...  exequiel palacios   
..                                                 ...                ...   
627  bayer leverkusen wehrt europa league lange gut...     piero hincapie   
628  xabi alonso taktik duell alten lehrmeister jos...     piero hincapie   
629  trotz pleite halbfinal hinspiel rom gibt lever...     piero hincapie   
630  hitzigen aufeinandertreffen stadio olimpico na...     piero hincapie   
631  hitzigen aufeinandertreffen stadio olimpico na...     piero hincapie   

    language           publishedAt sentiment  
0         de  2023-02-24T09:

# Percentage negative

In [38]:
df[df['sentiment'] == 'negative'].shape[0]/df.shape[0]

0.9517684887459807

# Analysis only Player

In [39]:
def extract_sentence(df):
    df['short'] = ''
    
    for index, row in df.iterrows():
        player = row['player']
        data = str(row['data'])  # Convert data to string
        
        # Find the sentence containing player's information
        sentences = data.split('.')
        for sentence in sentences:
            if player.lower() in sentence.lower():
                df.at[index, 'short'] = sentence.strip()
                break
    
    return df

### Create short column

In [40]:
extract_sentence(df)

Unnamed: 0,data,player,language,publishedAt,sentiment,short
0,trainer xabi alonso mitgereisten fans monaco f...,exequiel palacios,de,2023-02-24T09:33:31Z,negative,trainer xabi alonso mitgereisten fans monaco f...
1,sechser droht viereinhalb monaten pause bitter...,exequiel palacios,de,2023-03-03T21:35:13Z,negative,sechser droht viereinhalb monaten pause bitter...
2,leverkusens sechser erklart budapest spiel jun...,exequiel palacios,de,2023-03-07T11:34:39Z,negative,leverkusens sechser erklart budapest spiel jun...
3,mitchel bakker mittwoch abschluss training lev...,exequiel palacios,de,2023-03-08T14:25:18Z,negative,mitchel bakker mittwoch abschluss training lev...
5,warum sehe faz net nicht permalink aktuelle na...,exequiel palacios,de,2023-03-09T19:53:46Z,negative,warum sehe faz net nicht permalink aktuelle na...
...,...,...,...,...,...,...
627,bayer leverkusen wehrt europa league lange gut...,piero hincapie,de,2023-05-12T10:02:12Z,negative,bayer leverkusen wehrt europa league lange gut...
628,xabi alonso taktik duell alten lehrmeister jos...,piero hincapie,de,2023-05-11T21:17:00Z,negative,xabi alonso taktik duell alten lehrmeister jos...
629,trotz pleite halbfinal hinspiel rom gibt lever...,piero hincapie,de,2023-05-11T21:09:16Z,negative,trotz pleite halbfinal hinspiel rom gibt lever...
630,hitzigen aufeinandertreffen stadio olimpico na...,piero hincapie,de,2023-05-11T20:58:58Z,negative,hitzigen aufeinandertreffen stadio olimpico na...


# New analysis

In [41]:
def perform_sentiment_analysis1(df):
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("oliverguhr/german-sentiment-bert")
    model = AutoModelForSequenceClassification.from_pretrained("oliverguhr/german-sentiment-bert")

    # Create an empty list to store the sentiment scores
    sentiment_scores = []

    # Iterate over the 'data' column in the DataFrame
    for text in df['short']:
        # Tokenize the input text
        tokens = tokenizer.encode_plus(text, padding="max_length", truncation=True, max_length=128,
                                       return_tensors="pt")

        # Perform the sentiment analysis
        with torch.no_grad():
            logits = model(**tokens)[0]

        # Convert logits to predicted label (positive/negative)
        predicted_label = torch.argmax(logits, dim=1).item()
        sentiment = "positive" if predicted_label == 1 else "negative"

        # Append the sentiment score to the list
        sentiment_scores.append(sentiment)

    # Add the sentiment scores as a new column in the DataFrame
    df['sentiment2'] = sentiment_scores

    return df


# Perform sentiment analysis on the DataFrame
df = perform_sentiment_analysis1(df_de)

# Print the updated DataFrame
print(df)

                                                  data             player  \
0    trainer xabi alonso mitgereisten fans monaco f...  exequiel palacios   
1    sechser droht viereinhalb monaten pause bitter...  exequiel palacios   
2    leverkusens sechser erklart budapest spiel jun...  exequiel palacios   
3    mitchel bakker mittwoch abschluss training lev...  exequiel palacios   
5    warum sehe faz net nicht permalink aktuelle na...  exequiel palacios   
..                                                 ...                ...   
627  bayer leverkusen wehrt europa league lange gut...     piero hincapie   
628  xabi alonso taktik duell alten lehrmeister jos...     piero hincapie   
629  trotz pleite halbfinal hinspiel rom gibt lever...     piero hincapie   
630  hitzigen aufeinandertreffen stadio olimpico na...     piero hincapie   
631  hitzigen aufeinandertreffen stadio olimpico na...     piero hincapie   

    language           publishedAt sentiment  \
0         de  2023-02-24T09

# Percentage of Negative

In [43]:
df[df['sentiment2'] == 'negative'].shape[0]/df.shape[0]

0.7427652733118971