In [3]:
import requests
import numpy as np
import pandas as pd
from transformers import pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [4]:
# Load the CSV file into a DataFrame
df_en = pd.read_csv('../Preprocessing/data_clean/labeled-data/labeled-en_clean_1-1_not101010.csv')

In [5]:
df_en.head()

Unnamed: 0,data,player,language,publishedAt,Label
0,ten if you included the toe-poked volley to te...,palacios,en,2023-02-16T23:56:00Z,
1,bayerleverkusen took the lead again in the st ...,palacios,en,2023-02-23T20:50:50Z,
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,positiv
3,"midfielders: leandro paredes (juventus), angel...",palacios,en,2023-03-03T16:40:46Z,neutral
4,midfielders: rodrigo de paul (atletico madrid)...,palacios,en,2023-03-03T18:17:37Z,neutral


# Fix NA

In [6]:
#for DE datasate drop empty lines
df_de = df_en.dropna(subset=['data'])

In [7]:
df_en.head()

Unnamed: 0,data,player,language,publishedAt,Label
0,ten if you included the toe-poked volley to te...,palacios,en,2023-02-16T23:56:00Z,
1,bayerleverkusen took the lead again in the st ...,palacios,en,2023-02-23T20:50:50Z,
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,positiv
3,"midfielders: leandro paredes (juventus), angel...",palacios,en,2023-03-03T16:40:46Z,neutral
4,midfielders: rodrigo de paul (atletico madrid)...,palacios,en,2023-03-03T18:17:37Z,neutral


# Sentiment analysis full text

In [8]:
sentiment_classifier = pipeline('sentiment-analysis', model='bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
def perform_comparison_sentiment_analysis(df):
    # Create empty lists to store the sentiment scores and labels
    sentiment_scores = []
    sentiment_labels = []

    # Iterate over the 'data' column in the DataFrame
    for text in df['data']:
        # Perform sentiment analysis using the Hugging Face pipeline
        result = sentiment_classifier(text)[0]
        sentiment_score = result['score']
        
        # Convert logits to predicted label (positive/negative)
        predicted_label = 1 if sentiment_score > 0.5 else 0
        sentiment_label = "positiv" if predicted_label == 1 else "negativ"

        # Append the sentiment score and label to the respective lists
        sentiment_scores.append(sentiment_score)
        sentiment_labels.append(sentiment_label)

    # Add the sentiment scores and labels as new columns in the DataFrame
    df['sentiment_score'] = sentiment_scores
    df['sentiment_label'] = sentiment_labels

    return df


# Perform sentiment analysis on the DataFrame
df = perform_comparison_sentiment_analysis(df_en)

# Print the updated DataFrame
df.head()


Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
0,ten if you included the toe-poked volley to te...,palacios,en,2023-02-16T23:56:00Z,,0.5428,positiv
1,bayerleverkusen took the lead again in the st ...,palacios,en,2023-02-23T20:50:50Z,,0.563268,positiv
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,positiv,0.553591,positiv
3,"midfielders: leandro paredes (juventus), angel...",palacios,en,2023-03-03T16:40:46Z,neutral,0.50152,positiv
4,midfielders: rodrigo de paul (atletico madrid)...,palacios,en,2023-03-03T18:17:37Z,neutral,0.50523,positiv


In [10]:
'''three labels:
def perform_comparison_sentiment_analysis(df):
    # Create empty lists to store the sentiment scores and labels
    sentiment_scores = []
    sentiment_labels = []

    # Iterate over the 'data' column in the DataFrame
    for text in df['data']:
        # Perform sentiment analysis using the Hugging Face pipeline
        result = sentiment_classifier(text)[0]
        sentiment_score = result['score']
        
        # Determine sentiment label based on sentiment score
        if sentiment_score > 0.6:
            sentiment_label = "positive"
        elif sentiment_score < 0.4:
            sentiment_label = "negative"
        else:
            sentiment_label = "neutral"

        # Append the sentiment score and label to the respective lists
        sentiment_scores.append(sentiment_score)
        sentiment_labels.append(sentiment_label)

    # Add the sentiment scores and labels as new columns in the DataFrame
    df['sentiment_score'] = sentiment_scores
    df['sentiment_label'] = sentiment_labels

    return df


# Perform sentiment analysis on the DataFrame
df = perform_comparison_sentiment_analysis(df_en)

# Print the updated DataFrame
df.head()

'''

'three labels:\ndef perform_comparison_sentiment_analysis(df):\n    # Create empty lists to store the sentiment scores and labels\n    sentiment_scores = []\n    sentiment_labels = []\n\n    # Iterate over the \'data\' column in the DataFrame\n    for text in df[\'data\']:\n        # Perform sentiment analysis using the Hugging Face pipeline\n        result = sentiment_classifier(text)[0]\n        sentiment_score = result[\'score\']\n        \n        # Determine sentiment label based on sentiment score\n        if sentiment_score > 0.6:\n            sentiment_label = "positive"\n        elif sentiment_score < 0.4:\n            sentiment_label = "negative"\n        else:\n            sentiment_label = "neutral"\n\n        # Append the sentiment score and label to the respective lists\n        sentiment_scores.append(sentiment_score)\n        sentiment_labels.append(sentiment_label)\n\n    # Add the sentiment scores and labels as new columns in the DataFrame\n    df[\'sentiment_score\']

In [11]:
print(df['sentiment_label'].unique())
print(df['sentiment_score'].min())
print(df['sentiment_score'].max())

['positiv']
0.5005073547363281
0.5869556665420532


In [12]:
df[df['sentiment_label'] == 'negativ'].shape[0]/df.shape[0]

0.0

In [13]:
# Drop rows where 'Label' is NaN or empty
df_en.dropna(subset=['Label'], inplace=True)

# Print the updated DataFrame
data = df_en[df_en['Label'] != 'neutral']

df_en.head()

Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,positiv,0.553591,positiv
3,"midfielders: leandro paredes (juventus), angel...",palacios,en,2023-03-03T16:40:46Z,neutral,0.50152,positiv
4,midfielders: rodrigo de paul (atletico madrid)...,palacios,en,2023-03-03T18:17:37Z,neutral,0.50523,positiv
5,a first half goal from joshua kimmich was canc...,palacios,en,2023-03-19T18:30:00Z,positiv,0.549338,positiv
9,""" palacios scored two penalties as bayerlever...",palacios,en,2023-03-19T20:03:28Z,positiv,0.555996,positiv


In [14]:
data

Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
2,wissam ben yedder levelled straight away from ...,palacios,en,2023-02-23T20:53:59Z,positiv,0.553591,positiv
5,a first half goal from joshua kimmich was canc...,palacios,en,2023-03-19T18:30:00Z,positiv,0.549338,positiv
9,""" palacios scored two penalties as bayerlever...",palacios,en,2023-03-19T20:03:28Z,positiv,0.555996,positiv
17,"today it was a lifesaver for the game."" palaci...",palacios,en,2023-03-20T21:20:28Z,positiv,0.556484,positiv
19,foul by palacios (argentina),palacios,en,2023-03-29T06:58:57Z,negativ,0.540333,positiv
24,the main area that bayerleverkusen have lent o...,frimpong,en,2023-03-20T13:00:35Z,positiv,0.535735,positiv
26,ronald koeman launched a scathing assessment o...,frimpong,en,2023-03-22T10:35:51Z,negativ,0.546306,positiv
27,"ronald koeman has made an undiplomatic return,...",frimpong,en,2023-03-24T07:23:27Z,negativ,0.536617,positiv
30,bayerleverkusen handed schalke their first def...,frimpong,en,2023-04-01T19:24:43Z,positiv,0.540007,positiv
55,the red devils have been linked with bayerleve...,frimpong,en,2023-05-24T11:33:33Z,positiv,0.555975,positiv


In [15]:
# Calculate the accuracy
accuracy = (data['sentiment_label'] == data['Label']).mean() * 100

# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 76.47%


In [18]:
# Assuming you have the true labels in 'Label' column and predicted labels in 'sentiment_label' column of the DataFrame
true_labels = data['Label']
predicted_labels = data['sentiment_label']

# Create the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels)

# Convert the confusion matrix to a DataFrame for better visualization
labels = np.unique(np.concatenate((true_labels, predicted_labels)))
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)

Confusion Matrix:
         negativ  positiv
negativ        0        4
positiv        0       13


In [20]:
# Assuming you have the true labels in 'Label' column and predicted labels in 'sentiment_label' column of the DataFrame
true_labels = data['Label']
predicted_labels = data['sentiment_label']

# Generate the classification report
report = classification_report(true_labels, predicted_labels)

# Display the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

     negativ       0.00      0.00      0.00         4
     positiv       0.76      1.00      0.87        13

    accuracy                           0.76        17
   macro avg       0.38      0.50      0.43        17
weighted avg       0.58      0.76      0.66        17



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
