In [38]:
import requests
import numpy as np
import pandas as pd
from transformers import pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


# Load Data

In [39]:
# Load the CSV file into a DataFrame
#df_en = pd.read_csv('../Preprocessing/data_clean/labeled-data/labeled-en_clean_1-1_not101010.csv')

In [40]:
df_en = pd.read_csv('https://raw.githubusercontent.com/svisel22/SS23-BIPM-Analytics-Lab---Group-4-repository/main/Preprocessing/data_clean/labeled-data/labeled-en_clean_con_sen.csv')

In [41]:
df_en.head()

Unnamed: 0,data,player,language,publishedAt,Label
0,bayerleverkusen took lead minute midfielder p...,exequiel palacios,en,2023-02-23T20:50:50Z,
1,midfielders leandro paredes juventus angel ...,exequiel palacios,en,2023-03-03T16:42:19Z,neutral
2,half goal joshua kimmich canceled penalties pa...,exequiel palacios,en,2023-03-19T18:30:00Z,positiv
3,by reuters bayerleverkusen s palacios scored s...,exequiel palacios,en,2023-03-19T18:42:59Z,
4,bayerleverkusen s palacios scored second half ...,exequiel palacios,en,2023-03-19T19:05:09Z,positiv


# Fix NA

In [42]:
#for DE datasate drop empty lines
df_de = df_en.dropna(subset=['data'])

In [43]:
df_en.head()

Unnamed: 0,data,player,language,publishedAt,Label
0,bayerleverkusen took lead minute midfielder p...,exequiel palacios,en,2023-02-23T20:50:50Z,
1,midfielders leandro paredes juventus angel ...,exequiel palacios,en,2023-03-03T16:42:19Z,neutral
2,half goal joshua kimmich canceled penalties pa...,exequiel palacios,en,2023-03-19T18:30:00Z,positiv
3,by reuters bayerleverkusen s palacios scored s...,exequiel palacios,en,2023-03-19T18:42:59Z,
4,bayerleverkusen s palacios scored second half ...,exequiel palacios,en,2023-03-19T19:05:09Z,positiv


# Sentiment analysis full text

In [44]:
sentiment_classifier = pipeline('sentiment-analysis', model='bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [45]:
def perform_comparison_sentiment_analysis(df):
    # Create empty lists to store the sentiment scores and labels
    sentiment_scores = []
    sentiment_labels = []

    # Iterate over the 'data' column in the DataFrame
    for text in df['data']:
        # Perform sentiment analysis using the Hugging Face pipeline
        result = sentiment_classifier(text)[0]
        sentiment_score = result['score']
        
        # Convert logits to predicted label (positive/negative)
        predicted_label = 1 if sentiment_score > 0.5 else 0
        sentiment_label = "positiv" if predicted_label == 1 else "negativ"

        # Append the sentiment score and label to the respective lists
        sentiment_scores.append(sentiment_score)
        sentiment_labels.append(sentiment_label)

    # Add the sentiment scores and labels as new columns in the DataFrame
    df['sentiment_score'] = sentiment_scores
    df['sentiment_label'] = sentiment_labels

    return df


# Perform sentiment analysis on the DataFrame
df = perform_comparison_sentiment_analysis(df_en)

# Print the updated DataFrame
df.head()


Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
0,bayerleverkusen took lead minute midfielder p...,exequiel palacios,en,2023-02-23T20:50:50Z,,0.635572,positiv
1,midfielders leandro paredes juventus angel ...,exequiel palacios,en,2023-03-03T16:42:19Z,neutral,0.568171,positiv
2,half goal joshua kimmich canceled penalties pa...,exequiel palacios,en,2023-03-19T18:30:00Z,positiv,0.618601,positiv
3,by reuters bayerleverkusen s palacios scored s...,exequiel palacios,en,2023-03-19T18:42:59Z,,0.639803,positiv
4,bayerleverkusen s palacios scored second half ...,exequiel palacios,en,2023-03-19T19:05:09Z,positiv,0.623193,positiv


# Results with two labels

In [46]:
data = df_en.copy()
# Drop rows where 'Label' is NaN or empty
data.dropna(subset=['Label'], inplace=True)


# Print the updated DataFrame
data = data[data['Label'] != 'neutral']


data.head()


Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
2,half goal joshua kimmich canceled penalties pa...,exequiel palacios,en,2023-03-19T18:30:00Z,positiv,0.618601,positiv
4,bayerleverkusen s palacios scored second half ...,exequiel palacios,en,2023-03-19T19:05:09Z,positiv,0.623193,positiv
6,palacios scored penalties bayerleverkusen beat...,exequiel palacios,en,2023-03-19T20:03:28Z,positiv,0.599491,positiv
8,bayerleverkusen stun bayern munich bayerleverk...,exequiel palacios,en,2023-03-20T04:56:02Z,positiv,0.609421,positiv
11,foul palacios argentina,exequiel palacios,en,2023-03-29T06:58:57Z,negativ,0.610034,positiv


In [47]:
print(data['sentiment_score'].min())
print(data['sentiment_score'].max())

0.5994913578033447
0.6824474334716797


In [48]:
data[data['Label']=='negativ']

Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
11,foul palacios argentina,exequiel palacios,en,2023-03-29T06:58:57Z,negativ,0.610034,positiv
16,ronald koeman launched scathing assessment fri...,jeremie frimpong,en,2023-03-22T10:35:51Z,negativ,0.682447,positiv
17,ronald koeman undiplomatic return saying over...,jeremie frimpong,en,2023-03-24T07:23:27Z,negativ,0.652525,positiv


In [50]:
# Calculate the accuracy
accuracy = (data['sentiment_label'] == data['Label']).mean() * 100


# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 76.92%


In [51]:
# Assuming you have the true labels in 'Label' column and predicted labels in 'sentiment_label' column of the DataFrame
true_labels = data['Label']
predicted_labels = data['sentiment_label']


# Create the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels)


# Convert the confusion matrix to a DataFrame for better visualization
labels = np.unique(np.concatenate((true_labels, predicted_labels)))
cm_df = pd.DataFrame(cm, index=labels, columns=labels)


# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)


Confusion Matrix:
         negativ  positiv
negativ        0        3
positiv        0       10


In [52]:
# Assuming you have the true labels in 'Label' column and predicted labels in 'sentiment_label' column of the DataFrame
true_labels = data['Label']
predicted_labels = data['sentiment_label']


# Generate the classification report
report = classification_report(true_labels, predicted_labels)


# Display the classification report
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

     negativ       0.00      0.00      0.00         3
     positiv       0.77      1.00      0.87        10

    accuracy                           0.77        13
   macro avg       0.38      0.50      0.43        13
weighted avg       0.59      0.77      0.67        13



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Results with three labels

In [63]:
#three labels:
def perform_comparison_sentiment_analysis(df):
    # Create empty lists to store the sentiment scores and labels
    sentiment_scores = []
    sentiment_labels = []

    # Iterate over the 'data' column in the DataFrame
    for text in df['data']:
        # Perform sentiment analysis using the Hugging Face pipeline
        result = sentiment_classifier(text)[0]
        sentiment_score = result['score']
        
        # Determine sentiment label based on sentiment score
        if sentiment_score > 0.6:
            sentiment_label = "positiv"
        elif sentiment_score < 0.4:
            sentiment_label = "negativ"
        else:
            sentiment_label = "neutral"

        # Append the sentiment score and label to the respective lists
        sentiment_scores.append(sentiment_score)
        sentiment_labels.append(sentiment_label)

    # Add the sentiment scores and labels as new columns in the DataFrame
    df['sentiment_score'] = sentiment_scores
    df['sentiment_label'] = sentiment_labels

    return df


# Perform sentiment analysis on the DataFrame
df = perform_comparison_sentiment_analysis(df_en)

# Print the updated DataFrame
df.head()


Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
0,bayerleverkusen took lead minute midfielder p...,exequiel palacios,en,2023-02-23T20:50:50Z,,0.635572,positiv
1,midfielders leandro paredes juventus angel ...,exequiel palacios,en,2023-03-03T16:42:19Z,neutral,0.568171,neutral
2,half goal joshua kimmich canceled penalties pa...,exequiel palacios,en,2023-03-19T18:30:00Z,positiv,0.618601,positiv
3,by reuters bayerleverkusen s palacios scored s...,exequiel palacios,en,2023-03-19T18:42:59Z,,0.639803,positiv
4,bayerleverkusen s palacios scored second half ...,exequiel palacios,en,2023-03-19T19:05:09Z,positiv,0.623193,positiv


In [64]:
print(df['sentiment_label'].unique())
print(df['sentiment_score'].min())
print(df['sentiment_score'].max())

['positiv' 'neutral']
0.5645632147789001
0.691910982131958


In [69]:
# Calculate the accuracy
accuracy = (df['sentiment_label'] == df['Label']).mean() * 100


# Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy))

Accuracy: 8.15%


In [65]:
# Assuming you have your data stored in a DataFrame called 'df'
true_labels = df['Label']
predicted_labels = df['sentiment_label']

# Replace missing values with 'unknown' category
true_labels = true_labels.fillna('unknown')
predicted_labels = predicted_labels.fillna('unknown')

# Create the confusion matrix
confusion_mat = confusion_matrix(true_labels, predicted_labels)

# Convert confusion matrix to DataFrame for better visualization
categories = sorted(set(true_labels) | set(predicted_labels))
confusion_df = pd.DataFrame(confusion_mat, index=categories, columns=categories)

print(confusion_df)

         negativ  neutral  positiv  unknown
negativ        0        0        3        0
neutral        0        2        8        0
positiv        0        1        9        0
unknown        0       13       99        0


In [66]:
df[df['Label']=='neutral']

Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
1,midfielders leandro paredes juventus angel ...,exequiel palacios,en,2023-03-03T16:42:19Z,neutral,0.568171,neutral
14,timothy fosu mensah replaces frimpong,jeremie frimpong,en,2023-03-19T20:03:28Z,neutral,0.608497,positiv
23,frimpong manchester united looking PSmillion ...,jeremie frimpong,en,2023-04-07T06:22:07Z,neutral,0.617254,positiv
38,assisted frimpong,jeremie frimpong,en,2023-05-18T21:09:58Z,neutral,0.675571,positiv
46,when came down says frimpong,jeremie frimpong,en,2023-05-10T11:04:33Z,neutral,0.662112,positiv
60,arsenal manager arteta seeking close deal newc...,moussa diaby,en,2023-03-13T10:25:02Z,neutral,0.622505,positiv
69,time diaby introduced minute momentum swung i...,moussa diaby,en,2023-03-27T22:57:31Z,neutral,0.681747,positiv
102,addition rice crystal palace defender marc gu...,moussa diaby,en,2023-05-15T18:29:53Z,neutral,0.573503,neutral
107,west ham midfielder declan rice believed top t...,moussa diaby,en,2023-05-10T17:23:37Z,neutral,0.628964,positiv
115,despite influx players west london silva freq...,mykhaylo mudryk,en,2023-04-19T08:05:31Z,neutral,0.61489,positiv


In [67]:
df[df['Label']=='negativ']

Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
11,foul palacios argentina,exequiel palacios,en,2023-03-29T06:58:57Z,negativ,0.610034,positiv
16,ronald koeman launched scathing assessment fri...,jeremie frimpong,en,2023-03-22T10:35:51Z,negativ,0.682447,positiv
17,ronald koeman undiplomatic return saying over...,jeremie frimpong,en,2023-03-24T07:23:27Z,negativ,0.652525,positiv


In [68]:
df[df['sentiment_score']<0.6]

Unnamed: 0,data,player,language,publishedAt,Label,sentiment_score,sentiment_label
1,midfielders leandro paredes juventus angel ...,exequiel palacios,en,2023-03-03T16:42:19Z,neutral,0.568171,neutral
6,palacios scored penalties bayerleverkusen beat...,exequiel palacios,en,2023-03-19T20:03:28Z,positiv,0.599491,neutral
48,bakker bayerleverkusen left footed shot cent...,mitchel bakker,en,2023-03-19T20:03:28Z,,0.579518,neutral
55,jpw arsenal linked wingers galore moment appe...,moussa diaby,en,2023-02-25T16:02:07Z,,0.595702,neutral
61,forwards kingsley coman bayern munich olivi...,moussa diaby,en,2023-03-16T14:14:47Z,,0.598148,neutral
70,pacey bayerleverkusen winger diaby attracting ...,moussa diaby,en,2023-04-13T15:26:24Z,,0.598034,neutral
82,arsenal said interested brighton s moises caic...,moussa diaby,en,2023-05-05T23:35:55Z,,0.573915,neutral
85,rice key target arsenal revive brighton s moi...,moussa diaby,en,2023-05-05T06:09:01Z,,0.577933,neutral
88,france international bayerleverkusen winger d...,moussa diaby,en,2023-05-04T15:16:05Z,,0.564563,neutral
93,forward diaby came closest elusive goal bayerl...,moussa diaby,en,2023-05-19T10:42:41Z,,0.592809,neutral
