In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline

import matplotlib.pyplot as plt
import seaborn as sns
import ast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from collections import defaultdict
import itertools

In [84]:
# Load the manually annotated sample; later cells use its 'id' and 'manual_label' columns.
# NOTE(review): absolute, user-specific path — the notebook will not run elsewhere without editing it.
annotated_df = pd.read_csv('/work/ptyagi/masterthesis/data/test/new_annotation.csv')

In [85]:
annotated_df['manual_label'].value_counts()

manual_label
anger       33
joy         16
disgust     15
fear        13
sadness     12
surprise    10
Name: count, dtype: int64

In [86]:
# Load the replies table that the annotations are joined against on 'id'.
# NOTE(review): absolute, user-specific path — the notebook will not run elsewhere without editing it.
replies_feb_2019_en = pd.read_csv('/work/ptyagi/masterthesis/data/tmp/tweet_replies_feb_2019_en.csv')

In [87]:
# Left join: every annotated id is kept, and ids without a match in the replies
# table get NaN in the joined columns.
# NOTE(review): a NaN in 'replies' would be handed to the classifiers below —
# confirm every annotated id is present (and unique) in the replies table.
merged_df = pd.merge(annotated_df[['id','manual_label']], replies_feb_2019_en, on='id', how='left')

In [88]:
merged_df.head()

Unnamed: 0,id,manual_label,conversation_id,created_at,replies,tweet_text,tweet_lang,lang_confidence
0,1097968284102733825,anger,1097912551038439430,2019-02-19 21:16:52+00:00,seriously this is bs climate change is called ...,Despite the immediate danger posed by climate ...,en,0.924149
1,1100028581311381506,sadness,1099918265005080578,2019-02-25 13:43:45+00:00,hello ladies and gentleman i did a poem known ...,Many thanks for the RTs[USER] [USER] [USER] [U...,en,0.812816
2,1098431395314192384,fear,1097987884097900544,2019-02-21 03:57:07+00:00,climate change is real and nature will respond...,"It’s not a big, iconic or ‘sexy’ species and i...",en,0.96718
3,1100462023278821377,surprise,1100154569663733760,2019-02-26 18:26:06+00:00,i can not wait right now for the climate chang...,"In new Pew Poll, climate change falls to the b...",en,0.876844
4,1099798892596678657,anger,1099783621609750528,2019-02-24 22:31:03+00:00,is too stupid to understand maybe if they watc...,The White House plans to assemble a group of s...,en,0.876993


In [89]:
# Lookup that folds classifier labels onto the six labels used in the manual
# annotation (anger, disgust, fear, joy, sadness, surprise). Labels that already
# match map to themselves; the remaining ones are assigned to one of the six.
emotion_mapping = dict(
    anger="anger",
    anticipation="surprise",
    disgust="disgust",
    fear="fear",
    joy="joy",
    love="joy",
    optimism="joy",
    pessimism="fear",
    sadness="sadness",
    surprise="surprise",
    trust="joy",
)

In [90]:
# Text-classification pipeline for the base-size Twitter RoBERTa emotion checkpoint.
# NOTE(review): device=2 is a hard-coded device index — confirm it exists on the host.
emotion_classifier = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-latest", device=2)

In [91]:
# Run the classifier over all replies in batches of 8; the preview in the next
# cell shows one {'label', 'score'} dict per reply.
emotion_predictions = emotion_classifier(merged_df['replies'].tolist(), batch_size=8)

In [92]:
emotion_predictions[:5] # type: ignore

[{'label': 'anger', 'score': 0.9839533567428589},
 {'label': 'anticipation', 'score': 0.8289657831192017},
 {'label': 'optimism', 'score': 0.4852372109889984},
 {'label': 'anger', 'score': 0.9356208443641663},
 {'label': 'disgust', 'score': 0.9095050692558289}]

In [93]:
# Store the base model's predicted label and its score next to each reply.
# The lists are assigned positionally, so they rely on emotion_predictions
# being in the same order as merged_df's rows.
merged_df['pred_roberta_base'] = [pred['label'] for pred in emotion_predictions] # type: ignore
merged_df['confidence_roberta_base'] = [pred['score'] for pred in emotion_predictions] # type: ignore

In [94]:
# Fold the model's labels onto the manual-annotation labels via emotion_mapping.
# Series.map yields NaN for any label that is not a key of emotion_mapping.
merged_df['pred_roberta_base'] = merged_df['pred_roberta_base'].map(emotion_mapping)

In [95]:
# Rebind emotion_classifier to the large-size Twitter RoBERTa emotion checkpoint.
# NOTE(review): this reuses the name of the base-model pipeline, so the result of
# later cells depends on which of the two construction cells ran last.
emotion_classifier = pipeline("text-classification", model="cardiffnlp/twitter-roberta-large-emotion-latest", device=2)

In [96]:
# Re-run classification with whichever pipeline emotion_classifier currently
# names, overwriting the previous emotion_predictions list.
emotion_predictions = emotion_classifier(merged_df['replies'].tolist(), batch_size=8)

In [97]:
emotion_predictions[:5] # type: ignore

[{'label': 'disgust', 'score': 0.9929659366607666},
 {'label': 'optimism', 'score': 0.7913478016853333},
 {'label': 'optimism', 'score': 0.46346914768218994},
 {'label': 'sadness', 'score': 0.7249971628189087},
 {'label': 'disgust', 'score': 0.9782056212425232}]

In [98]:
# Store the large model's predicted label and its score next to each reply
# (positional assignment, same ordering assumption as for the base model).
merged_df['pred_roberta_large'] = [pred['label'] for pred in emotion_predictions] # type: ignore
merged_df['confidence_roberta_large'] = [pred['score'] for pred in emotion_predictions] # type: ignore

In [99]:
# Fold the large model's labels onto the manual-annotation labels via emotion_mapping.
# Series.map yields NaN for any label that is not a key of emotion_mapping.
merged_df['pred_roberta_large'] = merged_df['pred_roberta_large'].map(emotion_mapping)

In [100]:
def data():
    """Yield the entries of ``merged_df['replies']`` one by one, in row order.

    The column is read from the notebook-level ``merged_df`` when iteration
    starts, not when the generator object is created.
    """
    yield from merged_df['replies'].to_list()

In [101]:
# Zero-shot classification pipeline backed by the BART-large MNLI checkpoint.
# NOTE(review): device=2 is a hard-coded device index — confirm it exists on the host.
bart_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=2)



In [102]:
# Score every reply individually with the BART zero-shot pipeline against the
# six labels used in the manual annotation; one result dict is kept per reply,
# in the order produced by data().
classification_results = [
    bart_classifier(
        reply_text,
        candidate_labels=["anger", "disgust", "fear", "joy", "sadness", "surprise"],
    )
    for reply_text in data()
]


In [103]:
classification_results[3]

{'sequence': 'i can not wait right now for the climate change to does not feel warm out people oh wait it has to get cold before it gets hot',
 'labels': ['surprise', 'anger', 'joy', 'fear', 'disgust', 'sadness'],
 'scores': [0.35057732462882996,
  0.25067138671875,
  0.23708945512771606,
  0.10718908160924911,
  0.04285239055752754,
  0.011620339006185532]}

In [104]:
# Keep only the first label/score of each zero-shot result. In the sample output
# above the scores are listed in descending order, so index 0 is the top label.
merged_df['pred_bart'] = [x['labels'][0] for x in classification_results]
merged_df['confidence_bart'] = [x['scores'][0] for x in classification_results]

In [105]:

# Zero-shot classification pipeline backed by the multilingual mDeBERTa-v3 NLI checkpoint.
# NOTE(review): device=2 is a hard-coded device index — confirm it exists on the host.
deberta_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7", device=2)




In [106]:
# Score every reply individually with the mDeBERTa NLI zero-shot pipeline
# against the six manual-annotation labels. This overwrites the previous
# classification_results list with one result dict per reply.
classification_results = [
    deberta_classifier(
        reply_text,
        candidate_labels=["anger", "disgust", "fear", "joy", "sadness", "surprise"],
    )
    for reply_text in data()
]

In [107]:
classification_results[0]

{'sequence': 'seriously this is bs climate change is called weather end of lesson about of meteorologist can not even predict the weather daily on the news this is just more grant money for special interest stop wasting our tax dollars on an unpoven theory',
 'labels': ['sadness', 'disgust', 'surprise', 'anger', 'fear', 'joy'],
 'scores': [0.24847759306430817,
  0.24724611639976501,
  0.1980893462896347,
  0.16834282875061035,
  0.10081437230110168,
  0.0370296947658062]}

In [108]:
# Keep only the first label/score of each zero-shot result. In the sample output
# above the scores are listed in descending order, so index 0 is the top label.
merged_df['pred_deberta_nli'] = [x['labels'][0] for x in classification_results]
merged_df['confidence_deberta_nli'] = [x['scores'][0] for x in classification_results]

In [109]:
# Zero-shot classification pipeline backed by the DeBERTa-v3-large zeroshot-v2.0 checkpoint.
# NOTE(review): device=2 is a hard-coded device index — confirm it exists on the host.
zeroshot_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v2.0", device=2)

In [110]:
# Sentence pattern given to the zero-shot pipeline; "{}" is filled with each
# candidate label.
# NOTE(review): the wording "This emotion of this text" looks like a typo for
# "The emotion of this text". It is left untouched here because rewording the
# hypothesis would change the scores recorded in the outputs below.
hypothesis_template = "This emotion of this text is {}"

# Single-label scoring (multi_label=False) of every reply, one call per reply;
# overwrites the previous classification_results list.
classification_results = [
    zeroshot_classifier(
        reply_text,
        candidate_labels=["anger", "disgust", "fear", "joy", "sadness", "surprise"],
        hypothesis_template=hypothesis_template,
        multi_label=False,
    )
    for reply_text in data()
]

In [111]:
classification_results[3]

{'sequence': 'i can not wait right now for the climate change to does not feel warm out people oh wait it has to get cold before it gets hot',
 'labels': ['joy', 'surprise', 'fear', 'anger', 'sadness', 'disgust'],
 'scores': [0.8574373126029968,
  0.08026963472366333,
  0.024532657116651535,
  0.014948503114283085,
  0.014545111916959286,
  0.00826676283031702]}

In [112]:
# Keep only the first label/score of each zero-shot result. In the sample output
# above the scores are listed in descending order, so index 0 is the top label.
merged_df['pred_deberta_zero'] = [x['labels'][0] for x in classification_results]
merged_df['confidence_deberta_zero'] = [x['scores'][0] for x in classification_results]

In [113]:
merged_df.head()

Unnamed: 0,id,manual_label,conversation_id,created_at,replies,tweet_text,tweet_lang,lang_confidence,pred_roberta_base,confidence_roberta_base,pred_roberta_large,confidence_roberta_large,pred_bart,confidence_bart,pred_deberta_nli,confidence_deberta_nli,pred_deberta_zero,confidence_deberta_zero
0,1097968284102733825,anger,1097912551038439430,2019-02-19 21:16:52+00:00,seriously this is bs climate change is called ...,Despite the immediate danger posed by climate ...,en,0.924149,anger,0.983953,disgust,0.992966,disgust,0.528941,sadness,0.248478,anger,0.942579
1,1100028581311381506,sadness,1099918265005080578,2019-02-25 13:43:45+00:00,hello ladies and gentleman i did a poem known ...,Many thanks for the RTs[USER] [USER] [USER] [U...,en,0.812816,surprise,0.828966,joy,0.791348,surprise,0.425855,anger,0.703004,anger,0.412198
2,1098431395314192384,fear,1097987884097900544,2019-02-21 03:57:07+00:00,climate change is real and nature will respond...,"It’s not a big, iconic or ‘sexy’ species and i...",en,0.96718,joy,0.485237,joy,0.463469,surprise,0.356557,sadness,0.360635,fear,0.242224
3,1100462023278821377,surprise,1100154569663733760,2019-02-26 18:26:06+00:00,i can not wait right now for the climate chang...,"In new Pew Poll, climate change falls to the b...",en,0.876844,anger,0.935621,sadness,0.724997,surprise,0.350577,sadness,0.357519,joy,0.857437
4,1099798892596678657,anger,1099783621609750528,2019-02-24 22:31:03+00:00,is too stupid to understand maybe if they watc...,The White House plans to assemble a group of s...,en,0.876993,disgust,0.909505,disgust,0.978206,surprise,0.584068,disgust,0.404413,anger,0.91979


In [114]:
def confidence(confidence_threshold, df=None, models=None):
    """Measure how often each model is right on its high-confidence predictions.

    For every model, rows whose ``confidence_<model>`` value is at least
    ``confidence_threshold`` are selected and ``pred_<model>`` is compared with
    ``manual_label`` on those rows only.

    Parameters
    ----------
    confidence_threshold : float
        Inclusive lower bound applied to each ``confidence_<model>`` column.
    df : pandas.DataFrame, optional
        Frame holding ``manual_label`` plus the ``pred_<model>`` and
        ``confidence_<model>`` columns. Defaults to the notebook-level
        ``merged_df`` so existing ``confidence(0.9)`` calls keep working.
    models : iterable of str, optional
        Model suffixes to evaluate. Defaults to the five models scored in this
        notebook.

    Returns
    -------
    dict
        ``{model: {'correct': ..., 'total': ..., 'percentage': ...}}`` where
        ``percentage`` is ``correct / total * 100``, or NaN when no row reaches
        the threshold.
    """
    if df is None:
        # Fall back to the notebook's shared frame (original behaviour).
        df = merged_df
    if models is None:
        models = ['roberta_base', 'roberta_large', 'bart', 'deberta_nli', 'deberta_zero']

    results = {}

    for model in models:
        # Boolean indexing already returns a new frame; nothing is written back,
        # so no defensive copy or helper column is needed.
        high_confidence_df = df[df[f'confidence_{model}'] >= confidence_threshold]
        is_correct = high_confidence_df['manual_label'] == high_confidence_df[f'pred_{model}']

        correct_high_confidence = is_correct.sum()
        total_high_confidence = len(high_confidence_df)

        # A strict threshold can leave a model with zero rows; report NaN
        # explicitly instead of dividing by zero.
        if total_high_confidence:
            percentage_correct = correct_high_confidence / total_high_confidence * 100
        else:
            percentage_correct = float('nan')

        results[model] = {
            'correct': correct_high_confidence,
            'total': total_high_confidence,
            'percentage': percentage_correct,
        }

    return results

In [121]:
# Accuracy of each model on predictions with confidence >= 0.95.
# NOTE(review): this cell's execution count (121) is higher than the cells that
# follow it (116-119), i.e. it was re-run out of order — re-check after a
# restart-and-run-all.
results = confidence(0.95)
pd.DataFrame(results)

Unnamed: 0,roberta_base,roberta_large,bart,deberta_nli,deberta_zero
correct,35.0,34.0,2.0,0.0,15.0
total,47.0,52.0,2.0,1.0,19.0
percentage,74.468085,65.384615,100.0,0.0,78.947368


In [116]:
# Accuracy of each model on predictions with confidence >= 0.9.
results = confidence(0.9)
pd.DataFrame(results)

Unnamed: 0,roberta_base,roberta_large,bart,deberta_nli,deberta_zero
correct,38.0,41.0,4.0,0.0,24.0
total,61.0,65.0,5.0,1.0,33.0
percentage,62.295082,63.076923,80.0,0.0,72.727273


In [117]:
# Accuracy of each model on predictions with confidence >= 0.8.
results = confidence(0.8)
pd.DataFrame(results)

Unnamed: 0,roberta_base,roberta_large,bart,deberta_nli,deberta_zero
correct,43.0,41.0,6.0,5.0,32.0
total,75.0,76.0,11.0,7.0,48.0
percentage,57.333333,53.947368,54.545455,71.428571,66.666667


In [118]:
# Accuracy of each model on predictions with confidence >= 0.7.
results = confidence(0.7)
pd.DataFrame(results)

Unnamed: 0,roberta_base,roberta_large,bart,deberta_nli,deberta_zero
correct,47.0,47.0,10.0,8.0,40.0
total,82.0,89.0,25.0,14.0,61.0
percentage,57.317073,52.808989,40.0,57.142857,65.57377


In [119]:
# Show the first ten rows whose DeBERTa zero-shot confidence is >= 0.8, with the
# manual label, the predicted label and the reply text side by side.
# (The column name is a plain string; it has no placeholders, so no f-string.)
merged_df[merged_df['confidence_deberta_zero'] >= 0.8][['manual_label', 'pred_deberta_zero', 'replies']].head(10)

Unnamed: 0,manual_label,pred_deberta_zero,replies
0,anger,anger,seriously this is bs climate change is called ...
3,surprise,joy,i can not wait right now for the climate chang...
4,anger,anger,is too stupid to understand maybe if they watc...
5,anger,surprise,i fully expected meghans head to spin around c...
8,surprise,anger,emember when the alarmists said there would be...
9,sadness,anger,never have and never would i have a lot of dem...
10,joy,joy,i am a wind turbine but let us dance when i co...
11,anger,anger,fact globally cc is still based on theory only...
14,sadness,joy,i am thoroughly enjoying this decline into apo...
18,surprise,surprise,so you realized they gained more ice than expe...
