# Dataset: *ferguson*

- df.to_csv(r'D:\論文\PHEME9\Data\CSV\ferguson-df.csv',index=False)
- src_tw_df.to_csv(r'D:\論文\PHEME9\Data\CSV\ferguson-src.csv',index=False)

In [1]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from transformers import AutoModelForSeq2SeqLM

In [2]:
# Reply-level table: drop every row that has a missing value, then renumber
# the index from 0 so positional labels stay contiguous.
df = (
    pd.read_csv(r'D:\論文\PHEME9\Data\CSV\ferguson-df.csv', encoding='utf-8', header=0)
    .dropna()
    .reset_index(drop=True)
)
# Source-tweet table, loaded as-is (no rows are dropped here).
src_tw_df = pd.read_csv(r'D:\論文\PHEME9\Data\CSV\ferguson-src.csv', encoding='utf-8', header=0)

## Emotion Analysis

### T5 Based
<https://huggingface.co/mrm8488/t5-base-finetuned-emotion?text=I+wish+you+were+here+but+it+is+impossible>

In [3]:
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForSequenceClassification

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [4]:
# use_fast=False selects the Python tokenizer implementation instead of the Rust-backed one.
t5_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion",use_fast=False)
# AutoModelWithLMHead is deprecated in transformers; AutoModelForSeq2SeqLM is the
# supported auto class for encoder-decoder checkpoints such as T5.
t5_model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-emotion")



def get_emotion_t5(text):
    """Return the emotion label the fine-tuned T5 model generates for ``text``.

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    str
        The decoded label with special tokens (such as ``<pad>``) removed,
        so callers do not need to strip a ``'<pad> '`` prefix afterwards.
    """
    input_ids = t5_tokenizer.encode(text, return_tensors='pt')
    # max_length=2 keeps generation to the decoder start token plus one
    # generated token, which is where the label is read from.
    output = t5_model.generate(input_ids=input_ids, max_length=2)

    # Only the first (and only) sequence is needed. skip_special_tokens drops
    # the leading "<pad>" that would otherwise be glued onto the label.
    label = t5_tokenizer.decode(output[0], skip_special_tokens=True)
    return label.strip()



In [5]:
# Label every cleaned reply tweet with the T5 model (one model call per row).
emotion_t5 = df['cleaned_reply_tw'].apply(get_emotion_t5)

In [6]:
# Store the T5 labels on the reply table as the `reply_emo_t5` column.
df['reply_emo_t5'] = emotion_t5

In [16]:
import re  # kept so `re` stays available to other cells that call it

# Remove the "<pad> " prefix left by the T5 decoder. The pattern has no regex
# metacharacters, so a literal vectorised replace is equivalent to re.sub here,
# avoids a Python-level lambda per row, and leaves missing values untouched
# instead of raising. Re-running this cell is a no-op.
df['reply_emo_t5'] = df['reply_emo_t5'].str.replace('<pad> ', '', regex=False)
df.reply_emo_t5

0           anger
1           anger
2           anger
3           anger
4             joy
           ...   
21751         joy
21752       anger
21753     sadness
21754    surprise
21755       anger
Name: reply_emo_t5, Length: 21756, dtype: object

In [8]:
# Label the cleaned source tweets with the T5 model.
# astype(str) guarantees the model receives strings; note that a missing value
# is turned into the literal text 'nan' and gets classified like any other input.
src_emo_t5 = src_tw_df['cleaned_src_tw'].astype(str).apply(get_emotion_t5)
src_tw_df['src_emo_t5'] = src_emo_t5

In [17]:
# Remove the "<pad> " prefix left by the T5 decoder. The pattern is a plain
# literal, so a vectorised non-regex replace gives the same result as re.sub
# without a per-row lambda, and it tolerates missing values.
src_tw_df['src_emo_t5'] = src_tw_df['src_emo_t5'].str.replace('<pad> ', '', regex=False)

In [None]:
# Bar chart of how often each value occurs in the `reply_emotion` column,
# with every bar annotated by its own height.
emotion_counts = df['reply_emotion'].value_counts()
ax = emotion_counts.plot.bar()

for bar in ax.patches:
    bar_height = bar.get_height()
    x_center = bar.get_x() + bar.get_width() / 2
    ax.text(x_center, bar_height, str(bar_height), ha="center", va="bottom")

plt.xticks(rotation=0)
plt.show()
# NOTE(review): if the export below is re-enabled, it likely needs to run before
# plt.show(), otherwise the saved figure may be empty.
#plt.savefig('D:/論文/PHEME9/Code/graph/ferguson/rep-emotion.pdf',dpi=300)

### roBERTa-base model
<https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion>

In [9]:
import numpy as np
from scipy.special import softmax

# The checkpoint id is derived from the task name.
task = 'emotion'
MODEL = "cardiffnlp/twitter-roberta-base-" + task

# Tokenizer and sequence-classification model loaded from the same checkpoint.
ro_tokenizer = AutoTokenizer.from_pretrained(MODEL)
ro_model = AutoModelForSequenceClassification.from_pretrained(MODEL)


In [10]:
def _preprocess_tweet(txt):
    """Mask user mentions as '@user' and links as 'http', token by token."""
    masked = []
    for token in txt.split(" "):
        if token.startswith('@') and len(token) > 1:
            # A bare "@" is left alone; only real mentions are masked.
            token = '@user'
        elif token.startswith('http'):
            token = 'http'
        masked.append(token)
    return " ".join(masked)


def get_emotion_roBERT(text):
    """Return the top-scoring emotion label for ``text`` from the RoBERTa model.

    Parameters
    ----------
    text : str
        Tweet text. Mentions and links are masked before tokenisation.

    Returns
    -------
    str
        One of ``'anger'``, ``'joy'``, ``'optimism'``, ``'sadness'``.
    """
    # NOTE(review): assumed to follow the checkpoint's class-index order;
    # confirm against ro_model.config.id2label.
    labels = ['anger','joy','optimism','sadness']

    text = _preprocess_tweet(text)
    # Truncate explicitly: RoBERTa-base accepts at most 512 tokens and a longer
    # input would otherwise fail inside the model.
    encoded_input = ro_tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    # Inference only, so skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = ro_model(**encoded_input)

    # First output is the logits; take the single sequence in the batch.
    logits = output[0][0].numpy()
    # Softmax is monotonic, so the arg-max of the raw logits is the same class
    # that would have the highest probability.
    return labels[int(np.argmax(logits))]

In [11]:
# Label every cleaned reply tweet with the RoBERTa model (one model call per row).
emotion_ro = df['cleaned_reply_tw'].apply(get_emotion_roBERT)

In [19]:
# Store the RoBERTa labels on the reply table, then write the whole table
# (without the index column) to a new CSV file.
df['reply_emo_ro'] = emotion_ro
df.to_csv(r'D:\論文\PHEME9\Data\CSV\ferguson-df-2.csv',index=False)

In [14]:
# Label the cleaned source tweets with the RoBERTa model.
# astype(str) guarantees string input; a missing value becomes the text 'nan'.
src_emo_ro = src_tw_df['cleaned_src_tw'].astype(str).apply(get_emotion_roBERT)
src_tw_df['src_emo_ro'] = src_emo_ro

In [18]:
# Write the source-tweet table, including the columns added above, to a new
# CSV file without the index column.
src_tw_df.to_csv(r'D:\論文\PHEME9\Data\CSV\ferguson-src-2.csv',index=False)

In [None]:
# Compare labels for the raw text of the reply at index label 34: both models,
# plus the value already stored in the `reply_emotion` column.
raw_reply = df.loc[34, 'reply_tweet']
print('roBERT based: ', get_emotion_roBERT(raw_reply))
print('t5 based: ', get_emotion_t5(raw_reply))
print('lexicon based: ', df.loc[34, 'reply_emotion'])

In [None]:
# Same comparison as for the raw text, but feeding the models the cleaned
# version of the reply at index label 34.
cleaned_reply = df.loc[34, 'cleaned_reply_tw']
print('roBERT based: ', get_emotion_roBERT(cleaned_reply))
print('t5 based: ', get_emotion_t5(cleaned_reply))
print('lexicon based: ', df.loc[34, 'reply_emotion'])

In [None]:
# Show the raw and cleaned text of the reply at index label 34 side by side.
sample_row = df.loc[34]
print("reply tweet: ", sample_row['reply_tweet'])
print("cleaned reply tweet: ", sample_row['cleaned_reply_tw'])