<a href="https://colab.research.google.com/github/yuchen74/Colab-backup/blob/main/4_Emotion_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset: *charliehebdo*

In [None]:
import numpy as np
import pandas as pd
import datetime 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv(r'D:\論文\PHEME9\Data\CSV\charliehebdo.csv', encoding = 'utf-8', header = None)
df.columns = ['src_tweet_id', 'src_user_id', 'src_tweet','src_date','reply_tweet_id','reply_user_id','reply_tweet','reply_date', 'label']
df.reply_tweet = df.reply_tweet.replace(np.nan,"")

In [None]:
src_tw_df = df.iloc[:,[0,1,2,3,8]].copy()
src_tw_df = src_tw_df.drop_duplicates()
src_tw_df.reset_index(inplace=True,drop=True)
src_tw_df

Unnamed: 0,src_tweet_id,src_user_id,src_tweet,src_date,label
0,552784600502915072,331658004,Charlie Hebdo became well known for publishing...,Wed Jan 07 11:11:33 +0000 2015,0
1,552784898743099392,465973,Charlie Hebdo’s Last Tweet Before Shootings ht...,Wed Jan 07 11:12:44 +0000 2015,0
2,552785391653494784,15798091,Prediction: the #CharlieHebdo massacre will no...,Wed Jan 07 11:14:42 +0000 2015,0
3,552786116404072448,465973,10:28am Charlie Hebdo account mocks ISIS leade...,Wed Jan 07 11:17:35 +0000 2015,0
4,552786299875520512,144301368,If your faith isn't strong enough to cope with...,Wed Jan 07 11:18:18 +0000 2015,0
...,...,...,...,...,...
1997,553590653784195072,380648579,#BREAKING Paris supermarket hostage-taker 'neu...,Fri Jan 09 16:34:31 +0000 2015,1
1998,553590721207615488,5402612,Hostages held at kosher supermarket in eastern...,Fri Jan 09 16:34:47 +0000 2015,1
1999,553590835850514433,7587032,"Hostage-taker in supermarket siege killed, rep...",Fri Jan 09 16:35:15 +0000 2015,1
2000,553590891852886019,18424289,French media reports two suspects of #CharlieH...,Fri Jan 09 16:35:28 +0000 2015,1


# Preprocessing
 - lowercase
 - remove hashtag
 - remove urls
 - remove punctuation
 - remove userhandles
 - remove multiple spaces


In [None]:
import nltk
from nltk.corpus import stopwords 

## Define function to process text at one time

In [None]:
def textprocessing(text):
    text = text.lower()

    text = " ".join([word for word in text.split() if 'http' not in word
                                and not word.startswith('@')])
                                #and word != 'RT'])
    import re
    def remove_punct(tweet):
        new_words = []
        for word in tweet:
            w = re.sub(r'[^\w\s]','',word) #remove everything except words and space
            w = re.sub(r'_','',w) #how to remove underscore as well
            new_words.append(w)

        return new_words
    text = "".join(remove_punct(text))
        
    from nltk.stem.porter import PorterStemmer
    stem = PorterStemmer()
    
    from nltk.tokenize import TweetTokenizer
    tknzr = TweetTokenizer(strip_handles=True)
    text = tknzr.tokenize(text)
    
    from nltk.corpus import stopwords
    stop_words=set(stopwords.words("english"))
    def filterstopwords(tw):
        filter_stopwords = []
        for w in tw:
            if w not in stop_words:
                filter_stopwords.append(w)
        return filter_stopwords
    text = filterstopwords(text)
    text = " ".join(text)
    
    return text

In [None]:
print("original text: ", df.reply_tweet.iloc[20])
print("cleaned text: ", textprocessing(df.reply_tweet.iloc[20]))

original text:  @lj_kulwicki @GabTarquini @BBCDanielS @BBCWorld We're not allowed to commit heinous crimes, especially in the name of God.
cleaned text:  allowed commit heinous crimes especially name god


## Process data
### applied on original data (**df**)

In [None]:
df['cleaned_reply_tw'] = df.reply_tweet.apply(textprocessing)
df['cleaned_src_tw'] = df.src_tweet.apply(textprocessing)

In [None]:
df.head()

Unnamed: 0,src_tweet_id,src_user_id,src_tweet,src_date,reply_tweet_id,reply_user_id,reply_tweet,reply_date,label,cleaned_reply_tw,cleaned_src_tw
0,552784600502915072,331658004,Charlie Hebdo became well known for publishing...,Wed Jan 07 11:11:33 +0000 2015,552785249420447745,18370911,"Now 10 dead in a shooting there today RT ""@BBC...",Wed Jan 07 11:14:08 +0000 2015,0,10 dead shooting today rt bbcdaniels charlie h...,charlie hebdo became well known publishing muh...
1,552784600502915072,331658004,Charlie Hebdo became well known for publishing...,Wed Jan 07 11:11:33 +0000 2015,552786761534144512,2806109387,@BBCDanielS @BBCWorld I'm guessing this is bei...,Wed Jan 07 11:20:08 +0000 2015,0,im guessing considered terrorism right lone wolf,charlie hebdo became well known publishing muh...
2,552784600502915072,331658004,Charlie Hebdo became well known for publishing...,Wed Jan 07 11:11:33 +0000 2015,552786803884060672,146142164,@BBCDanielS @BBCWorld why would you mention th...,Wed Jan 07 11:20:18 +0000 2015,0,would mention knowing facts islamphobiaatitsbest,charlie hebdo became well known publishing muh...
3,552784600502915072,331658004,Charlie Hebdo became well known for publishing...,Wed Jan 07 11:11:33 +0000 2015,552786954656710656,940853760,@BBCDanielS @BBCWorld perps identified?,Wed Jan 07 11:20:54 +0000 2015,0,perps identified,charlie hebdo became well known publishing muh...
4,552784600502915072,331658004,Charlie Hebdo became well known for publishing...,Wed Jan 07 11:11:33 +0000 2015,552787979224092672,2888783532,@BBCDanielS @BBCWorld who is charlie hebdo?,Wed Jan 07 11:24:59 +0000 2015,0,charlie hebdo,charlie hebdo became well known publishing muh...


### applied on Source Tweet (**src_tw_df**)

In [None]:
src_tw_df['cleaned_src_tw'] = src_tw_df.src_tweet.apply(textprocessing)

# Sentiment Analysis

## Vader 

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
def get_sentiment(tweet):
    analyzer = SentimentIntensityAnalyzer()
    vs = analyzer.polarity_scores(tweet)
    if vs['compound'] >=0.05:
        sentiment_label = 'Positive'
    elif (vs['compound'] > -0.05) & (vs['compound'] < 0.05):
        sentiment_label = 'Neutral'
    elif vs['compound']<= -0.05:
        sentiment_label = 'Negative' 
    result = sentiment_label
    return result

In [None]:
df['src_sentiment']= df.cleaned_src_tw.apply(get_sentiment)
df['reply_sentiment']= df.cleaned_reply_tw.apply(get_sentiment)

In [None]:
src_tw_df['src_sentiment']=  src_tw_df['cleaned_src_tw'].apply(get_sentiment)

In [None]:
df.to_csv(r'D:\論文\PHEME9\Data\CSV\charliehebdo-df.csv',index=False)
src_tw_df.to_csv(r'D:\論文\PHEME9\Data\CSV\charliehebdo-src.csv',index=False)

## textblob

In [None]:
# from textblob import TextBlob

'from textblob import TextBlob'

In [None]:
# def get_sentiment(text):
#     blob = TextBlob(text)
#     sentiment_polarity = blob.sentiment.polarity
#     if sentiment_polarity > 0:
#         sentiment_label = 'Positive'
#     elif sentiment_polarity < 0:
#         sentiment_label = 'Negative'
#     else:
#         sentiment_label = 'Neutral'
#     result = sentiment_label
#     return result

## Visualization

In [None]:
print(df[df.label ==1]['reply_sentiment'].value_counts())
plt.figure(figsize=(8,6))
sns.countplot(x='reply_sentiment',data= df[df['label'] == 1])

plt.suptitle('Sentiment analysis on Reply tweets under Catergory Rumour',fontsize=16)
plt.title('Reply tweets show more positive attitude towards rumour tweet',fontsize=12,color='grey')
plt.savefig('graph/charliehebdo/senti_reply_rumour.pdf',dpi=300)

In [None]:
print(df[df.label ==0]['reply_sentiment'].value_counts())
plt.figure(figsize=(8,6))
sns.countplot(x='reply_sentiment',data= df[df['label'] == 0])

plt.suptitle('Sentiment analysis on Reply tweets under Catergory Non-Rumour',fontsize=16)
plt.title('Reply tweets show more neutrual attitude towards non-rumour tweet',fontsize=12,color='grey')
plt.savefig('graph/charliehebdo/senti_reply_nonrumour.pdf',dpi=300)

# Emotion Detection

## NRCLex

In [None]:
from nltk.tokenize import word_tokenize,TweetTokenizer

In [None]:
from google.colab import files
uploaded = files.upload()

Saving charliehebdo-df.csv to charliehebdo-df.csv
Saving charliehebdo-src.csv to charliehebdo-src.csv


In [None]:
df = pd.read_csv('charliehebdo-df.csv', encoding = 'utf-8', header=0)
df.info

<bound method DataFrame.info of              src_tweet_id  src_user_id  ... src_sentiment reply_sentiment
0      552784600502915072    331658004  ...      Positive        Negative
1      552784600502915072    331658004  ...      Positive        Negative
2      552784600502915072    331658004  ...      Positive         Neutral
3      552784600502915072    331658004  ...      Positive         Neutral
4      552784600502915072    331658004  ...      Positive         Neutral
...                   ...          ...  ...           ...             ...
36184  553591259672379392     87416722  ...      Negative        Negative
36185  553591259672379392     87416722  ...      Negative        Negative
36186  553591259672379392     87416722  ...      Negative        Negative
36187  553591259672379392     87416722  ...      Negative        Negative
36188  553591259672379392     87416722  ...      Negative         Neutral

[36189 rows x 13 columns]>

In [None]:
src_tw_df = pd.read_csv('charliehebdo-src.csv',encoding='utf-8',header=0)
src_tw_df.shape

(2002, 7)

In [None]:
from nrclex import NRCLex
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [45]:
def get_emotion(tweet):
  score = NRCLex(tweet)
  if len(score.top_emotions)>1:
    emo = [i[0] for i in score.top_emotions]
    emo 
  else:
    emo = score.top_emotions[0][0]
  emotion = emo
  return emotion

  

In [42]:
get_emotion(src_tw_df.cleaned_src_tw[00])


['fear',
 'anger',
 'anticip',
 'trust',
 'surprise',
 'positive',
 'negative',
 'sadness',
 'disgust',
 'joy']

In [46]:
src_tw_df['emotion'] = src_tw_df.cleaned_src_tw.apply(get_emotion)

In [48]:
src_tw_df.info

<bound method DataFrame.info of             src_tweet_id  ...                                            emotion
0     552784600502915072  ...  [fear, anger, anticip, trust, surprise, positi...
1     552784898743099392  ...  [fear, anger, anticip, trust, surprise, positi...
2     552785391653494784  ...  [fear, anger, positive, negative, sadness, dis...
3     552786116404072448  ...                                              trust
4     552786299875520512  ...                                           negative
...                  ...  ...                                                ...
1997  553590653784195072  ...  [fear, anger, anticip, trust, surprise, positi...
1998  553590721207615488  ...                                               fear
1999  553590835850514433  ...  [fear, anger, anticip, trust, surprise, positi...
2000  553590891852886019  ...                            [fear, anger, negative]
2001  553591259672379392  ...  [fear, anger, anticip, trust, surprise, positi

## text2emotion package
https://snyk.io/advisor/python/text2emotion 

In [None]:
import text2emotion as te

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
emotion = df.cleaned_reply_tw.apply(te.get_emotion).cuda()
emotion = pd.json_normalize(emotion)

In [None]:
emotion.apply(lambda x: max(x,key=x.get))

##  pysentimiento 

In [None]:
from pysentimiento import EmotionAnalyzer

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


In [None]:
emotion_analyzer = EmotionAnalyzer(lang="en")
emo=df.cleaned_reply_tw.apply(emotion_analyzer.predict)

In [None]:
emo.head()

In [None]:
emo[1]

In [None]:
emo.tail()

In [None]:
df.reply_tweet[0]

In [None]:
from pysentimiento.preprocessing import preprocess_tweet

In [None]:
temp = df.reply_tweet.apply(preprocess_tweet)
emo=temp.apply(emotion_analyzer.predict)

In [None]:
emo[0]