In [16]:
import glob
import pandas as pd
import preprocessor as p
import nltk
import re
import emojis
import csv
import collections
import json
from ekphrasis.classes.segmenter import Segmenter
from textblob import TextBlob, Word
from nltk import word_tokenize
from nltk.corpus import stopwords
# Rebind the name `stopwords` from the imported NLTK corpus module to its
# English stop-word collection; after this line the module itself is no
# longer reachable under this name.
# NOTE(review): the 'stopwords' corpus is not downloaded in this cell -
# confirm it is already installed on the machine running the notebook.
stopwords = stopwords.words('english')
# Segmenter built on the 'twitter' corpus; used further down to split
# hashtag text via seg.segment().
seg = Segmenter(corpus='twitter')

# Fetch the NLTK resources (presumably required by the TextBlob POS tagging
# and lemmatization used below - TODO confirm).
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

Reading twitter - 1grams ...
Reading twitter - 2grams ...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\John8\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\John8\AppData\Roaming\nltk_data...


True

In [2]:
# Parse the JSON-formatted emotion lexicon stored in emotion_dicts.txt
with open('emotion_dicts.txt', 'r') as lexicon_file:
    emotion_dict = json.load(lexicon_file)

In [3]:
#Split the emotion corpus into one word -> score lexicon per emotion
(anger_dict, anticipation_dict, disgust_dict, fear_dict,
 joy_dict, sadness_dict, surprise_dict, trust_dict) = (
    emotion_dict[emotion]
    for emotion in ('anger', 'anticipation', 'disgust', 'fear',
                    'joy', 'sadness', 'surprise', 'trust')
)

In [4]:
#Collect the segmented text of every hashtag in a tweet
def extract_hashtag_text(tweet):
    """Return the segmented text of all hashtags found in ``tweet``.

    Every ``#tag`` (a ``#`` followed by word characters) is passed through
    the module-level segmenter ``seg``; the segmented strings are
    concatenated, each followed by a single space. A tweet without
    hashtags yields an empty string.
    """
    hashtags = re.findall(r'#(\w+)', tweet)
    return "".join(seg.segment(tag) + " " for tag in hashtags)

In [5]:
#Turn the emoji in a tweet into plain words
def extract_emoji_text(tweet):
    """Return the textual names of the emoji found in ``tweet``.

    Each emoji reported by ``emojis.get`` is decoded with ``emojis.decode``
    (presumably to an alias of the form ``:some_name:``), the colons are
    removed and underscores become spaces. The names are concatenated,
    each followed by a single space; no emoji gives an empty string.
    """
    names = []
    for symbol in emojis.get(tweet):
        # Strip the ':' delimiters first, then turn '_' into word breaks
        name = re.sub(r'_', ' ', re.sub(r':', '', emojis.decode(symbol)))
        names.append(name + " ")
    return "".join(names)

In [6]:
#Strip noise from a raw tweet before tokenization
def tweets_cleaner(tweet):
    """Return a lowercase version of ``tweet`` with noise removed.

    The tweet first goes through ``p.clean`` (tweet-preprocessor; per the
    original note it removes URLs, hashtags and user mentions), then
    digits and every character that is neither a word character nor
    whitespace are deleted, and the result is lowercased.
    """
    stripped = p.clean(tweet)
    # Same two substitutions as before, applied in the same order:
    # digits first, then punctuation/symbols.
    for pattern in (r'[0-9]*', r'[^\w\s]'):
        stripped = re.sub(pattern, '', stripped)
    return stripped.lower()

In [7]:
#Lemmatize a cleaned tweet and drop stop words
def lemmatization_without_stopwords(semiclean_tweet):
    """Return the lemmas of the non-stop-word tokens in ``semiclean_tweet``.

    The text is tokenized and POS-tagged by TextBlob. The first letter of
    each tag selects the lemmatizer POS ('a', 'n', 'v' or 'r'), falling
    back to noun ('n') for any other tag. Tokens found in the module-level
    ``stopwords`` collection are skipped.
    """
    pos_to_lemma_pos = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
    blob = TextBlob(semiclean_tweet)
    # Pair every token with the POS code expected by lemmatize()
    tagged = [(token, pos_to_lemma_pos.get(pos[0], 'n')) for token, pos in blob.tags]
    return [token.lemmatize(lemma_pos)
            for token, lemma_pos in tagged
            if token not in stopwords]

In [8]:
#Build the cleaned word list of every tweet
def get_clean_text(raw_daily_data):
    """Return one list of lemmas per entry of ``raw_daily_data["Tweet"]``.

    For each tweet the lemmas of its cleaned body text, of its emoji names
    and of its segmented hashtags are concatenated, in that order.
    """
    def _tweet_tokens(tweet):
        body = lemmatization_without_stopwords(tweets_cleaner(tweet))
        emoji = lemmatization_without_stopwords(extract_emoji_text(tweet))
        hashtags = lemmatization_without_stopwords(extract_hashtag_text(tweet))
        return body + emoji + hashtags

    return [_tweet_tokens(tweet) for tweet in raw_daily_data["Tweet"]]

In [9]:
#Get the emotion score of each tweet
def get_emotion_score(dataframe, clean_text, lexicons=None):
    """Add one ``<emotion>_score`` column per emotion to ``dataframe`` in place.

    A tweet's score for an emotion is the sum, over its tokens, of the
    token's lexicon weight (0 for tokens missing from the lexicon), so a
    token that occurs several times is counted every time.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to receive the score columns; must have one row per entry of
        ``clean_text``.
    clean_text : list of list of str
        Token list of each tweet, in row order.
    lexicons : dict, optional
        Mapping ``emotion name -> {word: weight}``. Columns are created in
        the mapping's iteration order. When omitted, the eight notebook-level
        lexicons (``anger_dict`` ... ``trust_dict``) are used, producing the
        columns anger, anticipation, disgust, fear, joy, sadness, surprise
        and trust, in that order.

    Returns
    -------
    None
        The result is written into ``dataframe``.
    """
    if lexicons is None:
        # Resolved lazily so the notebook-level dicts are only required
        # when the caller does not supply its own lexicons.
        lexicons = {
            'anger': anger_dict,
            'anticipation': anticipation_dict,
            'disgust': disgust_dict,
            'fear': fear_dict,
            'joy': joy_dict,
            'sadness': sadness_dict,
            'surprise': surprise_dict,
            'trust': trust_dict,
        }

    # Count each tweet's tokens once and reuse the counts for every emotion
    # (previously the Counter was rebuilt for each of the eight emotions).
    token_counts = [collections.Counter(tokens) for tokens in clean_text]

    for emotion, lexicon in lexicons.items():
        scores = []
        for counts in token_counts:
            # Plain left-to-right accumulation keeps the floating-point
            # result identical to the original per-emotion loops.
            total = 0
            for word, freq in counts.items():
                total += lexicon.get(word, 0) * freq
            scores.append(total)
        dataframe[emotion + '_score'] = scores

In [27]:
#Get the emotion score of the day
def get_daily_emotion(emotion_df, like_weight=0.01):
    """Return the like-weighted total of each emotion over all tweets.

    Every tweet contributes ``(1 + like_weight * like) * score`` to each
    emotion total, i.e. with the default weight each like adds 1% of the
    tweet's score.

    Parameters
    ----------
    emotion_df : pandas.DataFrame
        Must contain a ``like`` column and the eight ``<emotion>_score``
        columns.
    like_weight : float, optional
        Extra weight contributed by a single like (default 0.01).

    Returns
    -------
    tuple of float
        Totals in the order anger, anticipation, disgust, fear, joy,
        sadness, surprise, trust. An empty frame gives all zeros.

    Notes
    -----
    The totals are computed column-wise instead of with per-row ``iloc``
    lookups; because the summation order differs, results may deviate from
    a sequential Python loop in the last few floating-point digits.
    """
    emotions = ('anger', 'anticipation', 'disgust', 'fear',
                'joy', 'sadness', 'surprise', 'trust')

    # One weight per tweet, shared by all eight emotions
    tweet_weights = 1 + like_weight * emotion_df['like']

    # skipna=False: a missing like/score still turns the total into NaN,
    # as it did with the explicit accumulation loop.
    return tuple(
        float((tweet_weights * emotion_df[emotion + '_score']).sum(skipna=False))
        for emotion in emotions
    )

In [11]:
#Keep only the tweets that express at least one emotion
def get_available_emotion(df):
    """Return the rows of ``df`` whose emotion scores are not all zero.

    A row is kept when at least one of its eight ``<emotion>_score``
    columns differs from 0.
    """
    score_columns = [
        'anger_score', 'anticipation_score', 'disgust_score', 'fear_score',
        'joy_score', 'sadness_score', 'surprise_score', 'trust_score',
    ]
    has_emotion = (df[score_columns] != 0).any(axis=1)
    return df[has_emotion]

### Testing (Original)

In [12]:
# Run configuration for the batch cell below: `month` is the sub-folder name
# under the coin's tweet directory and the prefix of the report file name;
# `coin` selects the '#<coin>' data directory.
month= '1'
coin='Bitcoin'

In [None]:
# Build the monthly emotion report: one row per daily CSV file found in the
# coin/month input folder.
data_input_path = 'E:\\project_data\\tweet_data\\#'+coin+'\\'+ month

# glob returns files in an unspecified order; sort so the report rows come
# out in a stable (lexicographic by file name) order.
filenames = sorted(glob.glob(data_input_path + '\\*.csv'))

report_columns = ['date','volume','available_volume','anger_score','anticipation_score','disgust_score','fear_score','joy_score','sadness_score','surprise_score','trust_score']

# Collect the daily rows in a plain list and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
daily_records = []

for filename in filenames:
    daily_data = pd.read_csv(filename, index_col = None, header = 0)

    if daily_data.empty:
        # No first row to read the date from and nothing to score
        print(filename + ' (skipped: no tweets)')
        continue

    #Get date: first 10 characters of the first row's 'created_at' value
    #(presumably a YYYY-MM-DD prefix - TODO confirm against the input files)
    date = daily_data['created_at'].iloc[0][0:10]

    clean_text = get_clean_text(daily_data)

    #Get sentiment score for each tweet of the day (adds columns in place)
    get_emotion_score(daily_data ,clean_text)

    #Get the total sentiment score of the day
    anger_score,anticipation_score,disgust_score,fear_score,joy_score,sadness_score,surprise_score,trust_score = get_daily_emotion(daily_data)

    #Tweets volume
    volume=len(daily_data)

    #Available tweets volume (tweets with at least one non-zero emotion score)
    available_volume=len(get_available_emotion(daily_data))

    daily_records.append({'date' : date , 'volume' :volume , 'available_volume': available_volume,
                          'anger_score' : anger_score , 'anticipation_score' : anticipation_score ,
                          'disgust_score': disgust_score , 'fear_score' : fear_score ,
                          'joy_score' : joy_score , 'sadness_score': sadness_score,
                          'surprise_score' : surprise_score , 'trust_score': trust_score})
    print(filename)

# Passing `columns` keeps the header (and column order) even when no file
# produced a row.
daily_emotion = pd.DataFrame(daily_records, columns=report_columns)

#Output the emotion report of the current month
daily_emotion.to_csv('E:\\project_data\\tweet_data\\#'+coin+'\\report\\'+month+"Report.csv",encoding='utf-8',index=False)

### Testing (Modified)

In [28]:
# Load the sample tweets; index_col=0 makes the first CSV column the row index.
# NOTE(review): `daliy_data` looks like a typo for `daily_data`; the name is
# left unchanged because the following cells reference it.
daliy_data = pd.read_csv('tweets.csv', index_col=0)
# One list of lemmas per tweet (body text + emoji names + hashtags)
clean_text = get_clean_text(daliy_data)
# Adds the eight <emotion>_score columns to daliy_data in place
get_emotion_score(daliy_data ,clean_text)

In [29]:
# Display the scored tweets together with their new <emotion>_score columns
daliy_data

Unnamed: 0,Date,Tweet,like,retweet,reply,anger_score,anticipation_score,disgust_score,fear_score,joy_score,sadness_score,surprise_score,trust_score
0,2022-11-30 23:59:58+00:00,@DashDobrofsky Soon all those people who belie...,0,0,0,1.762291,3.919842,0.975390,4.050166,1.407615,0.575600,2.864855,2.895408
1,2022-11-30 23:59:56+00:00,@marlon_humphrey How's the Bitcoin going for y...,2,0,0,0.150243,1.211380,0.081535,0.039826,0.000000,0.000000,0.000000,0.312855
2,2022-11-30 23:59:51+00:00,https://t.co/cOIk7u7kY3\nUS CFTC commissioner ...,6,0,0,0.587202,1.407864,0.475431,0.608304,0.289417,0.655437,1.860658,0.713978
3,2022-11-30 23:59:47+00:00,Bitcoin consumes energy on the scale of entire...,0,0,0,1.093080,2.375762,0.219344,0.866195,2.786284,0.827080,0.439704,0.000000
4,2022-11-30 23:59:47+00:00,@Todd_Toddleston @Arcanineties I didn't say th...,0,0,1,1.449523,2.246428,1.596995,1.728888,0.145413,0.442848,0.900368,0.374857
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2022-11-30 18:46:45+00:00,#Ethereum price update: \n\n#ETH $1272.21 USD\...,1,0,0,1.484547,1.404695,0.000000,0.000000,0.000000,0.249774,0.000000,0.083404
9996,2022-11-30 18:46:45+00:00,@ragz2crypto As I remember when Bitcoin want t...,2,0,2,0.538475,0.000000,0.835912,0.099790,0.000000,0.408701,0.193291,0.450845
9997,2022-11-30 18:46:38+00:00,Bitcoin and Ethereum show signs of a resurgenc...,0,0,0,0.000000,0.667242,0.000000,0.000000,0.000000,0.000000,0.300839,0.000000
9998,2022-11-30 18:46:37+00:00,@tbrandall33 @LightHarmonious @WatcherGuru Bit...,0,0,1,0.209599,0.650404,0.169452,0.569501,0.000000,0.664927,0.000000,0.000000


In [34]:
# Number of tweets with at least one non-zero emotion score
len(get_available_emotion(daliy_data))

9848

In [35]:
# Like-weighted total of each emotion over all loaded tweets
anger_score,anticipation_score,disgust_score,fear_score,joy_score,sadness_score,surprise_score,trust_score = get_daily_emotion(daliy_data)

In [36]:
# Show the totals in the order anger, anticipation, disgust, fear, joy, sadness, surprise, trust
anger_score,anticipation_score,disgust_score,fear_score,joy_score,sadness_score,surprise_score,trust_score

(9551.758661287771,
 13657.278563124115,
 6588.80182098704,
 8139.579406207774,
 7635.810692083374,
 4326.253232588365,
 7658.73602749644,
 8695.228075828894)