In [19]:
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language
from flair.models import TextClassifier
from flair.data import Sentence
import emoji
import plotly.express as px

In [2]:
# Translation table for str.translate(): maps typographic quotes, the acute
# accent and dash characters to plain ASCII quotes/hyphens (code point -> code point).
CONVERT_QUOTES = str.maketrans(u"‘’´“”–-", u"'''\"\"--")
def give_emoji_free_text(text):
    """Return ``text`` with every emoji removed.

    Args:
        text (str): Message text, possibly containing emoji.

    Returns:
        str: ``text`` with each emoji replaced by the empty string.

    Raises:
        UnicodeEncodeError: If ``text`` cannot be encoded as UTF-8.
    """
    # UTF-8 round trip kept from the original implementation so that input
    # which cannot be encoded still fails here, as it did before.
    text = text.encode(encoding='utf-8').decode('utf8')
    # emoji.get_emoji_regexp() was removed in emoji 2.0; replace_emoji() is the
    # supported API. Fall back to the regexp only on old releases lacking it.
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub(r'', text)
def is_english(s):
    """Report whether ``s`` consists solely of ASCII characters.

    Despite the name, no language detection happens here: the text is
    accepted exactly when its UTF-8 encoding also decodes as ASCII.

    Returns:
        bool: True for pure-ASCII text (including the empty string),
        False otherwise.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
        return True
    except UnicodeDecodeError:
        return False
#         doc = nlp(s)
#         detect_language = doc._.language
#         if detect_language['language'] != 'en':
#             return False
#         else:
#             return True
def has_shib_doge(s):
    """Tell whether ``s`` contains "shib" or "doge", ignoring case."""
    lowered = s.lower()
    return any(keyword in lowered for keyword in ("shib", "doge"))
def combine_sentence(message_list):
    """Concatenate the plain-string items of ``message_list`` in order.

    Items whose type is not exactly ``str`` are skipped.

    Returns:
        str: The joined text, or an empty string if nothing qualifies.
    """
    return "".join(part for part in message_list if type(part) == str)

In [3]:
def preprocess(messages):
    """Build a DataFrame of cleaned messages that mention shib/doge.

    Args:
        messages: Iterable of mappings, each providing a ``'text'`` entry
            (a string, or a collection of fragments that is flattened with
            ``combine_sentence``) and a ``'date'`` entry of which the first
            ten characters are kept.

    Returns:
        pandas.DataFrame: Columns ``text`` and ``date`` holding the messages
        that pass both ``has_shib_doge`` and ``is_english`` after cleaning.
    """
    raw_texts = []
    raw_dates = []
    for message in messages:
        raw_texts.append(message['text'])
        raw_dates.append(message['date'][:10])

    kept_texts = []
    kept_dates = []
    for raw, day in tqdm(zip(raw_texts, raw_dates)):
        # Non-string payloads are reduced to their plain-string parts.
        flattened = raw if type(raw) == str else combine_sentence(raw)
        # Strip emoji first, then fold typographic quotes/dashes to ASCII.
        cleaned = give_emoji_free_text(flattened).translate(CONVERT_QUOTES)
        if not (has_shib_doge(cleaned) and is_english(cleaned)):
            continue
        kept_texts.append(cleaned)
        kept_dates.append(day)

    df = pd.DataFrame()
    df['text'] = kept_texts
    df['date'] = kept_dates
    return df

In [4]:
def flair_prediction(nlp, x):
    """Classify ``x`` with ``nlp`` and map the first label to a short tag.

    Args:
        nlp: Object exposing ``predict(sentence)``; it is expected to attach
            labels to the sentence it is given.
        x: Text wrapped in a ``Sentence`` before prediction.

    Returns:
        str: ``"pos"`` if the first label's string form contains "POSITIVE",
        ``"neg"`` if it contains "NEGATIVE", otherwise ``"neu"``.
    """
    sentence = Sentence(x)
    nlp.predict(sentence)
    label_text = str(sentence.labels[0])
    for marker, tag in (("POSITIVE", "pos"), ("NEGATIVE", "neg")):
        if marker in label_text:
            return tag
    return "neu"
    
def flair_prediction_value(nlp, x):
    """Return the ``score`` attribute of the first label predicted for ``x``.

    Args:
        nlp: Object exposing ``predict(sentence)``.
        x: Text wrapped in a ``Sentence`` before prediction.
    """
    sentence = Sentence(x)
    nlp.predict(sentence)
    return sentence.labels[0].score

def _flair_tag_and_score(nlp, x):
    """Run ``nlp`` once on ``x`` and return ``(tag, score)`` for the first label."""
    sentence = Sentence(x)
    nlp.predict(sentence)
    label = sentence.labels[0]
    label_text = str(label)
    if "POSITIVE" in label_text:
        tag = "pos"
    elif "NEGATIVE" in label_text:
        tag = "neg"
    else:
        tag = "neu"
    return tag, label.score


def sentiment_analysis(sentiment_nlp, df, out_filename):
    """Label every message in ``df`` and write the result to a CSV file.

    Args:
        sentiment_nlp: Classifier exposing ``predict(sentence)``.
        df (pandas.DataFrame): Frame with a ``text`` column. It is modified
            in place: ``sentiment`` and ``score`` columns are added.
        out_filename: Destination passed to ``DataFrame.to_csv`` (written
            with header and index).

    Returns:
        pandas.DataFrame: The same ``df`` object, now with the two new columns.
    """
    # One model call per message. Previously the classifier was run twice on
    # every row (once for the tag, once for the score), doubling inference time.
    predictions = [_flair_tag_and_score(sentiment_nlp, text) for text in df['text']]
    df['sentiment'] = [tag for tag, _ in predictions]
    df['score'] = [score for _, score in predictions]
    df.to_csv(out_filename, header=True, index=True)
    return df


In [5]:
# Load the exported chat. The file is opened explicitly as UTF-8 so that
# non-ASCII message text is decoded the same way on every platform instead
# of depending on the locale's default encoding.
filename = "messages.json"
with open(filename, "r", encoding="utf-8") as f:
    chatroom = json.load(f)
messages = chatroom['messages']

In [6]:
# Clean and filter the raw messages into a text/date frame.
df = preprocess(messages)
# Load the 'sentiment-fast' text classifier.
sentiment_nlp = TextClassifier.load('sentiment-fast')
out_filename = "messages_sentiment.csv"
# Adds 'sentiment' and 'score' columns to df and writes the frame to out_filename.
df = sentiment_analysis(sentiment_nlp, df, out_filename)

47232it [00:08, 5349.60it/s]

2021-12-18 11:14:34,763 loading file /Users/tiffanychang/.flair/models/sentiment-en-mix-ft-rnn_v8.pt





In [7]:
# Preview the first rows of the scored messages.
df.head()

Unnamed: 0,text,date,sentiment,score
0,Doge is going craY,2021-05-01,neg,0.792797
1,Sell target of doge,2021-05-01,neg,0.959932
2,Doge,2021-05-01,pos,0.536456
3,Dogecoin!!! Que hago?,2021-05-01,neg,0.753658
4,"Anyway, is doge a good crypto for long term in...",2021-05-01,neg,0.577099


In [15]:
# Per-day count of non-null values in each remaining column, indexed by date.
count_df = df.groupby(by=['date']).count()

In [59]:
def plot(df):
    """Display a bar chart of ``df`` with ``date`` on the x axis and ``text`` on the y axis."""
    bar_chart = px.bar(df, x="date", y="text")
    bar_chart.show()

In [26]:
# Count of non-null values per (date, sentiment) pair; the result carries a
# two-level index made of those keys.
count_df_posneg = df.groupby(by=['date', 'sentiment']).count()

In [60]:
# plot() accepts a single frame, so passing date_score as well raised a
# TypeError. The bar chart needs 'date' as a regular column; move it out of
# the index if that has not happened yet.
plot(count_df if 'date' in count_df.columns else count_df.reset_index())

In [23]:
# Move 'date' from the index into a regular column. Guarded so that
# re-running this cell does not keep stacking extra index columns.
if 'date' not in count_df.columns:
    count_df = count_df.reset_index()

In [46]:
from collections import defaultdict

In [49]:
# Share of positive messages per day: pos / (all labelled messages that day).
# Rows are read by column name (positional row[0]-style access on a labelled
# Series is deprecated in pandas 2.x). Totals are gathered first, so the
# result no longer depends on 'neg' rows preceding 'pos' rows, and a day
# with only negative messages yields 0.0 instead of a raw count.
_totals = defaultdict(int)
_positives = defaultdict(int)
for row in count_df_posneg.reset_index().itertuples(index=False):
    _totals[row.date] += row.text
    if row.sentiment == 'pos':
        _positives[row.date] += row.text

date_score = defaultdict(int)
for _date, _total in _totals.items():
    date_score[_date] = _positives[_date] / _total if _total else 0.0

In [52]:
# Show the per-date scores computed in the previous cell.
date_score

defaultdict(int,
            {'2021-05-01': 0.3958333333333333,
             '2021-05-02': 0.38461538461538464,
             '2021-05-03': 0.37037037037037035,
             '2021-05-04': 0.37606837606837606,
             '2021-05-05': 0.2391304347826087,
             '2021-05-06': 0.2727272727272727,
             '2021-05-07': 0.4122137404580153,
             '2021-05-08': 0.2675438596491228,
             '2021-05-09': 0.25390625,
             '2021-05-10': 0.24666666666666667,
             '2021-05-11': 0.2875,
             '2021-05-12': 0.42953020134228187,
             '2021-05-13': 0.32989690721649484,
             '2021-05-14': 0.3238095238095238})

In [66]:
import math
def truncate(number, digits) -> float:
    """Cut ``number`` down to ``digits`` decimal places without rounding.

    The value is scaled by ``10.0 ** digits``, truncated toward zero with
    ``math.trunc`` and scaled back.
    """
    scale = 10.0 ** digits
    whole = math.trunc(scale * number)
    return whole / scale

In [67]:
# Line chart of the per-date scores; each point is labelled with its value
# truncated to two decimals.
fig = px.line(
    x=date_score.keys(),
    y=date_score.values(),
    text=[truncate(ratio, 2) for ratio in date_score.values()],
)

In [68]:
# NOTE(review): update_traces() is called without arguments, so it looks like
# a no-op here — confirm whether a trace option (e.g. text position) was intended.
fig.update_traces()
fig.show()

In [69]:
def plot2(date_score, df):
    """Show a bar chart of ``df`` followed by a line chart of ``date_score``.

    Args:
        date_score: Mapping whose keys go on the x axis and whose values go on
            the y axis of the line chart; each point is labelled with its
            value truncated to two decimals.
        df: Frame with ``date`` and ``text`` columns for the bar chart.
    """
    px.bar(df, x = "date", y = "text").show()
    point_labels = [truncate(value, 2) for value in date_score.values()]
    px.line(x=date_score.keys(), y=date_score.values(), text=point_labels).show()

In [70]:
# Render both charts: the bar chart from count_df and the line chart from date_score.
plot2(date_score, count_df)