In [1]:
import numpy
import json
from tqdm import tqdm
from spacy_langdetect import LanguageDetector
from spacy.language import Language
import spacy
import emoji

In [2]:
# Load the exported chat history. The encoding is pinned to UTF-8 (the
# encoding JSON text is specified to use) so that reading the file does not
# depend on the platform's default locale encoding.
with open("messages.json", "r", encoding="utf-8") as f:
    chatroom = json.load(f)

In [3]:
# Keep only the list of message records stored under the 'messages' key.
messages = chatroom['messages']

In [101]:
# Preview the first ten raw message records.
messages[:10]

[{'id': 1903819,
  'type': 'message',
  'date': '2021-05-01T00:00:12',
  'from': None,
  'from_id': 'user1650688285',
  'text': 'hi'},
 {'id': 1903842,
  'type': 'message',
  'date': '2021-05-01T00:01:13',
  'from': None,
  'from_id': 'user1650688285',
  'text': 'do indicators work?'},
 {'id': 1903855,
  'type': 'message',
  'date': '2021-05-01T00:01:50',
  'from': 'Social Ch4in',
  'from_id': 'user484605980',
  'text': 'If you trade true gbp for gbp Fiat or vice Versa is there a fee?'},
 {'id': 1903856,
  'type': 'message',
  'date': '2021-05-01T00:02:05',
  'from': 'Social Ch4in',
  'from_id': 'user484605980',
  'text': 'And how much is that fee?'},
 {'id': 1903857,
  'type': 'message',
  'date': '2021-05-01T00:02:12',
  'from': 'Social Ch4in',
  'from_id': 'user484605980',
  'text': 'If you’re a silver card holder'},
 {'id': 1903858,
  'type': 'message',
  'date': '2021-05-01T00:02:14',
  'from': 'Syncrol',
  'from_id': 'user1619562639',
  'text': 'Make CKB withdrawable !'},
 {'id':

In [4]:
# Total number of message records loaded.
len(messages)

47232

In [5]:
# Pull the raw 'text' field out of every message record, preserving order.
messages_text = [record['text'] for record in messages]

In [102]:
# Keep only the first ten characters of each 'date' value, i.e. the
# 'YYYY-MM-DD' day portion of timestamps such as '2021-05-01T00:00:12'.
messages_date = [record['date'][:10] for record in messages]

In [105]:
# Inspect the day stamps of the last ten messages.
messages_date[-10:]

['2021-05-14',
 '2021-05-14',
 '2021-05-14',
 '2021-05-14',
 '2021-05-14',
 '2021-05-14',
 '2021-05-14',
 '2021-05-14',
 '2021-05-14',
 '2021-05-14']

In [6]:
# Inspect the first five extracted text payloads.
messages_text[:5]

['hi',
 'do indicators work?',
 'If you trade true gbp for gbp Fiat or vice Versa is there a fee?',
 'And how much is that fee?',
 'If you’re a silver card holder']

In [7]:
# Load the small English spaCy pipeline, then register the spacy_langdetect
# component under the name 'language_detector' and attach it as the last stage.
nlp = spacy.load("en_core_web_sm")


@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """Factory invoked by spaCy to build the language-detection component."""
    return LanguageDetector()


nlp.add_pipe('language_detector', last=True)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x7ffd89ba9748>

In [8]:
def give_emoji_free_text(text):
    """Return ``text`` with every emoji removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which each emoji has been replaced by the empty
        string; all other characters are left untouched.
    """
    # emoji.get_emoji_regexp() was removed in emoji 2.0; releases that ship
    # replace_emoji() are served by it, older ones fall back to the regexp.
    # The former UTF-8 encode/decode round trip was a no-op and is dropped.
    if hasattr(emoji, "replace_emoji"):
        return emoji.replace_emoji(text, replace="")
    return emoji.get_emoji_regexp().sub(r'', text)

In [14]:
def is_english(nlp, s):
    """Cheap "English" filter: report whether ``s`` consists only of ASCII.

    This is a character-set heuristic, not real language detection: ASCII-only
    text in other languages (e.g. 'Dogecoin!!! Que hago?') still passes.

    Args:
        nlp: Unused. Kept so existing call sites keep working; the spaCy based
            check (``nlp(s)._.language['language'] == 'en'``) is disabled.
        s: The text to test.

    Returns:
        True when every character of ``s`` is ASCII (the empty string
        included), False otherwise.
    """
    try:
        # Encoding straight to ASCII avoids the UTF-8 round trip and also
        # covers lone surrogates, which previously escaped as an uncaught
        # UnicodeEncodeError.
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True
def has_shib_doge(s):
    """Tell whether ``s`` contains "shib" or "doge", ignoring letter case."""
    lowered = s.lower()
    return any(keyword in lowered for keyword in ("shib", "doge"))

In [10]:
# Sanity check: a plain-ASCII sentence should pass the filter.
is_english(nlp, "I made an account before 4 days with email ")

True

In [11]:
# Sanity check: the rocket emoji should be stripped and the rest kept as-is.
give_emoji_free_text("DOGE coin is up by +16.39% 🚀🚀🚀")

'DOGE coin is up by +16.39% '

In [12]:
# Translation table for str.translate(): each character of the first string
# is replaced by the character at the same position in the second, folding
# typographic quotes and dashes to plain ASCII (the hyphen maps to itself).
transl_table = str.maketrans("‘’´“”–-", "'''\"\"--")

In [78]:
def combine_sentence(message_list):
    """Concatenate the plain-string items of ``message_list`` in order.

    Items whose type is not exactly ``str`` are skipped. An empty input
    yields the empty string.
    """
    return "".join(piece for piece in message_list if type(piece) == str)

In [79]:
clean_messages, dirty_messages, exceptions = [], [], []
for m in tqdm(messages_text):
    # Non-string payloads are flattened into a single string first.
    m = m if type(m) == str else combine_sentence(m)
    # Strip emoji, then fold typographic quotes/dashes to ASCII.
    m = give_emoji_free_text(m).translate(transl_table)
    # Keep only messages that mention the coins AND pass the ASCII filter;
    # everything else is collected separately.
    (clean_messages if has_shib_doge(m) and is_english(nlp, m) else dirty_messages).append(m)

100%|██████████| 47232/47232 [00:08<00:00, 5306.85it/s]


In [80]:
# Number of messages that survived the keyword + ASCII filter.
len(clean_messages)

2872

In [99]:
# clean_messages[:50]

In [21]:
from flair.models import TextClassifier
from flair.data import Sentence

In [53]:
# Alternative model, currently disabled:
# sentiment_nlp = TextClassifier.load('en-sentiment')
# Load flair's 'sentiment-fast' classifier. As the captured log shows, the
# first load downloads the model file (about 1.2 GB) into the flair cache.
sentiment_nlp = TextClassifier.load('sentiment-fast')

2021-12-18 01:56:23,141 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-fasttext-rnn/sentiment-en-mix-ft-rnn_v8.pt not found in cache, downloading to /tmp/tmpyd7n3_q5


100%|██████████| 1241977025/1241977025 [02:24<00:00, 8568908.74B/s] 

2021-12-18 01:58:48,696 copying /tmp/tmpyd7n3_q5 to cache at /Users/tiffanychang/.flair/models/sentiment-en-mix-ft-rnn_v8.pt





2021-12-18 01:58:50,345 removing temp file /tmp/tmpyd7n3_q5
2021-12-18 01:58:50,445 loading file /Users/tiffanychang/.flair/models/sentiment-en-mix-ft-rnn_v8.pt


In [90]:
def flair_prediction(nlp, x):
    """Classify the sentiment of ``x`` as "pos", "neg" or "neu".

    Args:
        nlp: Classifier exposing ``predict(sentence)``.
        x: The text to classify.

    Returns:
        "pos" when the first predicted label is POSITIVE, "neg" when it is
        NEGATIVE, and "neu" for any other label or when the classifier
        attached no label at all.
    """
    sentence = Sentence(x)
    nlp.predict(sentence)
    labels = sentence.labels
    if not labels:
        # Nothing was predicted (e.g. text without tokens), so there is no
        # labels[0] to read; treat it as neutral instead of raising IndexError.
        return "neu"
    # Inspect the label value itself. Matching against str(label) is fragile:
    # the string form also carries the score and, in newer flair releases,
    # the sentence text, so a message containing "POSITIVE" could be misread.
    value = str(labels[0].value)
    if "POSITIVE" in value:
        return "pos"
    if "NEGATIVE" in value:
        return "neg"
    return "neu"
    
def flair_prediction_value(nlp, x):
    """Return the confidence score of the first label predicted for ``x``.

    Args:
        nlp: Classifier exposing ``predict(sentence)``.
        x: The text to classify.

    Returns:
        The ``score`` attribute of the first predicted label, or NaN when the
        classifier attached no label (instead of raising IndexError).
    """
    sentence = Sentence(x)
    nlp.predict(sentence)
    labels = sentence.labels
    if not labels:
        # No prediction available; NaN marks the value as missing in pandas.
        return float("nan")
    return labels[0].score

In [26]:
import pandas as pd

In [85]:
# Start from an empty frame; its columns are assigned in later cells.
df = pd.DataFrame()

In [86]:
# One row per message that passed the keyword + ASCII filter.
df['text'] = clean_messages

In [96]:
# Label and confidence come from two separate helper calls, and each call
# runs the classifier, so every message is scored twice.
df['sentiment'] = df["text"].map(lambda msg: flair_prediction(sentiment_nlp, msg))
df['score'] = df['text'].map(lambda msg: flair_prediction_value(sentiment_nlp, msg))

In [98]:
# Preview the first ten labelled messages.
df.head(10)

Unnamed: 0,text,sentiment,score
0,Doge is going craY,neg,0.792797
1,Sell target of doge,neg,0.959932
2,Doge,pos,0.536456
3,Dogecoin!!! Que hago?,neg,0.753658
4,"Anyway, is doge a good crypto for long term in...",neg,0.577099
5,Doge target,pos,0.608428
6,Who else going to mars With doge,pos,0.991977
7,Doge to the moon,pos,0.80682
8,Doge is dompig,neg,0.61655
9,Doge,pos,0.536456


In [59]:
# flair_prediction takes the classifier as its first argument; calling it
# with the text alone raises TypeError on a fresh kernel.
flair_prediction(sentiment_nlp, "Will hit 1$?")

'neg'

In [61]:
# flair_prediction takes the classifier as its first argument; calling it
# with the text alone raises TypeError on a fresh kernel.
flair_prediction(sentiment_nlp, "this is gonna rise")

'neg'

In [51]:
# Run the classifier by hand on one sample sentence; predict() attaches its
# result to the Sentence object, which the next cell inspects via .labels.
sentence = Sentence("this thing is ok")
sentiment_nlp.predict(sentence)
# score = sentence.labels[0]

In [52]:
# Show the label(s) the classifier attached to the sample sentence.
sentence.labels

[NEGATIVE (0.7494)]

In [106]:
import plotly.express as px

In [None]:
# df only has the columns 'text', 'sentiment' and 'score'; the iris column
# names 'sepal_width' / 'sepal_length' do not exist in it and make px.bar
# fail. Chart how many messages received each sentiment label instead.
sentiment_counts = (
    df["sentiment"]
    .value_counts()
    .rename_axis("sentiment")
    .reset_index(name="count")
)
fig = px.bar(
    sentiment_counts,
    x="sentiment",
    y="count",
    title="Messages per sentiment label",
)