In [23]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import emoji
import torch
import json
import time

from sklearn.utils import shuffle
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [7]:
# getting the twitter comments
DATA_PATH = '../data/twitter-data-cleaned.txt'
with open(DATA_PATH, 'r',  encoding="utf-8") as f:
    data = f.readlines()
data = [d.strip() for d in data if d.strip() != '']
print('Tweets')
print(data[:3])
print(len(data))

# getting our chosen emojis
SELECTED_EMOJIS_PATH = '../data/best-emojis.json'
with open(SELECTED_EMOJIS_PATH, 'r') as f:
    EMOJIS = json.load(f)
EMOJI_CHARS = [e['char'] for e in EMOJIS]
print('Chosen emojis')
print(EMOJI_CHARS)
print(len(EMOJI_CHARS))

ALL_EMOJIS = set(emoji.emojize(emoji_code) for emoji_code in emoji.UNICODE_EMOJI.values())

Tweets
['💫 IT’S BURGER MONDAY, CBUS! 💫 Order any Gourmet Burger with a draft beer 🍔🍺 ifor just $9.99 when you dine in. 💪 Cer…', 'Stay strong 💪 You are an amazing man. I appreciate the truth that is told.', 'Code Sale Awesome from Styli 💪']
21660
Chosen emojis
['😂', '😍', '😭', '😊', '💕', '😒', '😉', '👌', '👍', '🙏', '👀', '🔥', '💯', '👏', '💪']
15


In [3]:
# # preprocessing the data through nltk
# from nltk.tokenize import TweetTokenizer
# from nltk.tokenize import word_tokenize

# def tokenize_text(text):
#     text = text.lower().strip()
#     words = word_tokenize(text)
#     updated_words = []
#     for w in words:
#         if w.strip() == '':
#             continue
            
#         beginning = 0
#         i = 0
#         while i < len(w):
#             c = w[i]
#             if i > 0 and c in ALL_EMOJIS:
#                 updated_words.append(w[beginning:i])
#                 updated_words.append(c)
#                 beginning = i+1
#                 i += 2
#             i += 1
#         updated_words.append(w[beginning:])
    
#     return updated_words

# for t in data:
#     print(tokenize_text(t))

In [11]:
# preprocessing the data
from parsing import Tokenizer, TokenType, Token
tokenizer = Tokenizer(EMOJI_CHARS)
# take 3 previous words as context for the emoji
context = {e:[] for e in EMOJI_CHARS}
emojiToId = {e:i for i,e in enumerate(EMOJI_CHARS)}

for tweet in data:
    tokens = tokenizer.tokenize(tweet)
    for i,token in enumerate(tokens):
        if token.token_type == TokenType.EMOJIS:
            closest = tokenizer.findClosestNWords(5, tokens, i)
            if closest:
                context[token.raw].append(closest)

for e, words in context.items():
    print(e)
    print(len(words))
    print(words)

😂
1340
[['familiar', 'with', 'my', 'face', 'too'], ['bollywood', 'reality'], ['though', 'your', 'debates', 'be', 'blasphemy'], ['though', 'your', 'debates', 'be', 'blasphemy'], ['game', 'too', 'strong'], ['game', 'too', 'strong'], ['rub', 'for', 'the', 'razzle', 'dazzle'], ['to', 'hang', 'up', 'on', 'santa'], ['stones', 'may', 'break', 'my', 'bones'], ['her', 'for', 'hate', 'crime', 'speech'], ['got', 'this', 'soon', 'youre', 'done'], ['19th', 'to', 'yung', 'flight', 'school'], ['that', 'julius', 'erving'], ['mrji', 'krlo', 'tum', 'bullywood', 'walo'], ['aama', 'wait', 'and', 'watch'], ['the', 'day', 'you', 'crossed', 'bro'], ['get', 'addicted', 'like', 'last', 'time'], ['as', 'she', 'should', 'folks', 'play'], ['should', 'folks', 'play', 'too', 'much'], ['should', 'folks', 'play', 'too', 'much'], ['to', 'mind', 'their', 'own', 'business'], ['abnormal', 'behavior', 'hahaha', 'chaaar', 'cheer'], ['him', 'i', 'need', 'that', 'btw'], ['that', 'a', 'million', 'times', 'before'], ['30', 'an

In [18]:
# making X and y for TFIDF
X_words = []
y = []
sentences = set()
for e, words in context.items():
    for i,word_list in enumerate(words):
        sentence = ' '.join(word_list)
        if sentence in sentences:
            continue
        sentences.add(sentence)
        X_words.append(sentence)
        y.append(emojiToId[e])
print(len(y))

13136


In [13]:
# attempting to use tfidf and RF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_words)

model = RandomForestClassifier(n_estimators=300)
scores = cross_val_score(model, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.19 (+/- 0.01)


In [20]:
# making X and y for RNN
X_words = []
y = []
sentences = set()
for e, words in context.items():
    for i, sentence in enumerate(words):
        check = ' '.join(sentence)
        if check in sentences:
            continue
        sentences.add(check)
        X_words.append(sentence)
        y.append(emojiToId[e])
print(X_words[0])
print(len(y))

['familiar', 'with', 'my', 'face', 'too']
13136


In [50]:
# making word embeddings
UNK = "<unk>"
WORDEMBSIZE = 64
W2V_WINDOW = 7
W2V_COUNT = 2
W2V_EPOCH=100

def makeVocab(text):
    vocab = set()
    for sentences in text:
        for word in sentences:
            vocab.add(word)
    return vocab

def makeEmbModel(data):
    docs = [[UNK]]
    docs.extend(data)
    model = Word2Vec(window=W2V_WINDOW, min_count=W2V_COUNT, size=WORDEMBSIZE)
    model.build_vocab(docs)
    model.train(docs, total_examples=len(docs), epochs=W2V_EPOCH)
    print(model)
    return model

def makeEmbeddings(data, model, vocab):
    vecData = []
    for sentence in data:
        wordEmbs = []
        for word in sentence:
            if word in vocab:
                print(type(model[word]))
                print(model[word])
                wordEmbs.append(model[word])
            else:
                wordEmbs.append(np.zeros(WORDEMBSIZE))
        wordEmbs = torch.FloatTensor(wordEmbs)
        vecData.append(wordEmbs)
    return vecData

In [51]:
# continue making word embeddings
X_words_train, X_words_test, y_train, y_test = train_test_split(X_words, y, test_size=0.2, random_state=42)
vocab = makeVocab(X_words_train)
emb_model = makeEmbModel(X_words_train)
vocab = set(list(emb_model.wv.vocab.keys()))
X_train = makeEmbeddings(X_words_train, emb_model, vocab)

print(X_train[0])

Word2Vec(vocab=3303, size=64, alpha=0.025)
<class 'numpy.ndarray'>
[-0.47498694  0.31437758 -0.42361426 -1.0295929   0.29128805  0.40100643
 -0.5607378  -0.38821387 -0.5099595  -1.1481271   0.7253575  -0.09276792
 -0.32799417 -0.1136446   0.0184873  -0.04296262 -0.50723493  0.10757865
  0.0236316   0.5941751  -0.25348613  0.28953773  0.01739942 -0.74934036
  0.14685358  0.69399834 -0.20071371  0.09916402 -0.2540294  -0.09890247
  0.10372674 -0.48346213  0.60450697 -0.15365323 -0.42178157  0.2372707
  0.46547288 -0.46432602 -0.5295996   0.10960971 -0.300676   -0.7220095
 -0.36924356  0.4007396  -0.02477515 -0.39119035 -0.3876181  -0.0348733
 -0.0318744   0.33850244  0.7257863   0.04160434  0.1197     -0.03264197
  0.76115084 -0.2505883  -0.57779276  0.1547842  -0.27846763  0.11296428
 -0.41674432  0.35229927 -0.09039953  0.3618178 ]
<class 'numpy.ndarray'>
[ 0.92544013  0.36872783 -0.33633032 -0.6635195   1.0445518  -0.4365708
 -1.8929044   0.1042226   0.9969103  -0.7382717   0.8188101 



ValueError: only one element tensors can be converted to Python scalars