In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import emoji
from os.path import join
import json
import time

from sklearn.utils import shuffle
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [3]:
# getting the twitter comments
DATA_PATH = join('..','data','twitter-data-cleaned.txt')
with open(DATA_PATH, 'r',  encoding="utf-8") as f:
    data = f.readlines()
data = [d.strip() for d in data if d.strip() != '']
print('Tweets')
print(data[:3])
print(len(data))

# getting our chosen emojis
SELECTED_EMOJIS_PATH = join('..','data','best-emojis.json')
with open(SELECTED_EMOJIS_PATH, 'r', encoding='utf-8') as f:
    EMOJIS = json.load(f)
EMOJI_CHARS = [e['char'] for e in EMOJIS]
print('Chosen emojis')
print(EMOJI_CHARS)
print(len(EMOJI_CHARS))

ALL_EMOJIS = set(emoji.emojize(emoji_code) for emoji_code in emoji.UNICODE_EMOJI.values())

Tweets
['💫 IT’S BURGER MONDAY, CBUS! 💫 Order any Gourmet Burger with a draft beer 🍔🍺 ifor just $9.99 when you dine in. 💪 Cer…', 'Stay strong 💪 You are an amazing man. I appreciate the truth that is told.', 'Code Sale Awesome from Styli 💪']
122179
Chosen emojis
['😂', '😍', '😭', '😊', '💕', '😒', '😉', '👌', '👍', '🙏', '👀', '🔥', '💯', '👏', '💪']
15


In [4]:
# preprocessing the data
from parsing import Tokenizer, TokenType, Token
tokenizer = Tokenizer(EMOJI_CHARS)
# take 3 previous words as context for the emoji
context = {e:[] for e in EMOJI_CHARS}
emojiToId = {e:i for i,e in enumerate(EMOJI_CHARS)}

for tweet in data:
    tokens = tokenizer.tokenize(tweet)
    for i,token in enumerate(tokens):
        if token.token_type == TokenType.EMOJIS:
            closest = tokenizer.findClosestNWords(5, tokens, i)
            if closest:
                context[token.raw].append(closest)

for e, words in context.items():
    print(e)
    print(len(words))
    print(words[:3])

😂
8458
[['familiar', 'with', 'my', 'face', 'too'], ['bollywood', 'reality'], ['though', 'your', 'debates', 'be', 'blasphemy']]
😍
7330
[['figura', 'is', 'a', 'start', 'today'], ['200', 'days', 'to', 'go'], ['200', 'days', 'to', 'go']]
😭
7464
[['lol', 'procrastination', 'almost', 'had', 'me'], ['just', 'change', 'my', 'first', 'diaper'], ['52', 'yuan', 'for', 'this', 'set']]
😊
6352
[['you', 'today', 'credit', 'stacie', 'swift'], ['fantastic', 'well', 'done', 'ladies'], ['congratulations', 'fam', 'all', 'the', 'best']]
💕
6485
[['happy', 'birthday', 'my', 'mochiiii', 'saranghae'], ['got', 'this', 'soon', 'youre', 'done'], ['we', 'got', 'this']]
😒
5873
[['️', 'dunk', 'it', 'ang', 'hina'], ['comes', '17', 'on', 'uber', 'eats'], ['comes', '17', 'on', 'uber', 'eats']]
😉
5932
[['start', 'today', 'choose', 'your', 'package'], ['the', 'richest', 'man', 'your', 'age'], ['yall', 'know', 'the', 'rest']]
👌
6035
[['yess'], ['nice', 'one', 'you', 'are', 'a'], ['the', 'best', 'and', 'great', 'artist']]


In [5]:
# making X and y for TFIDF as a baseline idea of how good our accuracy can expect to be
X_words = []
y = []
sentences = set()
for e, words in context.items():
    for i,word_list in enumerate(words):
        sentence = ' '.join(word_list)
        if sentence in sentences:
            continue
        sentences.add(sentence)
        X_words.append(sentence)
        y.append(emojiToId[e])
print(len(y))

75537


In [8]:
# attempting to use tfidf and RF
# WARNING: can take a decent amount of time: 5 mins or so?
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_words)

model = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(model, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.20 (+/- 0.01)


In [6]:
# attempting to use count_vectorizer and RF
# WARNING: can take a decent amount of time: 5 mins or so?
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_words)
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)

model = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(model, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.18 (+/- 0.01)


In [10]:
# RNN building to predict result
import torch
import torch.nn as nn
import torch.optim as optim
import random
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

In [7]:
# making X and y for RNN
X_words = []
y = []
sentences = set()
for e, words in context.items():
    for i, sentence in enumerate(words):
        check = ' '.join(sentence)
        if check in sentences:
            continue
        sentences.add(check)
        X_words.append(sentence)
        y.append(emojiToId[e])
print(X_words[0])
print(len(y))

['familiar', 'with', 'my', 'face', 'too']
75537


In [8]:
# making word embeddings for RNN
UNK = "<unk>"
WORDEMBSIZE = 32
W2V_WINDOW = 7
W2V_COUNT = 1
W2V_EPOCH=100

def makeVocab(text):
    vocab = set()
    for sentences in text:
        for word in sentences:
            vocab.add(word)
    return vocab

def makeEmbModel(data):
    docs = [[UNK]]
    docs.extend(data)
    model = Word2Vec(window=W2V_WINDOW, min_count=W2V_COUNT, size=WORDEMBSIZE)
    model.build_vocab(docs)
    model.train(docs, total_examples=len(docs), epochs=W2V_EPOCH)
    print(model)
    return model

def makeEmbeddings(data, model, vocab):
    vecData = []
    for sentence in data:
        wordEmbs = []
        for word in sentence:
            if word in vocab:
#                 print(type(model[word]))
#                 print(model[word])
                wordEmbs.append(model[word])
            else:
                wordEmbs.append(np.zeros(WORDEMBSIZE))
        wordEmbs = torch.FloatTensor(wordEmbs)
        vecData.append(wordEmbs)
    return vecData

In [11]:
# continue making word embeddings
X_words_train, X_words_test, y_train, y_test = train_test_split(X_words, y, train_size=0.8, test_size=0.2, random_state=42)
vocab = makeVocab(X_words_train)
emb_model = makeEmbModel(X_words_train)
vocab = set(list(emb_model.wv.vocab.keys()))
X_train = makeEmbeddings(X_words_train, emb_model, vocab)
X_test = makeEmbeddings(X_words_test, emb_model, vocab)

print(X_words_train[0])
print(X_train[0])
print(y_train[0])

Word2Vec(vocab=28504, size=32, alpha=0.025)




['in', 'such', 'a', 'distinguished', 'list']
tensor([[ 1.1820,  1.3621, -0.0322,  0.2035, -1.1621,  0.6980, -1.2176,  0.8748,
         -1.0998,  2.0087, -1.0029,  0.2338, -1.2695,  0.9704,  2.8493,  0.5918,
          1.7781,  1.0950,  1.5729,  0.7699,  0.6813, -0.0208, -0.2447, -2.4421,
          0.0515,  1.7090,  1.7665,  0.3372, -0.3515,  0.3266, -0.6109,  0.9711],
        [-0.5427, -1.2408, -3.2577, -0.6928, -2.2368,  2.6404,  0.7852, -2.2126,
         -1.5806,  3.0299, -1.3534, -1.4536,  2.3654, -1.1971,  2.2649, -4.7520,
          2.6533,  1.0404, -1.7436,  0.5192, -0.9644, -0.1172,  0.4721, -1.0537,
          2.6231, -2.0016,  0.3251, -4.3810, -0.0129,  0.8412,  1.6989,  1.2532],
        [-0.6066, -0.0194, -2.1453, -1.2180, -1.2285,  0.2848,  0.0540, -3.0125,
         -2.0137,  0.6630, -1.6462, -0.2690, -0.3777, -2.0911,  0.5516, -1.6750,
          0.6623,  0.6887,  0.0546, -0.1058,  0.8023, -0.6558,  1.3897, -0.2070,
          0.6506,  1.2727,  0.5628, -1.7520, -1.1888, -1.0687,

In [12]:
# RNN model
class RNN(nn.Module):
    def __init__(self, input_dim, h, output_dim = 15):
        super(RNN, self).__init__()
        self.rnn = nn.RNN(input_dim, h, dropout=0.2)
        self.finalLayer = nn.Linear(h, output_dim)
        self.input_dim = input_dim
        self.hidden_dim = h
        self.output_dim =  output_dim
#         self.word_embeddings = nn.Embedding(vocab_size, input_dim)
        self.softmax = nn.LogSoftmax()
        self.loss = nn.NLLLoss()
    
    def compute_loss(self, predicted_vector, gold_label):
        return self.loss(predicted_vector, gold_label)
    
    def forward(self, inputs):
        out, hidden = self.rnn(inputs)
        hidden = hidden.contiguous().view(-1,self.hidden_dim)
        predicted_vector = self.softmax(self.finalLayer(hidden))
        return predicted_vector

In [None]:
# running epochs for training and validation
HIDDEN_DIM = 128
EPOCHS = 30
minibatch_size = 32

model = RNN(WORDEMBSIZE, HIDDEN_DIM, 15)
optimizer = optim.SGD(model.parameters(),lr=0.001)
train_loss = []
train_acc = []
test_loss = []
test_acc = []
for epoch in range(EPOCHS):
    print("\n\n-------------")
    print("EPOCH: {}".format(epoch + 1))
    running_loss = 0.0
    model.train()
    optimizer.zero_grad()
    print("Training started for epoch: {}".format(epoch + 1))
    X_train, y_train = shuffle(X_train, y_train)
    start_time = time.time()
    correct = total = 0
    N = len(y_train)
    for minibatch_idx in tqdm(range(N // minibatch_size)):
        optimizer.zero_grad()
        loss = None
        for idx in range(minibatch_size):
            text = X_train[minibatch_idx * minibatch_size + idx]
            text = torch.unsqueeze(text, 1)
            labelIdx = y_train[minibatch_idx * minibatch_size + idx]
            log_probs = model(text)
            text_loss = model.compute_loss(log_probs.view(1,-1), torch.tensor([labelIdx]))
            running_loss += text_loss
            if loss is None:
                loss = text_loss
            else:
                loss += text_loss
            pred_label = torch.argmax(log_probs)
            correct += int(pred_label == labelIdx)
            total += 1
        loss = loss / minibatch_size
        loss.backward()
        optimizer.step()
    train_loss.append(running_loss / N)
    train_acc.append(correct / total)
    print("Training completed for epoch: {}".format(epoch + 1))
    print("Time for train: {}".format(time.time() - start_time))
    print("Accuracy: {}".format(correct / total))
    
    #validation
    running_loss = 0.0
    model.eval()
    optimizer.zero_grad()
    print("Validation started for epoch: {}".format(epoch + 1))
    X_test, y_test = shuffle(X_test, y_test)
    start_time = time.time()
    correct = total = 0
    N = len(y_test)
    for minibatch_idx in tqdm(range(N // minibatch_size)):
        optimizer.zero_grad()
        for idx in range(minibatch_size):
            text = X_test[minibatch_idx * minibatch_size + idx]
            text = torch.unsqueeze(text, 1)
            labelIdx = y_test[minibatch_idx * minibatch_size + idx]
            log_probs = model(text)
            text_loss = model.compute_loss(log_probs.view(1,-1), torch.tensor([labelIdx]))
            running_loss += text_loss
            pred_label = torch.argmax(log_probs)
            correct += int(pred_label == labelIdx)
            total += 1
    test_loss.append(running_loss / N)
    test_acc.append(correct / total)
    print("Validation completed for epoch: {}".format(epoch + 1))
    print("Time for validation: {}".format(time.time() - start_time))
    print("Accuracy: {}".format(correct / total))

plt.plot(train_loss, label='train loss')
plt.plot(test_loss, label='test loss')
plt.legend()
plt.show()

plt.plot(train_acc, label='train acc')
plt.plot(test_acc, label='test acc')
plt.legend()
plt.show()



-------------
EPOCH: 1
Training started for epoch: 1


  "num_layers={}".format(dropout, num_layers))


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))




Training completed for epoch: 1
Time for train: 64.54505729675293
Accuracy: 0.08282574152542373
Validation started for epoch: 1


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 1
Time for validation: 6.368487596511841
Accuracy: 0.09765625


-------------
EPOCH: 2
Training started for epoch: 2


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 2
Time for train: 67.56291723251343
Accuracy: 0.1082329184322034
Validation started for epoch: 2


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 2
Time for validation: 6.431422710418701
Accuracy: 0.11533368644067797


-------------
EPOCH: 3
Training started for epoch: 3


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 3
Time for train: 69.96244883537292
Accuracy: 0.12799589512711865
Validation started for epoch: 3


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 3
Time for validation: 6.62622332572937
Accuracy: 0.13076006355932204


-------------
EPOCH: 4
Training started for epoch: 4


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 4
Time for train: 74.47483515739441
Accuracy: 0.14324020127118645
Validation started for epoch: 4


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 4
Time for validation: 6.74909782409668
Accuracy: 0.14247881355932204


-------------
EPOCH: 5
Training started for epoch: 5


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 5
Time for train: 76.1311399936676
Accuracy: 0.1530554819915254
Validation started for epoch: 5


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 5
Time for validation: 6.610239744186401
Accuracy: 0.15274099576271186


-------------
EPOCH: 6
Training started for epoch: 6


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 6
Time for train: 77.27197360992432
Accuracy: 0.1598252118644068
Validation started for epoch: 6


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 6
Time for validation: 6.848995923995972
Accuracy: 0.1561175847457627


-------------
EPOCH: 7
Training started for epoch: 7


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 7
Time for train: 84.4656171798706
Accuracy: 0.16204316737288135
Validation started for epoch: 7


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 7
Time for validation: 6.435418605804443
Accuracy: 0.1590969279661017


-------------
EPOCH: 8
Training started for epoch: 8


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 8
Time for train: 80.73942732810974
Accuracy: 0.1652045815677966
Validation started for epoch: 8


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 8
Time for validation: 7.008832216262817
Accuracy: 0.1616790254237288


-------------
EPOCH: 9
Training started for epoch: 9


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 9
Time for train: 82.258873462677
Accuracy: 0.1677204713983051
Validation started for epoch: 9


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 9
Time for validation: 6.330526828765869
Accuracy: 0.1633342161016949


-------------
EPOCH: 10
Training started for epoch: 10


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 10
Time for train: 78.42479395866394
Accuracy: 0.1694915254237288
Validation started for epoch: 10


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 10
Time for validation: 7.739086389541626
Accuracy: 0.16518802966101695


-------------
EPOCH: 11
Training started for epoch: 11


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 11
Time for train: 79.82336330413818
Accuracy: 0.17121292372881355
Validation started for epoch: 11


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 11
Time for validation: 6.947026491165161
Accuracy: 0.1661811440677966


-------------
EPOCH: 12
Training started for epoch: 12


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 12
Time for train: 80.27489972114563
Accuracy: 0.1728184586864407
Validation started for epoch: 12


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 12
Time for validation: 6.748105525970459
Accuracy: 0.16882944915254236


-------------
EPOCH: 13
Training started for epoch: 13


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 13
Time for train: 86.18985247612
Accuracy: 0.17457296080508475
Validation started for epoch: 13


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 13
Time for validation: 7.383548974990845
Accuracy: 0.17061705508474576


-------------
EPOCH: 14
Training started for epoch: 14


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


Training completed for epoch: 14
Time for train: 88.96925520896912
Accuracy: 0.17593021716101695
Validation started for epoch: 14


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Validation completed for epoch: 14
Time for validation: 8.811487674713135
Accuracy: 0.1723384533898305


-------------
EPOCH: 15
Training started for epoch: 15


HBox(children=(FloatProgress(value=0.0, max=1888.0), HTML(value='')))


