In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import emoji
from os.path import join
import json
import time

from sklearn.utils import shuffle
from gensim.models import Word2Vec, FastText
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [3]:
# --- Tweets ------------------------------------------------------------------
# The file holds one tweet per line. Every line is stripped and lines that are
# empty after stripping are dropped.
DATA_PATH = join('..', 'data', 'twitter-data-cleaned.txt')
with open(DATA_PATH, 'r', encoding="utf-8") as tweets_file:
    data = [line for line in map(str.strip, tweets_file) if line != '']
print('Tweets')
print(data[:3])
print(len(data))

# --- Chosen emojis -----------------------------------------------------------
# The JSON file is iterated as a sequence of records, each exposing a 'char' entry.
SELECTED_EMOJIS_PATH = join('..', 'data', 'best-emojis.json')
with open(SELECTED_EMOJIS_PATH, 'r', encoding='utf-8') as emoji_file:
    EMOJIS = json.load(emoji_file)
EMOJI_CHARS = [record['char'] for record in EMOJIS]
print('Chosen emojis')
print(EMOJI_CHARS)
print(len(EMOJI_CHARS))

# Set built by running emoji.emojize over every value of emoji.UNICODE_EMOJI.
# NOTE(review): the layout of UNICODE_EMOJI depends on the installed `emoji`
# version - confirm the values are still emoji codes.
ALL_EMOJIS = {emoji.emojize(emoji_code) for emoji_code in emoji.UNICODE_EMOJI.values()}

Tweets
['💫 IT’S BURGER MONDAY, CBUS! 💫 Order any Gourmet Burger with a draft beer 🍔🍺 ifor just $9.99 when you dine in. 💪 Cer…', 'Stay strong 💪 You are an amazing man. I appreciate the truth that is told.', 'Code Sale Awesome from Styli 💪']
122179
Chosen emojis
['😂', '😍', '😭', '😊', '💕', '😒', '😉', '👌', '👍', '🙏', '👀', '🔥', '💯', '👏', '💪']
15


In [4]:
# Preprocessing: tokenize every tweet and, for each emoji token, store the word
# list returned by tokenizer.findClosestNWords(5, ...) under that emoji.
# NOTE(review): the sample output shows at most 5 words per context; the exact
# selection rule lives in parsing.Tokenizer - confirm there.
from parsing import Tokenizer, TokenType, Token
tokenizer = Tokenizer(EMOJI_CHARS)
# emoji -> list of contexts (each context is a list of words)
context = dict((emoji_char, []) for emoji_char in EMOJI_CHARS)
# emoji -> integer class id (its position in EMOJI_CHARS)
emojiToId = dict((emoji_char, idx) for idx, emoji_char in enumerate(EMOJI_CHARS))

for tweet in data:
    tokens = tokenizer.tokenize(tweet)
    for position, token in enumerate(tokens):
        if token.token_type != TokenType.EMOJIS:
            continue
        closest = tokenizer.findClosestNWords(5, tokens, position)
        if not closest:
            # nothing usable around this emoji
            continue
        context[token.raw].append(closest)

# Quick look at what was collected for each emoji.
for emoji_char, contexts in context.items():
    print(emoji_char)
    print(len(contexts))
    print(contexts[:3])

😂
8440
[['go', 'lol', 'postmen', 'familiar', 'face'], ['bollywood', 'reality'], ['got', 'even', 'though', 'debates', 'blasphemy']]
😍
7329
[['week', 'studio', 'figura', 'start', 'today'], ['200', 'days', 'go'], ['200', 'days', 'go']]
😭
7428
[['thank', 'lol', 'procrastination', 'almost'], ['change', 'first', 'diaper', 'uncle', 'year'], ['deposit', '52', 'yuan', 'set', 'must']]
😊
6335
[['moment', 'today', 'credit', 'stacie', 'swift'], ['fantastic', 'well', 'done', 'ladies'], ['congratulations', 'fam', 'best']]
💕
6480
[['happy', 'birthday', 'mochiiii', 'saranghae', 'take'], ['got', 'soon', 'youre', 'done'], ['got']]
😒
5852
[['️', '️', 'dunk', 'ang', 'hina'], ['drink', 'comes', '17', 'uber', 'eats'], ['drink', 'comes', '17', 'uber', 'eats']]
😉
5905
[['figura', 'start', 'today', 'choose', 'package'], ['someone', 'said', 'richest', 'man', 'age'], ['yall', 'know', 'rest']]
👌
6023
[['yess'], ['nice', 'one', 'born', 'leader', 'keep'], ['nobody', 'perfect', 'best', 'great', 'artist']]
👍
6446
[['p

In [5]:
# Text/label pairs for the bag-of-words baselines.
# Each context becomes one space-joined string. A string that was already
# emitted (under any emoji) is skipped, so the first emoji seen keeps the label.
X_words, y = [], []
sentences = set()
for emoji_char, contexts in context.items():
    for word_list in contexts:
        joined = ' '.join(word_list)
        if joined not in sentences:
            sentences.add(joined)
            X_words.append(joined)
            y.append(emojiToId[emoji_char])
print(len(y))

72665


In [8]:
# Baseline: TF-IDF features + random forest, scored with 5-fold cross-validation.
# WARNING: can take a decent amount of time: 5 mins or so?
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_words)

# Seed the forest so the reported accuracy is reproducible from run to run
# (same seed value the notebook uses for train_test_split).
model = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.20 (+/- 0.01)


In [6]:
# Baseline: raw token counts (scaled to unit variance, no centering so the
# matrix stays sparse) + random forest, scored with 5-fold cross-validation.
# WARNING: can take a decent amount of time: 5 mins or so?
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_words)
scaler = StandardScaler(with_mean=False)
X = scaler.fit_transform(X)

# Seed the forest so the reported accuracy is reproducible from run to run
# (same seed value the notebook uses for train_test_split).
model = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(model, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.18 (+/- 0.01)


In [6]:
# RNN building to predict result
import torch
import torch.nn as nn
import torch.optim as optim
import random
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

In [7]:
# Token-list/label pairs for the RNN.
# Unlike the bag-of-words version, X_words keeps each context as a list of
# words; the space-joined form is only used as the de-duplication key.
X_words, y = [], []
sentences = set()
for emoji_char, contexts in context.items():
    for word_list in contexts:
        key = ' '.join(word_list)
        if key not in sentences:
            sentences.add(key)
            X_words.append(word_list)
            y.append(emojiToId[emoji_char])
print(X_words[0])
print(len(y))

['go', 'lol', 'postmen', 'familiar', 'face']
72665


In [15]:
# making word embeddings for RNN
# Placeholder token; makeVocab() always adds it to the vocabulary it builds.
UNK = "<unk>"
# Embedding width: handed to Word2Vec as `size` and to the RNN as its input dimension.
WORDEMBSIZE = 100
# Value passed as Word2Vec's `window` argument.
W2V_WINDOW = 7
# Value passed as Word2Vec's `min_count` argument.
W2V_COUNT = 1
# Number of epochs passed to Word2Vec.train().
W2V_EPOCH=100

def makeVocab(text):
    """Collect the set of distinct words in `text`, plus the UNK placeholder.

    Args:
        text: iterable of sentences, each an iterable of hashable words.

    Returns:
        set containing UNK and every word that occurs in any sentence.
    """
    vocab = {UNK}
    for sentence in text:
        vocab.update(sentence)
    return vocab

def makeEmbModel(data):
    """Build and train a Word2Vec model on `data`, print it and return it.

    Args:
        data: sized collection of tokenised sentences (its len() is passed as
            `total_examples`).

    Returns:
        the trained Word2Vec model.
    """
    # Alternative that was tried earlier:
    #     model = FastText(data, size=WORDEMBSIZE, window=3, min_count=1, iter=10, sorted_vocab=1)
    hyperparams = dict(window=W2V_WINDOW, min_count=W2V_COUNT, size=WORDEMBSIZE)
    w2v = Word2Vec(**hyperparams)
    w2v.build_vocab(data)
    w2v.train(data, total_examples=len(data), epochs=W2V_EPOCH)
    print(w2v)
    return w2v

def makeEmbeddings(data, model, vocab):
    """Turn tokenised sentences into one embedding tensor per sentence.

    Args:
        data: iterable of sentences, each an iterable of words.
        model: embedding model; vectors are read via ``model.wv[word]`` and the
            embedding width via ``model.wv.vector_size``.
        vocab: container of known words. A word that is not in it is
            represented by an all-zero vector.

    Returns:
        list with one ``torch.float32`` tensor per sentence, each of shape
        ``(number_of_words, vector_size)``. An empty sentence yields a tensor
        of shape ``(0, vector_size)``.
    """
    # Take the width from the model itself so it can never disagree with the
    # vectors being copied in.
    dim = model.wv.vector_size
    vecData = []
    for sentence in data:
        words = list(sentence)
        # Pre-filled with zeros: rows of out-of-vocabulary words are simply left as-is.
        # One contiguous float32 array per sentence avoids the slow
        # list-of-ndarrays -> tensor conversion and mixed float32/float64 rows.
        matrix = np.zeros((len(words), dim), dtype=np.float32)
        for row, word in enumerate(words):
            if word in vocab:
                matrix[row] = model.wv[word]
        vecData.append(torch.from_numpy(matrix))
    return vecData

In [16]:
# Split 80/20, train the embedding model on the training split only, then embed
# both splits with it.
X_words_train, X_words_test, y_train, y_test = train_test_split(X_words, y, train_size=0.8, test_size=0.2, random_state=42)
emb_model = makeEmbModel(X_words_train)
# The usable vocabulary is whatever the trained model kept, so it is read
# straight from the model (iterating the mapping yields its keys, i.e. the words).
vocab = set(emb_model.wv.vocab)
X_train = makeEmbeddings(X_words_train, emb_model, vocab)
X_test = makeEmbeddings(X_words_test, emb_model, vocab)

print(X_words_train[0])
print(X_train[0])
print(y_train[0])

Word2Vec(vocab=32885, size=100, alpha=0.025)
['llores', 'bu']
tensor([[ 0.0726,  0.3071,  0.1215,  0.1497,  0.0015, -0.1999, -0.0821, -0.0113,
         -0.1436, -0.2638, -0.0035, -0.2923, -0.0427,  0.2550,  0.0135,  0.0641,
          0.2646, -0.1737,  0.0140, -0.1118,  0.0473,  0.1453,  0.0037, -0.1263,
         -0.0384, -0.0687, -0.0493, -0.1431, -0.1057, -0.1498, -0.1081,  0.0979,
          0.0348, -0.0672, -0.0906,  0.2044, -0.0427,  0.0794,  0.1522, -0.3557,
          0.0892, -0.0883, -0.0411,  0.0468, -0.0293,  0.0626, -0.0558, -0.0076,
          0.2377,  0.0057, -0.1611, -0.0549, -0.1242, -0.0042,  0.0562, -0.1509,
          0.0740, -0.1027,  0.2137, -0.1154, -0.0855, -0.0365,  0.1870, -0.0524,
         -0.1420,  0.0887, -0.0350,  0.2134,  0.1544,  0.1406,  0.0346,  0.0072,
         -0.0298,  0.0851,  0.1626, -0.1545,  0.0430,  0.0725, -0.1892, -0.1618,
         -0.0782, -0.1726,  0.0180,  0.0380,  0.1582,  0.0925, -0.0155, -0.0879,
         -0.1873,  0.0706,  0.0046, -0.2998, -0

In [17]:
# RNN model
class RNN(nn.Module):
    """Single-layer, unidirectional RNN classifier over a sequence of embeddings.

    The final hidden state of the recurrent layer is fed through a linear layer
    and a log-softmax, giving log-probabilities over `output_dim` classes.

    Args:
        input_dim: size of each input vector.
        h: hidden state size.
        output_dim: number of classes (default 15).
    """

    def __init__(self, input_dim, h, output_dim = 15):
        super().__init__()
        # nn.RNN only applies `dropout` between stacked layers; with a single
        # layer it does nothing except emit a warning, so it is not passed.
        self.rnn = nn.RNN(input_dim, h)
        self.finalLayer = nn.Linear(h, output_dim)
        self.input_dim = input_dim
        self.hidden_dim = h
        self.output_dim =  output_dim
        # forward() always hands a 2-D (batch, classes) tensor to the softmax,
        # so normalise over dim 1 explicitly instead of relying on the
        # deprecated implicit dimension.
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_loss(self, predicted_vector, gold_label):
        """Negative log-likelihood of `gold_label` under `predicted_vector` (log-probabilities)."""
        return self.loss(predicted_vector, gold_label)

    def forward(self, inputs):
        """Return log-probabilities of shape (batch, output_dim).

        `inputs` is passed unchanged to nn.RNN (default layout, i.e.
        (seq_len, batch, input_dim)).
        """
        _, hidden = self.rnn(inputs)
        # (1, batch, hidden_dim) -> (batch, hidden_dim)
        hidden = hidden.contiguous().view(-1, self.hidden_dim)
        return self.softmax(self.finalLayer(hidden))
    
class biRNN(nn.Module):
    """Single-layer bidirectional RNN classifier over a sequence of embeddings.

    The final hidden states of the forward and backward directions are
    concatenated per sample, then fed through a linear layer and a log-softmax,
    giving log-probabilities over `output_dim` classes.

    Args:
        input_dim: size of each input vector.
        h: hidden state size of each direction.
        output_dim: number of classes (default 15).
    """

    def __init__(self, input_dim, h, output_dim = 15):
        # Must initialise *this* class; super(RNN, self) raised a TypeError
        # because biRNN does not inherit from RNN.
        super().__init__()
        # nn.RNN only applies `dropout` between stacked layers; with a single
        # layer it does nothing except emit a warning, so it is not passed.
        self.rnn = nn.RNN(input_dim, h, bidirectional=True)
        self.finalLayer = nn.Linear(h * 2, output_dim)
        self.input_dim = input_dim
        self.hidden_dim = h
        self.output_dim =  output_dim
        # forward() always hands a 2-D (batch, classes) tensor to the softmax.
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()

    def compute_loss(self, predicted_vector, gold_label):
        """Negative log-likelihood of `gold_label` under `predicted_vector` (log-probabilities)."""
        return self.loss(predicted_vector, gold_label)

    def forward(self, inputs):
        """Return log-probabilities of shape (batch, output_dim).

        `inputs` is passed unchanged to nn.RNN (default layout, i.e.
        (seq_len, batch, input_dim)).
        """
        _, hidden = self.rnn(inputs)
        # hidden is (2, batch, hidden_dim): one final state per direction.
        # Bring batch to the front and flatten the two directions together so
        # each row is [forward_state, backward_state], matching finalLayer's
        # expected 2 * hidden_dim input.
        hidden = hidden.transpose(0, 1).contiguous().view(-1, 2 * self.hidden_dim)
        return self.softmax(self.finalLayer(hidden))

In [20]:
# running epochs for training and validation
HIDDEN_DIM = 124
EPOCHS = 15
minibatch_size = 64


def _run_epoch(model, X, y, batch_size, optimizer=None):
    """Run one pass over (X, y) in full minibatches.

    When `optimizer` is given, the mean loss of every minibatch is
    back-propagated and a step is taken; otherwise the data is only scored.
    The trailing partial minibatch (len(y) % batch_size samples) is skipped.

    Returns:
        (mean per-sample loss, accuracy) as plain Python floats, computed over
        the samples that were actually processed.
    """
    training = optimizer is not None
    loss_sum = 0.0
    correct = total = 0
    for minibatch_idx in tqdm(range(len(y) // batch_size)):
        first = minibatch_idx * batch_size
        if training:
            optimizer.zero_grad()
        batch_losses = []
        for idx in range(first, first + batch_size):
            # (n_words, emb) -> (n_words, 1, emb): a batch of one sequence
            text = torch.unsqueeze(X[idx], 1)
            labelIdx = y[idx]
            log_probs = model(text)
            text_loss = model.compute_loss(log_probs.view(1, -1), torch.tensor([labelIdx]))
            # .item() keeps only the number; accumulating the tensor itself
            # would keep every sample's autograd graph alive for the whole epoch.
            loss_sum += text_loss.item()
            if training:
                batch_losses.append(text_loss)
            correct += int(torch.argmax(log_probs).item() == labelIdx)
            total += 1
        if training:
            torch.stack(batch_losses).mean().backward()
            optimizer.step()
    seen = max(total, 1)  # guards against fewer samples than one minibatch
    return loss_sum / seen, correct / seen


model = RNN(WORDEMBSIZE, HIDDEN_DIM, 15)
optimizer = optim.SGD(model.parameters(),lr=0.01, weight_decay=0.01)
train_loss = []
train_acc = []
test_loss = []
test_acc = []
for epoch in range(EPOCHS):
    print("\n\n-------------")
    print("EPOCH: {}".format(epoch + 1))
    model.train()
    print("Training started for epoch: {}".format(epoch + 1))
    X_train, y_train = shuffle(X_train, y_train)
    start_time = time.time()
    epoch_loss, epoch_acc = _run_epoch(model, X_train, y_train, minibatch_size, optimizer)
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)
    print("Training completed for epoch: {}".format(epoch + 1))
    print("Time for train: {}".format(time.time() - start_time))
    print("Accuracy: {} Loss: {}".format(epoch_acc, epoch_loss))

    #validation
    model.eval()
    print("Validation started for epoch: {}".format(epoch + 1))
    X_test, y_test = shuffle(X_test, y_test)
    start_time = time.time()
    # No gradients are needed for scoring; skipping them saves time and memory.
    with torch.no_grad():
        curr_loss, curr_acc = _run_epoch(model, X_test, y_test, minibatch_size)
    test_loss.append(curr_loss)
    test_acc.append(curr_acc)
    print("Validation completed for epoch: {}".format(epoch + 1))
    print("Time for validation: {}".format(time.time() - start_time))
    print("Accuracy: {} Loss: {}".format(curr_acc, curr_loss))
    # Early stopping: once more than three epochs are recorded, stop if the
    # validation loss is no better than it was two epochs ago.
    if len(test_loss) > 3 and curr_loss >= test_loss[-3]:
        print("Stopping progress: no longer learning for validation")
        break

# Per-epoch learning curves. Values are coerced to plain floats so plotting
# works whether the history lists hold Python numbers or 0-dim tensors.
plt.plot([float(v) for v in train_loss], label='train loss')
plt.plot([float(v) for v in test_loss], label='test loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()

plt.plot([float(v) for v in train_acc], label='train acc')
plt.plot([float(v) for v in test_acc], label='test acc')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()



-------------
EPOCH: 1
Training started for epoch: 1


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=908.0), HTML(value='')))




Training completed for epoch: 1
Time for train: 91.91886281967163
Accuracy: 0.11730795704845814 Loss: 2.649101972579956
Validation started for epoch: 1


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=227.0), HTML(value='')))


Validation completed for epoch: 1
Time for validation: 8.825459241867065
Accuracy: 0.14743942731277532 Loss: 2.5859882831573486


-------------
EPOCH: 2
Training started for epoch: 2


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=908.0), HTML(value='')))


Training completed for epoch: 2
Time for train: 75.3097620010376
Accuracy: 0.16611026982378854 Loss: 2.534013271331787
Validation started for epoch: 2


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=227.0), HTML(value='')))


Validation completed for epoch: 2
Time for validation: 8.72238302230835
Accuracy: 0.17084251101321585 Loss: 2.5054945945739746


-------------
EPOCH: 3
Training started for epoch: 3


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=908.0), HTML(value='')))


Training completed for epoch: 3
Time for train: 74.24760603904724
Accuracy: 0.18357654185022027 Loss: 2.4835872650146484
Validation started for epoch: 3


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=227.0), HTML(value='')))


Validation completed for epoch: 3
Time for validation: 8.519360065460205
Accuracy: 0.1823375550660793 Loss: 2.47572660446167


-------------
EPOCH: 4
Training started for epoch: 4


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=908.0), HTML(value='')))


Training completed for epoch: 4
Time for train: 74.639328956604
Accuracy: 0.19085558920704845 Loss: 2.461818218231201
Validation started for epoch: 4


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=227.0), HTML(value='')))


Validation completed for epoch: 4
Time for validation: 8.735416889190674
Accuracy: 0.18626101321585903 Loss: 2.4600400924682617


-------------
EPOCH: 5
Training started for epoch: 5


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=908.0), HTML(value='')))


Training completed for epoch: 5
Time for train: 79.23219704627991
Accuracy: 0.19429721916299558 Loss: 2.450086832046509
Validation started for epoch: 5


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=227.0), HTML(value='')))


Validation completed for epoch: 5
Time for validation: 10.038591861724854
Accuracy: 0.18915198237885464 Loss: 2.451871156692505


-------------
EPOCH: 6
Training started for epoch: 6


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=908.0), HTML(value='')))


Training completed for epoch: 6
Time for train: 90.93724489212036
Accuracy: 0.19789372246696035 Loss: 2.4431374073028564
Validation started for epoch: 6


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=227.0), HTML(value='')))


Validation completed for epoch: 6
Time for validation: 8.413643836975098
Accuracy: 0.19114812775330398 Loss: 2.4468648433685303


-------------
EPOCH: 7
Training started for epoch: 7


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=908.0), HTML(value='')))


Training completed for epoch: 7
Time for train: 76.35072803497314
Accuracy: 0.19882296255506607 Loss: 2.438063144683838
Validation started for epoch: 7


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=227.0), HTML(value='')))


Validation completed for epoch: 7
Time for validation: 8.796296834945679
Accuracy: 0.19094162995594713 Loss: 2.444852828979492


-------------
EPOCH: 8
Training started for epoch: 8


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=908.0), HTML(value='')))


Training completed for epoch: 8
Time for train: 75.76262187957764
Accuracy: 0.20044052863436124 Loss: 2.4353044033050537
Validation started for epoch: 8


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=227.0), HTML(value='')))


Validation completed for epoch: 8
Time for validation: 8.891098260879517
Accuracy: 0.19128579295154186 Loss: 2.441119432449341


-------------
EPOCH: 9
Training started for epoch: 9


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=908.0), HTML(value='')))




KeyboardInterrupt: 