In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import emoji
from os.path import join
import json
import time

from sklearn.utils import shuffle
from gensim.models import Word2Vec, FastText
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the tweet corpus: one tweet per line, whitespace-trimmed, blank lines dropped.
DATA_PATH = join('..','data','twitter-data-cleaned.txt')
with open(DATA_PATH, 'r',  encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    data = [tweet for tweet in stripped_lines if tweet != '']
print('Tweets')
print(data[:3])
print(len(data))

# Load the JSON list describing the emojis we want to predict and keep
# only the 'char' field of each entry.
SELECTED_EMOJIS_PATH = join('..','data','best-emojis.json')
with open(SELECTED_EMOJIS_PATH, 'r', encoding='utf-8') as f:
    EMOJIS = json.load(f)
EMOJI_CHARS = [entry['char'] for entry in EMOJIS]
print('Chosen emojis')
print(EMOJI_CHARS)
print(len(EMOJI_CHARS))

# Every emoji the `emoji` package knows about, obtained by running emojize
# over each value of its UNICODE_EMOJI table.
ALL_EMOJIS = {emoji.emojize(code) for code in emoji.UNICODE_EMOJI.values()}

FileNotFoundError: [Errno 2] No such file or directory: '../data/twitter-data-cleaned.txt'

In [6]:
# preprocessing the data
from parsing import Tokenizer, TokenType, Token

# Number of neighbouring words requested from the tokenizer for each emoji.
CONTEXT_WORDS = 8

tokenizer = Tokenizer(EMOJI_CHARS)
# emoji char -> list of word-context samples collected for that emoji
context = dict((emoji_char, []) for emoji_char in EMOJI_CHARS)
# emoji char -> integer class id (its position in EMOJI_CHARS)
emojiToId = dict((emoji_char, idx) for idx, emoji_char in enumerate(EMOJI_CHARS))

for tweet in data:
    tokens = tokenizer.tokenize(tweet)
    for position, token in enumerate(tokens):
        if not (token.token_type == TokenType.EMOJIS):
            continue
        closest = tokenizer.findClosestNWords(CONTEXT_WORDS, tokens, position)
        # Skip emojis for which the tokenizer returned no surrounding words.
        if closest:
            context[token.raw].append(closest)

# Quick sanity dump: per emoji, the sample count and the first three samples.
for emoji_char, samples in context.items():
    print(emoji_char)
    print(len(samples))
    print(samples[:3])

😂
8440
[['yayyy', 'go', 'go', 'lol', 'postmen', 'familiar', 'face', 'time'], ['bollywood', 'reality'], ['brother', 'know', 'always', 'got', 'even', 'though', 'debates', 'blasphemy']]
😍
7329
[['fitnes', 'week', 'studio', 'figura', 'start', 'today', 'choose', 'package'], ['200', 'days', 'go'], ['200', 'days', 'go']]
😭
7428
[['thank', 'lol', 'procrastination', 'almost'], ['change', 'first', 'diaper', 'uncle', 'year'], ['deposit', '52', 'yuan', 'set', 'must', 'buy']]
😊
6335
[['take', 'moment', 'today', 'credit', 'stacie', 'swift', 'edutwitter', 'teachertwitter'], ['fantastic', 'well', 'done', 'ladies'], ['congratulations', 'fam', 'best']]
💕
6480
[['happy', 'birthday', 'mochiiii', 'saranghae', 'take', 'care', 'self', 'see'], ['got', 'soon', 'youre', 'done'], ['got']]
😒
5852
[['dunk', 'jonathan', '️', '️', '️', 'dunk', 'ang', 'hina'], ['starbucks', 'however', 'one', 'drink', 'comes', '17', 'uber', 'eats'], ['starbucks', 'however', 'one', 'drink', 'comes', '17', 'uber', 'eats']]
😉
5905
[['fit

In [7]:
# RNN building to predict result
import torch
import torch.nn as nn
import torch.optim as optim
import random
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

In [8]:
# making X and y for RNN
# X_words holds the word lists, y the matching emoji class ids.
X_words = []
y = []
# Space-joined form of every sample already kept; a sample whose words
# repeat an earlier one is skipped, so only its first emoji label is used.
sentences = set()
for emoji_char, samples in context.items():
    for sample in samples:
        key = ' '.join(sample)
        if key not in sentences:
            sentences.add(key)
            X_words.append(sample)
            y.append(emojiToId[emoji_char])
print(X_words[0])
print(len(y))

['yayyy', 'go', 'go', 'lol', 'postmen', 'familiar', 'face', 'time']
71857


In [9]:
# making word embeddings for RNN
# Placeholder token that makeVocab always inserts into the vocabulary.
UNK = "<unk>"
# Embedding dimension: passed to Word2Vec as `size` and used as the
# recurrent model's input size in the training cell.
WORDEMBSIZE = 100
# Passed to Word2Vec as `window`.
W2V_WINDOW = 7
# Passed to Word2Vec as `min_count`.
W2V_COUNT = 1
# Passed to Word2Vec.train as `epochs`.
W2V_EPOCH=100

def makeVocab(text):
    """Collect the set of distinct words in ``text`` plus the UNK token.

    ``text`` is an iterable of sentences, each itself an iterable of words.
    """
    vocab = {UNK}
    for sentence in text:
        vocab.update(sentence)
    return vocab

def makeEmbModel(data):
    """Train a Word2Vec model on ``data`` and return it.

    ``data`` must support ``len()`` because its length is passed as
    ``total_examples``; the call site passes a list of token lists.
    The model is printed once training finishes.
    """
    w2v_settings = dict(window=W2V_WINDOW, min_count=W2V_COUNT, size=WORDEMBSIZE)
    embedder = Word2Vec(**w2v_settings)
    embedder.build_vocab(data)
    embedder.train(data, total_examples=len(data), epochs=W2V_EPOCH)
    print(embedder)
    return embedder

def makeEmbeddings(data, model, vocab):
    """Convert tokenized sentences into per-sentence embedding tensors.

    Args:
        data: iterable of sentences, each an iterable of word strings.
        model: trained embedding model exposing ``model.wv[word]`` and
            ``model.wv.vector_size`` (as gensim KeyedVectors do).
        vocab: container of words that have an embedding; any word not in
            it is represented by an all-zero vector.

    Returns:
        A list with one float32 tensor per sentence, of shape
        ``(len(sentence), vector_size)``. An empty sentence yields a
        ``(0, vector_size)`` tensor, so every result stays 2-D.
    """
    dim = model.wv.vector_size
    vecData = []
    for sentence in data:
        words = list(sentence)
        # Preallocate as float32 zeros: out-of-vocabulary rows are already
        # correct, and this avoids the slow list-of-ndarrays -> tensor path.
        matrix = np.zeros((len(words), dim), dtype=np.float32)
        for row, word in enumerate(words):
            if word in vocab:
                matrix[row] = model.wv[word]
        vecData.append(torch.from_numpy(matrix))
    return vecData

In [10]:
# continue making word embeddings
# 80/20 train/test split with a fixed seed so the split is reproducible.
X_words_train, X_words_test, y_train, y_test = train_test_split(X_words, y, train_size=0.8, test_size=0.2, random_state=42)
# Word2Vec is fitted on the training sentences only.
emb_model = makeEmbModel(X_words_train)
# The vocabulary is the set of words Word2Vec kept. The earlier
# makeVocab(X_words_train) result was overwritten by this line before ever
# being read, so that call has been dropped.
vocab = set(emb_model.wv.vocab)
X_train = makeEmbeddings(X_words_train, emb_model, vocab)
X_test = makeEmbeddings(X_words_test, emb_model, vocab)

print(X_words_train[0])
print(X_train[0])
print(y_train[0])

Word2Vec(vocab=37159, size=100, alpha=0.025)
['vdb', 'mctominay', 'abeg']
tensor([[ 4.3793e-01,  1.6831e-01,  6.2822e-01, -3.6764e-01, -1.6899e-01,
         -1.3530e-01,  1.0859e-01,  4.3041e-01, -1.4167e+00, -8.5227e-02,
         -1.7068e-01, -9.7426e-02,  9.0586e-01,  1.0322e-01,  1.1119e+00,
          2.9117e-01,  5.5843e-01, -6.5334e-01, -4.6940e-03, -1.4686e-01,
          3.5751e-01, -5.1219e-01, -2.3945e-01,  1.8382e-01, -7.0136e-01,
          1.3294e+00,  3.3104e-01, -6.6811e-02,  8.7594e-01,  1.5579e-01,
         -4.4400e-01,  1.8207e-01, -6.8612e-01, -9.0043e-02, -1.6902e-01,
         -1.9993e-02, -8.2884e-01,  4.2203e-01, -4.3094e-01, -4.0533e-01,
         -4.9470e-01,  3.4956e-04,  1.3146e+00,  4.3590e-01, -3.5004e-01,
          1.0015e-01, -5.5199e-01, -9.9790e-01, -2.7704e-01,  5.5739e-01,
          3.2614e-01,  5.1592e-01, -5.5762e-01, -4.8401e-01, -5.5807e-01,
         -2.3866e-01, -3.6170e-01, -1.3111e-01,  3.1320e-01,  6.3173e-01,
         -6.3873e-02, -3.1641e-01,  6.

In [11]:
# RNN model
class RNN(nn.Module):
    """Single-layer Elman RNN classifier over a sequence of word embeddings.

    Args:
        input_dim: size of each input embedding vector.
        h: size of the recurrent hidden state.
        output_dim: number of target classes (default 15).

    ``forward`` expects input laid out as ``(seq_len, batch, input_dim)``
    (nn.RNN's default, ``batch_first=False``) and returns log-probabilities
    of shape ``(batch, output_dim)`` computed from the final hidden state.
    """
    def __init__(self, input_dim, h, output_dim = 15):
        super(RNN, self).__init__()
        # No `dropout=` here: nn.RNN only applies it between stacked layers,
        # so with one layer the previous dropout=0.2 did nothing but warn.
        self.rnn = nn.RNN(input_dim, h)
        self.finalLayer = nn.Linear(h, output_dim)
        self.input_dim = input_dim
        self.hidden_dim = h
        self.output_dim =  output_dim
        # Explicit dim: normalise over the class axis of the (batch, classes)
        # logits instead of relying on the deprecated implicit choice.
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()
    
    def compute_loss(self, predicted_vector, gold_label):
        """Negative log-likelihood of ``gold_label`` under ``predicted_vector``."""
        return self.loss(predicted_vector, gold_label)
    
    def forward(self, inputs):
        """Return log-probabilities, shape ``(batch, output_dim)``."""
        out, hidden = self.rnn(inputs)
        # hidden is (1, batch, h) for a one-layer, one-direction RNN.
        hidden = hidden.contiguous().view(-1,self.hidden_dim)
        predicted_vector = self.softmax(self.finalLayer(hidden))
        return predicted_vector
    
class biRNN(nn.Module):
    """Single-layer bidirectional Elman RNN classifier.

    Args:
        input_dim: size of each input embedding vector.
        h: hidden-state size per direction.
        output_dim: number of target classes (default 15).

    ``forward`` expects input laid out as ``(seq_len, batch, input_dim)``
    (nn.RNN's default, ``batch_first=False``) and returns log-probabilities
    of shape ``(batch, output_dim)``.
    """
    def __init__(self, input_dim, h, output_dim = 15):
        super(biRNN, self).__init__()
        # No `dropout=` here: nn.RNN only applies it between stacked layers,
        # so with one layer the previous dropout=0.2 did nothing but warn.
        self.rnn = nn.RNN(input_dim, h, bidirectional=True)
        # The classifier sees both directions' final states concatenated.
        self.finalLayer = nn.Linear(2 * h, output_dim)
        self.input_dim = input_dim
        self.hidden_dim = h
        self.output_dim =  output_dim
        # Explicit dim: normalise over the class axis of the (batch, classes) logits.
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()
    
    def compute_loss(self, predicted_vector, gold_label):
        """Negative log-likelihood of ``gold_label`` under ``predicted_vector``."""
        return self.loss(predicted_vector, gold_label)
    
    def forward(self, inputs):
        """Return log-probabilities, shape ``(batch, output_dim)``."""
        out, hidden = self.rnn(inputs)
        # hidden is (2, batch, h): one final state per direction. Flattening
        # it to (-1, h) gave two rows per sample (2 * output_dim scores after
        # the caller's view(1, -1)). Put batch first and merge the two
        # directions into a single (batch, 2h) feature vector instead.
        hidden = hidden.transpose(0, 1).contiguous().view(-1, 2 * self.hidden_dim)
        predicted_vector = self.softmax(self.finalLayer(hidden))
        return predicted_vector
    
class GRU(nn.Module):
    """Single-layer GRU classifier over a sequence of word embeddings.

    Args:
        input_dim: size of each input embedding vector.
        h: size of the recurrent hidden state.
        output_dim: number of target classes (default 15).

    ``forward`` expects input laid out as ``(seq_len, batch, input_dim)``
    (nn.GRU's default, ``batch_first=False``) and returns log-probabilities
    of shape ``(batch, output_dim)`` computed from the final hidden state.
    """
    def __init__(self, input_dim, h, output_dim = 15):
        super(GRU, self).__init__()
        # No `dropout=` here: nn.GRU only applies it between stacked layers,
        # so with one layer the previous dropout=0.2 did nothing but warn.
        self.rnn = nn.GRU(input_dim, h)
        self.finalLayer = nn.Linear(h, output_dim)
        self.input_dim = input_dim
        self.hidden_dim = h
        self.output_dim =  output_dim
        # Explicit dim: normalise over the class axis of the (batch, classes)
        # logits instead of relying on the deprecated implicit choice.
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()
    
    def compute_loss(self, predicted_vector, gold_label):
        """Negative log-likelihood of ``gold_label`` under ``predicted_vector``."""
        return self.loss(predicted_vector, gold_label)
    
    def forward(self, inputs):
        """Return log-probabilities, shape ``(batch, output_dim)``."""
        out, hidden = self.rnn(inputs)
        # hidden is (1, batch, h) for a one-layer, one-direction GRU.
        hidden = hidden.contiguous().view(-1,self.hidden_dim)
        predicted_vector = self.softmax(self.finalLayer(hidden))
        return predicted_vector

In [1]:
# running epochs for training and validation
HIDDEN_DIM = 124
EPOCHS = 15
minibatch_size = 64

# model = RNN(WORDEMBSIZE, HIDDEN_DIM, 15)
# model = biRNN(WORDEMBSIZE, HIDDEN_DIM, 15)
model = GRU(WORDEMBSIZE, HIDDEN_DIM, 15)
optimizer = optim.SGD(model.parameters(),lr=0.01)
# Per-epoch history. Values are plain Python floats (never graph-attached
# tensors) so they hold no autograd memory and matplotlib can plot them.
train_loss = []
train_acc = []
test_loss = []
test_acc = []
for epoch in range(EPOCHS):
    print("\n\n-------------")
    print("EPOCH: {}".format(epoch + 1))

    # ---- training ----
    running_loss = 0.0
    model.train()
    print("Training started for epoch: {}".format(epoch + 1))
    X_train, y_train = shuffle(X_train, y_train)
    start_time = time.time()
    correct = total = 0
    N = len(y_train)
    # Only full minibatches are used; the N % minibatch_size samples at the
    # end of this epoch's shuffle are skipped.
    for minibatch_idx in tqdm(range(N // minibatch_size)):
        optimizer.zero_grad()
        loss = None
        for idx in range(minibatch_size):
            sample_idx = minibatch_idx * minibatch_size + idx
            # (seq_len, emb) -> (seq_len, 1, emb): a batch of one sequence.
            text = torch.unsqueeze(X_train[sample_idx], 1)
            labelIdx = y_train[sample_idx]
            log_probs = model(text)
            text_loss = model.compute_loss(log_probs.view(1,-1), torch.tensor([labelIdx]))
            # .item() detaches the value so the epoch total does not keep
            # every sample's computation graph alive.
            running_loss += text_loss.item()
            # Out-of-place add: `loss += ...` would modify the first
            # sample's loss tensor in place.
            loss = text_loss if loss is None else loss + text_loss
            pred_label = torch.argmax(log_probs).item()
            correct += int(pred_label == labelIdx)
            total += 1
        loss = loss / minibatch_size
        loss.backward()
        optimizer.step()
    # Average over the samples actually processed, not over N.
    train_loss.append(running_loss / max(total, 1))
    train_acc.append(correct / max(total, 1))
    print("Training completed for epoch: {}".format(epoch + 1))
    print("Time for train: {}".format(time.time() - start_time))
    print("Accuracy: {} Loss: {}".format(train_acc[-1], train_loss[-1]))
    
    #validation
    running_loss = 0.0
    model.eval()
    print("Validation started for epoch: {}".format(epoch + 1))
    X_test, y_test = shuffle(X_test, y_test)
    start_time = time.time()
    correct = total = 0
    N = len(y_test)
    # No gradients are needed for evaluation; skipping them saves memory
    # and time.
    with torch.no_grad():
        for minibatch_idx in tqdm(range(N // minibatch_size)):
            for idx in range(minibatch_size):
                sample_idx = minibatch_idx * minibatch_size + idx
                text = torch.unsqueeze(X_test[sample_idx], 1)
                labelIdx = y_test[sample_idx]
                log_probs = model(text)
                text_loss = model.compute_loss(log_probs.view(1,-1), torch.tensor([labelIdx]))
                running_loss += text_loss.item()
                pred_label = torch.argmax(log_probs).item()
                correct += int(pred_label == labelIdx)
                total += 1
    curr_loss = running_loss / max(total, 1)
    test_loss.append(curr_loss)
    test_acc.append(correct / max(total, 1))
    print("Validation completed for epoch: {}".format(epoch + 1))
    print("Time for validation: {}".format(time.time() - start_time))
    print("Accuracy: {} Loss: {}".format(test_acc[-1], curr_loss))
    # Early stopping: once more than three epochs are recorded, stop if the
    # current validation loss is no better than it was two epochs ago.
    if len(test_loss) > 3 and curr_loss >= test_loss[-3]:
        print("Stopping progress: no longer learning for validation")
        break

plt.plot(train_loss, label='train loss')
plt.plot(test_loss, label='test loss')
plt.xlabel('epoch')
plt.ylabel('mean NLL loss')
plt.legend()
plt.show()

plt.plot(train_acc, label='train acc')
plt.plot(test_acc, label='test acc')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()

NameError: name 'GRU' is not defined