In [19]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import emoji
from gensim.models import Word2Vec
import torch
import time

In [12]:
UNK = "<unk>"
empty = "<empty>"
wordEmbSize = 64
data = pd.read_csv("data3.csv")

# Data Preprocessing

+ Cleaning by using nltk word tokenizer and lemmatizer
+ Adds spaces to emojis to separate them to different words using emoji library's re
+ Add a start and end token
+ Build vocab for words
+ Build vocab for emojis
+ Makes labels as 0 or 1 for each word. If label is 1, means that word is followed by an emoji

In [18]:
RE_EMOJI = emoji.get_emoji_regexp()
tokenizer = nltk.word_tokenize
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

# tokens normalized
def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

#converting text to words
def preprocessing(data, train=True):
    newData = {"words":[], "labels":[]}
    for text in data["texts"]:
        #converting to words
        emoji_split = RE_EMOJI.split(text)
        emoji_split = [x.strip() for x in emoji_split if x]
        text = " ".join(emoji_split)
        textWords = LemNormalize(text)
        textWords.insert(0,"<s>")
        textWords.append("</s>")
        newData["words"].append(textWords)
        
        #getting labels
        labels = []
        if train:
            for i in range(1, len(textWords)):
                word = textWords[i]
                if RE_EMOJI.match(word):
                    labels.append(1)
                else:
                    labels.append(0)
            labels.append(0)
        else:
            labels = [0 * len(textWords)]
        newData["labels"].append(labels)
    return pd.DataFrame(newData)

def make_vocabs(data):
    vocab = set()
    vocab.add(UNK)
    emojiVocab = set()
    emojiVocab.add(empty)
    for text in data["words"]:
        for word in text:
            vocab.add(word)
            if RE_EMOJI.match(word):
                emojiVocab.add(word)
    return vocab, emojiVocab
        
train = preprocessing(data, True)
vocab, emojiVocab = make_vocabs(train)
vocabIdx = {word : i for i, word in enumerate(vocab)}
eVocabIdx = {emoji : i for i, emoji in enumerate(emojiVocab)}

# Word Embeddings

+ Using gensim's Word2Vec
+ Builds model with word embedding size specified earlier

In [5]:
def getEmbModel(data, vocab):
    docs = [[UNK]]
    docs.extend(data["words"])
    model = Word2Vec(docs, min_count = 1, size = wordEmbSize)
    print(model)
    return model

# Takes: dataset, word2vec model, and vocabulary from training
# returns: list of tuples. First value is a torch of word embeddings for that sentence,
# second value is the labels for each word
def getEmb(data, model, vocab):
    vecData = []
    for text,y in zip(data["words"],data["labels"]):
        wordEmb = []
        for word in text:
            if word in vocab:
                wordEmb.append(model[word])
            else:
                wordEmb.append(model[UNK])
        wordEmb = torch.FloatTensor(wordEmb)
        vecData.append((wordEmb, y))
    return vecData

model = getEmbModel(train, vocab)
trainEmb = getEmb(train, model, vocab)

Word2Vec(vocab=19720, size=64, alpha=0.025)




In [8]:
sample = train.sample(5)
for index,row in sample.iterrows():
    print(row["words"])
    print(row["labels"])
    print(trainEmb[index][0])

['<s>', '“', 'damnn', 'gurl', 'you', 'fine', 'a', 'fuck', '💦', '💦', '💦', 'did', 'you', 'fall', 'from', 'heaven', 'because', 'you', 'have', 'the', 'phattest', 'as', 'on', 'god', '🍑', '🍑', 'you', 'do', 'track', 'cool', 'because', 'i', 'can', 'track', 'dat', 'as', 'a', 'u', 'and', 'me', 'go', 'crazy', 'babey', '😎', 'hey', 'you', 'forgot', 'to', 'fill', 'out', 'this', 'survey', 'haha', 'just', 'kidding', 'that', '’', 's', 'my', 'phone', '😎', 'gim', 'me', 'ur', 'number', 'so', 'we', 'can', 'talk', 'all', 'night', '😉', 'what', '’', 's', 'that', 'you', '’', 're', 'single', 'haha', 'i', 'never', 'knew', 'but', 'what', 'a', 'coincidence', 'let', '’', 's', 'get', 'not', 'single', 'together', '😎', 'i', 'may', 'not', 'run', 'but', 'i', '’', 'm', 'boutta', 'run', 'up', 'on', 'dat', 'azz', '😤', 'haha', 'what', '’', 's', 'that', 'you', 'aren', '’', 't', 'looking', 'for', 'a', 'boyfriend', 'well', 'that', '’', 's', 'perfect', 'because', 'i', '’', 'm', 'no', 'boy', '😎', '</s>']
[0, 0, 0, 0, 0, 0, 0, 1,

# Building supervised models to predict next emoji

+ RNN based architecture where we look at the hidden layer for every word
  + Using hidden layer, predict if is an emoji and what emoji it is
+ Asked TA from NLP class, they said this is similar to a language modelling problem where we only predict the set of emoji vocabulary
  + Could also view as sequence labelling where tag is next emoji or no emoji

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import random

jupyter = True
if jupyter:
    from tqdm.notebook import tqdm
else:
    from tqdm import tqdm

# Feedforward NN

+ https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
+ Built using previous couple of words
+ Feature is word embeddings using pytorch nn.Embedding. Use vocabulary and map words to index and emojis to index. Entire vocabulary is fed as input to the NN but output of NN is either emoji or no emoji

In [31]:
class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, eVocab_size, embedding_dim, context_size, hidden):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden)
        self.activation = nn.ReLU()
        self.softmax = nn.LogSoftmax()
        self.loss = nn.NLLLoss()
        self.linear2 = nn.Linear(hidden, eVocab_size)

    def compute_loss(self, predicted_vector, label):
        return self.loss(predicted_vector, label)
    
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = self.activation(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = self.softmax(out)
        return log_probs

In [45]:
def FFfeatures(data):
    trigrams = []
    numEmpty = 0
    total = 0
    for text in data["words"]:
        currTri = []
        for i in range(len(text) - 2):
            predictWord = text[i+2]
            if not RE_EMOJI.match(predictWord):
                predictWord = empty
                numEmpty += 1
            total += 1
            currTri.append([[text[i], text[i+1]], predictWord])
        trigrams.append(currTri)
    print("% empty:{}".format(numEmpty / total))
    return trigrams

In [46]:
train_feats = FFfeatures(train)

\% empty:0.7202584279710275


In [42]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

model = NGramLanguageModeler(len(vocab), len(emojiVocab), EMBEDDING_DIM, CONTEXT_SIZE, 128)
optimizer = optim.SGD(model.parameters(),lr=0.01, momentum=0.9)
for epoch in range(10):
    model.train()
    optimizer.zero_grad()
    print("Training started for epoch:{}".format(epoch + 1))
    random.shuffle(train_feats)
    start_time = time.time()
    correct = 0
    total = 0
    minibatch_size = 16
    N = len(train_feats)
    for minibatch_idx in tqdm(range(N // minibatch_size)):
        optimizer.zero_grad()
        loss = None
        for idx in range(minibatch_size):
            text = train_feats[minibatch_idx * minibatch_size + idx]
            for context, target in text:
                context_idx = torch.tensor([vocabIdx[w] for w in context], dtype=torch.long)
                log_probs = model(context_idx)
                idx_loss = model.compute_loss(log_probs, torch.tensor([eVocabIdx[target]]))
                
                if loss is None:
                    loss = idx_loss
                else:
                    loss += idx_loss
                predicted_label = torch.argmax(log_probs)
                correct += int(predicted_label == eVocabIdx[target])
                total += 1
        loss = loss / minibatch_size
        loss.backward()
        optimizer.step()
    print("Training completed for epoch:{}".format(epoch + 1))
    print("Time for train:{}".format(time.time() - start_time))
    print("Accuracy:{}".format(correct / total))






  0%|          | 0/76 [00:00<?, ?it/s][A[A[A[A[A

Training started for epoch:1







  1%|▏         | 1/76 [00:07<09:10,  7.34s/it][A[A[A[A[A




  3%|▎         | 2/76 [00:12<08:16,  6.71s/it][A[A[A[A[A




  4%|▍         | 3/76 [00:21<09:01,  7.41s/it][A[A[A[A[A




  5%|▌         | 4/76 [00:26<07:56,  6.62s/it][A[A[A[A[A




  7%|▋         | 5/76 [00:30<06:54,  5.83s/it][A[A[A[A[A




  8%|▊         | 6/76 [00:41<08:41,  7.45s/it][A[A[A[A[A




  9%|▉         | 7/76 [00:49<08:32,  7.43s/it][A[A[A[A[A




 11%|█         | 8/76 [00:56<08:20,  7.37s/it][A[A[A[A[A




 12%|█▏        | 9/76 [01:01<07:24,  6.64s/it][A[A[A[A[A




 13%|█▎        | 10/76 [01:04<06:05,  5.54s/it][A[A[A[A[A




 14%|█▍        | 11/76 [01:07<05:11,  4.79s/it][A[A[A[A[A




 16%|█▌        | 12/76 [01:11<04:52,  4.57s/it][A[A[A[A[A




 17%|█▋        | 13/76 [01:15<04:46,  4.54s/it][A[A[A[A[A




 18%|█▊        | 14/76 [01:24<06:03,  5.86s/it][A[A[A[A[A




 20%|█▉        | 15/76 [01:27<05:00,  4.93s/it][A[A[A[A[A





Training completed for epoch:1
Time for train:362.4646899700165
Accuracy:0.701060799748817
Training started for epoch:2







  1%|▏         | 1/76 [00:02<03:38,  2.91s/it][A[A[A[A[A




  3%|▎         | 2/76 [00:07<04:12,  3.41s/it][A[A[A[A[A




  4%|▍         | 3/76 [00:18<06:46,  5.56s/it][A[A[A[A[A




  5%|▌         | 4/76 [00:23<06:39,  5.55s/it][A[A[A[A[A




  7%|▋         | 5/76 [00:39<10:19,  8.73s/it][A[A[A[A[A




  8%|▊         | 6/76 [00:44<08:39,  7.42s/it][A[A[A[A[A




  9%|▉         | 7/76 [00:48<07:30,  6.54s/it][A[A[A[A[A




 11%|█         | 8/76 [00:53<06:45,  5.96s/it][A[A[A[A[A




 12%|█▏        | 9/76 [01:05<08:45,  7.84s/it][A[A[A[A[A




 13%|█▎        | 10/76 [01:10<07:42,  7.00s/it][A[A[A[A[A




 14%|█▍        | 11/76 [01:15<07:01,  6.49s/it][A[A[A[A[A




 16%|█▌        | 12/76 [01:24<07:30,  7.04s/it][A[A[A[A[A




 17%|█▋        | 13/76 [01:25<05:44,  5.46s/it][A[A[A[A[A




 18%|█▊        | 14/76 [01:28<04:50,  4.69s/it][A[A[A[A[A




 20%|█▉        | 15/76 [01:53<11:00, 10.84s/it][A[A[A[A[A





Training completed for epoch:2
Time for train:367.20807099342346
Accuracy:0.294724636639012
Training started for epoch:3







  1%|▏         | 1/76 [00:12<15:37, 12.49s/it][A[A[A[A[A




  3%|▎         | 2/76 [00:15<12:00,  9.73s/it][A[A[A[A[A




  4%|▍         | 3/76 [00:18<09:07,  7.50s/it][A[A[A[A[A




  5%|▌         | 4/76 [00:19<06:59,  5.82s/it][A[A[A[A[A




  7%|▋         | 5/76 [00:23<05:57,  5.04s/it][A[A[A[A[A




  8%|▊         | 6/76 [00:25<05:00,  4.30s/it][A[A[A[A[A




  9%|▉         | 7/76 [00:33<05:59,  5.21s/it][A[A[A[A[A




 11%|█         | 8/76 [00:36<05:13,  4.61s/it][A[A[A[A[A




 12%|█▏        | 9/76 [00:39<04:42,  4.22s/it][A[A[A[A[A




 13%|█▎        | 10/76 [00:53<07:59,  7.27s/it][A[A[A[A[A




 14%|█▍        | 11/76 [00:56<06:27,  5.97s/it][A[A[A[A[A




 16%|█▌        | 12/76 [00:59<05:14,  4.91s/it][A[A[A[A[A




 17%|█▋        | 13/76 [01:01<04:23,  4.18s/it][A[A[A[A[A




 18%|█▊        | 14/76 [01:10<05:36,  5.42s/it][A[A[A[A[A




 20%|█▉        | 15/76 [01:13<04:59,  4.90s/it][A[A[A[A[A





Training completed for epoch:3
Time for train:344.2728612422943
Accuracy:2.2773233252564267e-05
Training started for epoch:4







  1%|▏         | 1/76 [00:05<07:05,  5.67s/it][A[A[A[A[A




  3%|▎         | 2/76 [00:10<06:49,  5.54s/it][A[A[A[A[A




  4%|▍         | 3/76 [00:15<06:21,  5.23s/it][A[A[A[A[A




  5%|▌         | 4/76 [00:18<05:25,  4.52s/it][A[A[A[A[A




  7%|▋         | 5/76 [00:21<04:57,  4.19s/it][A[A[A[A[A




  8%|▊         | 6/76 [00:27<05:22,  4.60s/it][A[A[A[A[A




  9%|▉         | 7/76 [00:35<06:26,  5.60s/it][A[A[A[A[A




 11%|█         | 8/76 [00:39<05:59,  5.29s/it][A[A[A[A[A




 12%|█▏        | 9/76 [00:53<08:47,  7.88s/it][A[A[A[A[A




 13%|█▎        | 10/76 [00:57<07:20,  6.68s/it][A[A[A[A[A




 14%|█▍        | 11/76 [01:02<06:30,  6.01s/it][A[A[A[A[A




 16%|█▌        | 12/76 [01:07<06:13,  5.83s/it][A[A[A[A[A




 17%|█▋        | 13/76 [01:13<06:02,  5.76s/it][A[A[A[A[A




 18%|█▊        | 14/76 [01:17<05:34,  5.40s/it][A[A[A[A[A




 20%|█▉        | 15/76 [01:26<06:25,  6.32s/it][A[A[A[A[A





KeyboardInterrupt: 

# RNN/LSTM 

+ https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#sphx-glr-beginner-nlp-sequence-models-tutorial-py
+ 2 models, one to determine whether a word is a 