In [2]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import emoji
from gensim.models import Word2Vec
import torch
import time

In [3]:
# Placeholder token substituted for words that are not in the vocabulary.
UNK = "<unk>"
# Placeholder class in the emoji vocabulary used when the target is not an emoji.
empty = "<empty>"
# Dimensionality of the Word2Vec word embeddings.
wordEmbSize = 64
# Raw dataset; preprocessing reads its "texts" column.
data = pd.read_csv("data3.csv")

# Data Preprocessing

+ Cleaning by using nltk word tokenizer and lemmatizer
+ Pads emojis with spaces so each one becomes a separate token, using the emoji library's regex
+ Add a start and end token
+ Build vocab for words
+ Build vocab for emojis
+ Labels each word 0 or 1; a label of 1 means the word is followed by an emoji

In [4]:
# Regex pattern from the emoji library; used below to split text around emojis
# and to test whether a token is an emoji.
# NOTE(review): get_emoji_regexp() appears to have been removed in emoji 2.x —
# confirm the installed version before upgrading.
RE_EMOJI = emoji.get_emoji_regexp()
# Alias for NLTK's word tokenizer (not referenced by the other visible cells).
tokenizer = nltk.word_tokenize
# WordNet lemmatizer used by LemTokens.
lemmer = nltk.stem.WordNetLemmatizer()

def LemTokens(tokens):
    """Return the lemmatized form of every token, preserving the input order."""
    lemmas = []
    for tok in tokens:
        lemmas.append(lemmer.lemmatize(tok))
    return lemmas

# Translation table that deletes every character in string.punctuation.
remove_punct_dict = str.maketrans("", "", string.punctuation)

# tokens normalized
def LemNormalize(text):
    """Lower-case ``text``, delete ``string.punctuation`` characters, tokenize and lemmatize."""
    lowered = text.lower()
    without_punct = lowered.translate(remove_punct_dict)
    tokens = nltk.word_tokenize(without_punct)
    return LemTokens(tokens)

#converting text to words
def preprocessing(data, train=True):
    """Tokenize every text and build one 0/1 label per token.

    Args:
        data: Table-like object whose ``"texts"`` entry iterates over raw strings.
        train: When True, a token is labelled 1 if the token that follows it is
            an emoji and 0 otherwise. When False, every label is 0.

    Returns:
        pandas.DataFrame with two columns: ``"words"`` (token list wrapped in
        ``<s>`` / ``</s>``) and ``"labels"`` (list of ints, same length as the
        token list).
    """
    newData = {"words": [], "labels": []}
    for text in data["texts"]:
        # Split around emojis and re-join with spaces so that each emoji becomes
        # its own whitespace-delimited token before normalization.
        emoji_split = [piece.strip() for piece in RE_EMOJI.split(text) if piece]
        textWords = LemNormalize(" ".join(emoji_split))
        textWords.insert(0, "<s>")
        textWords.append("</s>")
        newData["words"].append(textWords)

        if train:
            # Label for position i describes token i + 1; the final token has no
            # successor, so it always gets 0.
            labels = [1 if RE_EMOJI.match(word) else 0 for word in textWords[1:]]
            labels.append(0)
        else:
            # One zero per token. The previous `[0 * len(textWords)]` evaluated
            # to the single-element list `[0]`.
            labels = [0] * len(textWords)
        newData["labels"].append(labels)
    return pd.DataFrame(newData)

def make_vocabs(data):
    """Collect the word vocabulary and the emoji vocabulary from ``data["words"]``.

    Returns:
        ``(vocab, emojiVocab)``: ``vocab`` is the set of all tokens plus ``UNK``;
        ``emojiVocab`` is the set of tokens matched by ``RE_EMOJI`` plus ``empty``.
    """
    vocab = {UNK}
    emojiVocab = {empty}
    for sentence in data["words"]:
        vocab.update(sentence)
        emojiVocab.update(tok for tok in sentence if RE_EMOJI.match(tok))
    return vocab, emojiVocab
        
# Tokenize the training texts and collect the word / emoji vocabularies.
train = preprocessing(data, True)
vocab, emojiVocab = make_vocabs(train)
# Iteration order of a set of strings can differ between interpreter runs
# (string hash randomization), so sort before numbering: the word -> index and
# emoji -> index maps are then identical after every kernel restart.
vocabIdx = {word: i for i, word in enumerate(sorted(vocab))}
eVocabIdx = {symbol: i for i, symbol in enumerate(sorted(emojiVocab))}

# Word Embeddings

+ Using gensim's Word2Vec
+ Builds model with word embedding size specified earlier

In [5]:
def getEmbModel(data, vocab):
    """Fit a Word2Vec model on the token lists in ``data["words"]`` and return it.

    A one-token sentence containing only ``UNK`` is placed at the front of the
    corpus; with ``min_count = 1`` it is kept in the model's vocabulary.
    ``vocab`` is accepted but not used. The fitted model is printed.
    """
    corpus = [[UNK], *data["words"]]
    w2v = Word2Vec(corpus, min_count = 1, size = wordEmbSize)
    print(w2v)
    return w2v

def getEmb(data, model, vocab):
    """Convert every sentence in ``data`` into a tensor of word embeddings.

    Args:
        data: Table-like object with parallel ``"words"`` (token lists) and
            ``"labels"`` entries.
        model: Trained Word2Vec model, or any mapping from word to vector. If the
            object exposes a ``wv`` attribute the vectors are read from it
            (indexing the Word2Vec model itself is deprecated); otherwise
            ``model`` is indexed directly.
        vocab: Container of known words; any word not in it is embedded as ``UNK``.

    Returns:
        List of ``(embeddings, labels)`` tuples, one per sentence. ``embeddings``
        is a float32 tensor with one row per token; ``labels`` is the
        corresponding entry of ``data["labels"]``, passed through unchanged.
    """
    vectors = getattr(model, "wv", model)
    vecData = []
    for text, y in zip(data["words"], data["labels"]):
        rows = [vectors[word] if word in vocab else vectors[UNK] for word in text]
        # Stack into one contiguous array first: building a tensor straight from
        # a Python list of numpy arrays is very slow.
        wordEmb = torch.from_numpy(np.asarray(rows, dtype=np.float32))
        vecData.append((wordEmb, y))
    return vecData

# Fit Word2Vec on the training sentences, then embed every training sentence with it.
model = getEmbModel(train, vocab)
trainEmb = getEmb(train, model, vocab)

Word2Vec(vocab=19720, size=64, alpha=0.025)




In [6]:
# Spot-check five random training rows: tokens, labels and the embedding tensor.
sample = train.sample(5)
for index, words, labels in zip(sample.index, sample["words"], sample["labels"]):
    print(words)
    print(labels)
    print(trainEmb[index][0])

['<s>', 'i', 'have', '🈶', '🈶', 'your', '👉', 'information', '💁', '💁', '💁', 'do', 'not', 'block', 'me', 'or', 'i', 'will', 'take', 'immediate', 'action', 'you', 'are', 'aaron', 'thompson', 'from', 'boston', 'massachusetts', 'and', 'you', 'are', 'commuting', 'a', 'crime', 'by', 'having', 'possession', 'of', 'child', '👶', '👶', '👶', 'pornography', 'i', 'will', 'go', 'public', '🚋', '🚋', 'with', 'these', 'screenshots', 'of', 'our', 'convorsarion', 'and', 'send', 'your', '👉', 'nude', 'photo', '🎡', '🎡', '🎡', 'to', 'your', '👉', 'family', '👪', 'contact', 'if', 'i', 'do', 'not', 'receive', '20', 'dollar', 'in', 'the', 'next', '⏭', '⏭', '⏭', '20', 'minute', 'if', 'you', 'do', 'not', 'respond', 'within', '30', 'second', 'i', 'am', 'sharing', 'this', '⬆', 'with', 'the', 'proper', 'authority', 'and', 'family', '👪', 'contact', 'do', 'not', 'block', 'me', 'because', 'if', 'you', 'do', 'i', 'will', 'do', 'this', '⬆', 'immediately', 'send', '20', 'via', 'paypal', 'to', 'moneymangmailcom', 'your', '👉', 'ip

['<s>', 'you', '👉', 'go', 'to', 'school', '🏫', 'but', 'i', '👀', 'go', 'to', 'concert', '🎤', '🎵', '🎶', 'you', 'don', '’', 't', '🚫', 'learn', '📚', '📖', '✏', '️', 'shit', '💩', 'compared', 'to', 'what', 'i', 'learn', '📚', '📖', '✏', '️', 'in', 'the', 'pit', '🏃\u200d♂️', '⭕', '️', '</s>']
[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0]
tensor([[ 0.7752, -0.2834,  0.5366,  ...,  0.6269, -0.1106,  0.5025],
        [ 0.8660, -0.6831,  1.2369,  ...,  1.1319,  0.2700,  0.1876],
        [ 1.2860,  0.2209,  0.5731,  ...,  1.2231,  0.3449,  0.7515],
        ...,
        [-0.0921,  0.1108, -0.0412,  ...,  0.0108,  0.0603, -0.1010],
        [ 0.9478,  0.7808,  0.7707,  ..., -1.0421, -0.0534, -0.3491],
        [ 0.6049,  0.1838,  0.5812,  ...,  1.3200, -0.5779,  0.7886]])
['<s>', 'uh', 'ohh', 'stinkyy', '😨', '💩', 'pooop', 'ahhahhaha', 'pooopiees', 'funny', 'poopiees', 'lalalelelelelelelelele', 'funny', 'poo', '💩', '💩',

# Building supervised models to predict next emoji

+ RNN based architecture where we look at the hidden layer for every word
  + Using the hidden layer, predict whether the next token is an emoji and, if so, which emoji it is
+ We asked a TA from the NLP class, who said this is similar to a language-modelling problem in which we only predict over the emoji vocabulary
  + Could also view as sequence labelling where tag is next emoji or no emoji

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
import random

# Feedforward NN

+ https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
+ Built using previous couple of words
+ Features are word embeddings learned with PyTorch's nn.Embedding. Words and emojis are each mapped to indices through their vocabularies. Any word in the full vocabulary can be an input to the NN, but its output is either an emoji or no emoji

In [8]:
class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram model: context word indices -> log-probabilities over emoji classes.

    Architecture: embedding lookup, concatenation of the context embeddings,
    one hidden ReLU layer, a linear output layer and a log-softmax.

    Args:
        vocab_size: Number of rows in the embedding table (input vocabulary size).
        eVocab_size: Number of output classes.
        embedding_dim: Size of each word embedding.
        context_size: Number of context words per example.
        hidden: Width of the hidden layer.
    """

    def __init__(self, vocab_size, eVocab_size, embedding_dim, context_size, hidden):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden)
        self.activation = nn.ReLU()
        # Normalize over the class axis explicitly. Omitting `dim` relies on a
        # deprecated implicit choice (which is also dim 1 for 2-D input).
        self.softmax = nn.LogSoftmax(dim=1)
        self.loss = nn.NLLLoss()
        self.linear2 = nn.Linear(hidden, eVocab_size)

    def compute_loss(self, predicted_vector, label):
        """Return the negative log-likelihood of ``label`` under ``predicted_vector``."""
        return self.loss(predicted_vector, label)

    def forward(self, inputs):
        """Score the emoji classes for one context or a batch of contexts.

        Args:
            inputs: Long tensor of word indices. A 1-D tensor is treated as a
                single context; a tensor with more dimensions is treated as a
                batch whose first axis indexes the examples.

        Returns:
            Log-probabilities of shape ``(batch, eVocab_size)``; ``batch`` is 1
            for a 1-D input.
        """
        batch = inputs.size(0) if inputs.dim() > 1 else 1
        # Concatenate the context embeddings of each example into one row.
        embeds = self.embeddings(inputs).view(batch, -1)
        out = self.activation(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = self.softmax(out)
        return log_probs

In [10]:
def FFfeatures(data, context_size=2):
    """Build (context, target) training examples for the feed-forward model.

    For every sentence in ``data["words"]`` a window slides over the tokens:
    the context is ``context_size`` consecutive tokens and the target is the
    token that follows them. Targets that are not matched by ``RE_EMOJI`` are
    replaced with ``empty``.

    Args:
        data: Table-like object whose ``"words"`` entry iterates over token lists.
        context_size: Number of context tokens per example. The default of 2
            yields the trigram features this function originally produced.

    Returns:
        One list per sentence, each containing ``[context_tokens, target]``
        pairs. Sentences with ``context_size`` tokens or fewer yield an empty list.

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """
    if context_size < 1:
        raise ValueError("context_size must be at least 1")
    trigrams = []
    for text in data["words"]:
        currTri = []
        for i in range(len(text) - context_size):
            predictWord = text[i + context_size]
            if not RE_EMOJI.match(predictWord):
                predictWord = empty
            currTri.append([list(text[i:i + context_size]), predictWord])
        trigrams.append(currTri)
    return trigrams

In [12]:
# Build the (two-word context, target) examples for every training sentence.
train_feats = FFfeatures(train)

In [34]:
# Number of context words per example passed to NGramLanguageModeler.
CONTEXT_SIZE = 2
# Size of the embeddings in the model's nn.Embedding layer.
EMBEDDING_DIM = 10


def trainFF(epochs, cuda):
    """Train an NGramLanguageModeler on the global ``train_feats`` examples.

    Reads the module-level ``vocab``, ``emojiVocab``, ``vocabIdx``,
    ``eVocabIdx``, ``train_feats``, ``CONTEXT_SIZE`` and ``EMBEDDING_DIM``.
    ``train_feats`` is shuffled in place at the start of every epoch.

    Sentences are grouped into minibatches of 16; sentences left over after the
    last full minibatch are skipped for that epoch. Within a minibatch the
    per-example losses are summed, divided by the minibatch size, and used for
    one SGD step.

    Args:
        epochs: Number of passes over ``train_feats``.
        cuda: Use the GPU when True and CUDA is available; otherwise the CPU.

    Returns:
        ``(model, losses)``: the trained model and a list with the mean
        minibatch loss of each epoch as plain floats.
    """
    devC = torch.device('cuda' if cuda and torch.cuda.is_available() else 'cpu')

    losses = []
    model = NGramLanguageModeler(len(vocab), len(emojiVocab), EMBEDDING_DIM, CONTEXT_SIZE, 128).to(devC)
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    minibatch_size = 16

    for epoch in range(epochs):
        model.train()
        print("Training started for epoch:{}".format(epoch + 1))
        random.shuffle(train_feats)
        start_time = time.time()
        correct = 0
        total = 0
        epoch_loss = 0.0
        steps = 0

        N = len(train_feats)
        for minibatch_idx in tqdm(range(N // minibatch_size)):
            optimizer.zero_grad()
            loss = None
            first = minibatch_idx * minibatch_size
            for text in train_feats[first:first + minibatch_size]:
                for context, target in text:
                    target_idx = eVocabIdx[target]
                    context_idx = torch.tensor([vocabIdx[w] for w in context], dtype=torch.long, device=devC)
                    log_probs = model(context_idx)
                    idx_loss = model.compute_loss(log_probs, torch.tensor([target_idx], device=devC))
                    loss = idx_loss if loss is None else loss + idx_loss
                    correct += int(torch.argmax(log_probs).item() == target_idx)
                    total += 1
            if loss is None:
                # Every sentence in this minibatch was too short to yield an
                # example, so there is nothing to back-propagate.
                continue
            loss = loss / minibatch_size
            loss.backward()
            optimizer.step()
            # Store a plain float so the autograd graph is not kept alive.
            epoch_loss += loss.item()
            steps += 1

        losses.append(epoch_loss / steps if steps else 0.0)
        print("Training completed for epoch:{}".format(epoch + 1))
        print("Time for train:{}".format(time.time() - start_time))
        print("Accuracy:{}".format(correct / total if total else 0.0))

    return model, losses
        
# Train for one epoch, using the GPU when one is available.
trainFF(1,True)

Training started for epoch:1


HBox(children=(IntProgress(value=0, max=76), HTML(value='')))



loss cuda:0
loss cuda:0


KeyboardInterrupt: 