In [13]:
import pandas as pd
import numpy as np
import nltk
import string
import re
import emoji
from os.path import join
import json
import time

from sklearn.utils import shuffle
from gensim.models import FastText
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the cleaned tweet corpus: one tweet per line, surrounding whitespace
# stripped and blank lines discarded.
DATA_PATH = join('..', 'data', 'twitter-data-cleaned.txt')
with open(DATA_PATH, 'r', encoding="utf-8") as f:
    data = [text for text in (line.strip() for line in f.readlines()) if text != '']
print('Tweets')
print(data[:3])
print(len(data))

# Read the JSON description of the emojis we want to predict and keep only
# the 'char' field of each entry.
SELECTED_EMOJIS_PATH = join('..', 'data', 'best-emojis.json')
with open(SELECTED_EMOJIS_PATH, 'r', encoding='utf-8') as f:
    EMOJIS = json.load(f)
EMOJI_CHARS = list(map(lambda entry: entry['char'], EMOJIS))
print('Chosen emojis')
print(EMOJI_CHARS)
print(len(EMOJI_CHARS))

# emojize() applied to every value of the emoji package's UNICODE_EMOJI table,
# collected into a set.
ALL_EMOJIS = {emoji.emojize(code) for code in emoji.UNICODE_EMOJI.values()}

Tweets
['💫 IT’S BURGER MONDAY, CBUS! 💫 Order any Gourmet Burger with a draft beer 🍔🍺 ifor just $9.99 when you dine in. 💪 Cer…', 'Stay strong 💪 You are an amazing man. I appreciate the truth that is told.', 'Code Sale Awesome from Styli 💪']
122179
Chosen emojis
['😂', '😍', '😭', '😊', '💕', '😒', '😉', '👌', '👍', '🙏', '👀', '🔥', '💯', '👏', '💪']
15


In [3]:
# preprocessing the data
from parsing import Tokenizer, TokenType, Token
tokenizer = Tokenizer(EMOJI_CHARS)

# How many of the closest words to request from the tokenizer as context for
# each emoji occurrence. (An earlier comment said 3, but the code has always
# requested 5 -- the value is now named so comment and code cannot drift.)
CONTEXT_SIZE = 5

# emoji character -> list of context word lists collected for that emoji
context = {e: [] for e in EMOJI_CHARS}
# emoji character -> integer class id (its position in EMOJI_CHARS)
emojiToId = {e: i for i, e in enumerate(EMOJI_CHARS)}

for tweet in data:
    tokens = tokenizer.tokenize(tweet)
    for i, token in enumerate(tokens):
        if token.token_type == TokenType.EMOJIS:
            closest = tokenizer.findClosestNWords(CONTEXT_SIZE, tokens, i)
            # Skip emoji occurrences for which the tokenizer returned no context.
            if closest:
                context[token.raw].append(closest)

# Per-emoji summary: the emoji, how many contexts it has, and the first three.
for e, words in context.items():
    print(e)
    print(len(words))
    print(words[:3])

😂
8458
[['familiar', 'with', 'my', 'face', 'too'], ['bollywood', 'reality'], ['though', 'your', 'debates', 'be', 'blasphemy']]
😍
7330
[['figura', 'is', 'a', 'start', 'today'], ['200', 'days', 'to', 'go'], ['200', 'days', 'to', 'go']]
😭
7464
[['lol', 'procrastination', 'almost', 'had', 'me'], ['just', 'change', 'my', 'first', 'diaper'], ['52', 'yuan', 'for', 'this', 'set']]
😊
6352
[['you', 'today', 'credit', 'stacie', 'swift'], ['fantastic', 'well', 'done', 'ladies'], ['congratulations', 'fam', 'all', 'the', 'best']]
💕
6485
[['happy', 'birthday', 'my', 'mochiiii', 'saranghae'], ['got', 'this', 'soon', 'youre', 'done'], ['we', 'got', 'this']]
😒
5873
[['️', 'dunk', 'it', 'ang', 'hina'], ['comes', '17', 'on', 'uber', 'eats'], ['comes', '17', 'on', 'uber', 'eats']]
😉
5932
[['start', 'today', 'choose', 'your', 'package'], ['the', 'richest', 'man', 'your', 'age'], ['yall', 'know', 'the', 'rest']]
👌
6035
[['yess'], ['nice', 'one', 'you', 'are', 'a'], ['the', 'best', 'and', 'great', 'artist']]


In [7]:
# Build the deduplicated training data used by the baseline models:
#   X_words  - each context as a single space-joined string
#   X_tokens - the same contexts as lists of words
#   y        - the class id of the emoji each context belongs to
X_words, X_tokens, y = [], [], []

# Context strings already emitted. A string that occurs more than once is kept
# only the first time it is encountered while iterating over `context`.
sentences = set()
for emoji_char, contexts in context.items():
    label = emojiToId[emoji_char]
    for word_list in contexts:
        sentence = ' '.join(word_list)
        if sentence not in sentences:
            sentences.add(sentence)
            X_tokens.append(word_list)
            X_words.append(sentence)
            y.append(label)
print(len(y))

# Train FastText word embeddings on the deduplicated token lists.
model = FastText(X_tokens, size=100, window=3, min_count=1, iter=10, sorted_vocab=1)

75537


array([ 0.17454654,  0.40511113, -0.09391642,  0.72003835, -0.07052381,
       -0.08196047, -0.31681183,  0.02695159, -0.7591647 , -0.13951467,
       -0.3060772 ,  0.14738002,  0.39180475, -0.05031242, -0.33401808,
       -0.20664431,  0.57543015,  0.0742536 ,  0.48540726,  0.43160683,
       -0.57935655, -0.40128928,  0.08931972, -0.41961792,  0.24711417,
        0.27860305,  0.5956422 ,  0.39462286, -1.0656201 ,  0.4321106 ,
       -0.2288525 ,  0.5105419 , -0.41794413, -0.11967197,  0.4788728 ,
       -0.3365009 , -0.03886222, -1.2256435 , -0.4413131 ,  0.33565876,
       -0.2976866 ,  0.04310591, -0.10757468, -0.34711736, -0.05832572,
       -0.9738132 , -0.4167317 ,  0.7088729 ,  0.1166968 ,  0.14113304,
        0.12601767,  0.3617158 , -0.01839591, -0.30152157,  0.24930586,
       -0.7027018 , -0.13906552, -0.13156067,  0.79356027,  0.13548094,
       -0.04003473,  0.3007476 , -0.7594328 , -0.39455166, -0.17626712,
       -0.3215084 , -0.11813012,  0.14159289,  0.1951103 , -0.20

In [8]:
# checking fasttext embeddings
# Look the vector up once, through the model's keyed vectors (`model.wv`):
# indexing the model object directly is deprecated in gensim.
hello_vec = model.wv['hello']
print(hello_vec)
print(len(hello_vec))

[ 0.17454654  0.40511113 -0.09391642  0.72003835 -0.07052381 -0.08196047
 -0.31681183  0.02695159 -0.7591647  -0.13951467 -0.3060772   0.14738002
  0.39180475 -0.05031242 -0.33401808 -0.20664431  0.57543015  0.0742536
  0.48540726  0.43160683 -0.57935655 -0.40128928  0.08931972 -0.41961792
  0.24711417  0.27860305  0.5956422   0.39462286 -1.0656201   0.4321106
 -0.2288525   0.5105419  -0.41794413 -0.11967197  0.4788728  -0.3365009
 -0.03886222 -1.2256435  -0.4413131   0.33565876 -0.2976866   0.04310591
 -0.10757468 -0.34711736 -0.05832572 -0.9738132  -0.4167317   0.7088729
  0.1166968   0.14113304  0.12601767  0.3617158  -0.01839591 -0.30152157
  0.24930586 -0.7027018  -0.13906552 -0.13156067  0.79356027  0.13548094
 -0.04003473  0.3007476  -0.7594328  -0.39455166 -0.17626712 -0.3215084
 -0.11813012  0.14159289  0.1951103  -0.20376056 -0.14102818  1.0088058
  0.93528366 -0.4578043   0.28075236  0.09952775  0.04204928 -0.33357
  0.5630153   1.0513438   0.11687878  0.03080976 -0.81986445

In [19]:
# creating sentence embeddings (mean of the FastText word vectors)
# NOTE: the TF-IDF matrix is fitted below but is not used as a weight yet --
# every word contributes with weight 1, so each sentence vector is the plain
# average of its word embeddings. (Looking weights up with
# tfidf_feat.index(word) would be a linear scan of the vocabulary per word.)
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(X_words)
tfidf_feat = vectorizer.get_feature_names()

# Take the dimensionality from the trained model instead of hard-coding 100.
emb_dim = model.wv.vector_size

sent_emb = []    # one averaged vector per entry of X_tokens
errors = []      # words whose embedding lookup raised KeyError
empty_sent = 0   # sentences for which no word vector could be looked up
for tweet in X_tokens:
    sent_vec = np.zeros(emb_dim)
    weight_sum = 0
    for word in tweet:
        try:
            emb = model.wv[word]
        except KeyError:
            # Only a failed lookup is tolerated; anything else (including
            # KeyboardInterrupt) now propagates instead of being swallowed.
            errors.append(word)
            continue
        tfidf_val = 1
        sent_vec += (emb * tfidf_val)
        weight_sum += 1
    if weight_sum == 0:
        empty_sent += 1
    # Guard against division by zero: an empty sentence stays the zero vector.
    weight_sum = max(1, weight_sum)
    sent_vec /= weight_sum
    sent_emb.append(sent_vec)
print('Errors: {}'.format(len(errors)))
print('Empty sent: {}'.format(empty_sent))

Errors: 0
Empty sent: 0


In [21]:
# training models using sentence embeddings

# Fixed seed so that, for a given set of features, the random forest and the
# cross-validation scores it produces are repeatable between runs.
RF_SEED = 0

# name -> unfitted classifier; each one is scored with 5-fold cross-validation.
clfs = {'rf': RandomForestClassifier(random_state=RF_SEED)}
for name, clf in clfs.items():
    scores = cross_val_score(clf, sent_emb, y, cv=5)
    print("Model: " + name)
    # Reports the mean fold score and twice the standard deviation across folds.
    print("Accuracy for: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Model: rf
Accuracy for: 0.15 (+/- 0.01)
