In [22]:
from __future__ import absolute_import, division, print_function

In [23]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [24]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [25]:
from numpy  import array
from scipy import stats

In [26]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [27]:
# Emit INFO-level log records (gensim reports its progress through logging)
# with a timestamp and the severity in front of each message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [28]:
# Path of the training corpus; the next cell reads it and splits it into lines.
corpus_filename = '../data/extracted_emoji_sequences.txt'

In [29]:
# Read the whole corpus into a list of lines. The context manager closes the
# file handle as soon as the read is done; the bare open(...).read() never
# closed it explicitly.
with open(corpus_filename) as corpus_file:
    corpus = corpus_file.read().splitlines()


In [30]:
def tokenize_corpus(corpus):
    """Split every line of ``corpus`` on whitespace.

    Args:
        corpus: iterable of strings, one entry per line.

    Returns:
        A list holding one list of tokens per input line, in input order.
        A line without any non-whitespace characters yields an empty list.
    """
    tokenized_lines = []
    for line in corpus:
        tokenized_lines.append(line.split())
    return tokenized_lines

In [32]:
def onlyEmojiSequences(tokens):
    """Keep only the token lists that contain at least two tokens.

    Args:
        tokens: iterable of token lists.

    Returns:
        A new list with the entries of ``tokens`` whose length is greater
        than one, in their original order.
    """
    sequences = []
    for token_list in tokens:
        if len(token_list) <= 1:
            continue
        sequences.append(token_list)
    return sequences

In [33]:
# Split each corpus line into tokens, then keep only the lines that yielded
# more than one token.
tokenized_corpus = tokenize_corpus(corpus)
emojiSequences = onlyEmojiSequences(tokenized_corpus)

In [34]:
# Total number of tokens across all retained sequences.
token_count = sum(len(sentence) for sentence in emojiSequences)
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 610,256 tokens


# training the model

In [35]:
# Hyperparameters handed to the Word2Vec constructor in the next cell.

# Dimensionality of the resulting word vectors.
num_features = 300

# Minimum word count threshold (passed to Word2Vec as min_count).
min_word_count = 3

# Number of threads to run in parallel: one per CPU reported by the OS.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 2

# Downsample setting for frequent words.
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
# NOTE(review): the gensim documentation says a fully deterministic run also
# requires a single worker thread (and a fixed PYTHONHASHSEED on Python 3), so
# with num_workers > 1 results may still differ between runs -- confirm.
# TODO: remove later
seed = 1

# TODO: think about how to set these variables so that tokens from different
# tweets are not learned from together!


In [36]:
# Create the (still untrained) model from the hyperparameters defined above.
thrones2vec = w2v.Word2Vec(
    # Vector shape and context.
    size=num_features,
    window=context_size,
    # Vocabulary pruning and frequent-token downsampling.
    min_count=min_word_count,
    sample=downsampling,
    # sg=1 selects the skip-gram training algorithm.
    sg=1,
    # Execution settings.
    workers=num_workers,
    seed=seed,
)

In [16]:
# Scan the emoji sequences once to collect the tokens and their counts.
thrones2vec.build_vocab(emojiSequences)

2019-01-13 19:17:31,020 : INFO : collecting all words and their counts
2019-01-13 19:17:31,021 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-13 19:17:31,029 : INFO : PROGRESS: at sentence #10000, processed 39369 words, keeping 885 word types
2019-01-13 19:17:31,037 : INFO : PROGRESS: at sentence #20000, processed 81087 words, keeping 981 word types
2019-01-13 19:17:31,045 : INFO : PROGRESS: at sentence #30000, processed 120114 words, keeping 1018 word types
2019-01-13 19:17:31,053 : INFO : PROGRESS: at sentence #40000, processed 160613 words, keeping 1054 word types
2019-01-13 19:17:31,061 : INFO : PROGRESS: at sentence #50000, processed 200972 words, keeping 1074 word types
2019-01-13 19:17:31,069 : INFO : PROGRESS: at sentence #60000, processed 239581 words, keeping 1091 word types
2019-01-13 19:17:31,077 : INFO : PROGRESS: at sentence #70000, processed 278164 words, keeping 1105 word types
2019-01-13 19:17:31,086 : INFO : PROGRESS: at sentence #8

In [17]:
# Train for two epochs over the emoji sequences; the value returned by
# train() stays the cell's last expression and is therefore displayed.
thrones2vec.train(
    emojiSequences,
    total_examples=thrones2vec.corpus_count,
    epochs=2,
)

2019-01-13 19:17:31,116 : INFO : training model with 8 workers on 1002 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=2
2019-01-13 19:17:31,266 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-01-13 19:17:31,268 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-01-13 19:17:31,270 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-01-13 19:17:31,271 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-01-13 19:17:31,273 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-01-13 19:17:31,276 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-01-13 19:17:31,278 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-01-13 19:17:31,279 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-01-13 19:17:31,280 : INFO : EPOCH - 1 : training on 321470 raw words (196750 effective words) took 0.1s, 1390582 effec

(393449, 642940)

In [18]:
# Make sure the "trained" output directory exists before the model is saved.
if not os.path.exists("trained"):
    os.makedirs("trained")

In [19]:
# Persist the trained model to trained/2nd.w2v.
thrones2vec.save(os.path.join("trained", "2nd.w2v"))

2019-01-13 19:17:31,444 : INFO : saving Word2Vec object under trained/2nd.w2v, separately None
2019-01-13 19:17:31,445 : INFO : not storing attribute vectors_norm
2019-01-13 19:17:31,446 : INFO : not storing attribute cum_table
2019-01-13 19:17:31,467 : INFO : saved trained/2nd.w2v


# explore the trained model

In [20]:
# Load the model back from trained/2nd.w2v, replacing the in-memory instance.
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "2nd.w2v"))

2019-01-13 19:17:31,471 : INFO : loading Word2Vec object from trained/2nd.w2v
2019-01-13 19:17:31,486 : INFO : loading wv recursively from trained/2nd.w2v.wv.* with mmap=None
2019-01-13 19:17:31,486 : INFO : setting ignored attribute vectors_norm to None
2019-01-13 19:17:31,487 : INFO : loading vocabulary recursively from trained/2nd.w2v.vocabulary.* with mmap=None
2019-01-13 19:17:31,487 : INFO : loading trainables recursively from trained/2nd.w2v.trainables.* with mmap=None
2019-01-13 19:17:31,488 : INFO : setting ignored attribute cum_table to None
2019-01-13 19:17:31,488 : INFO : loaded trained/2nd.w2v


In [21]:
# Evaluate the trained embeddings against the EmoSim508 gold standard: for
# every emoji pair, compare the model's cosine similarity with the human
# annotator agreement score and report the Spearman rank correlation.
corpus_filename = '../data/EmoSim508.json'
with open(corpus_filename) as emosim_file:
    corpus = emosim_file.read()

# One annotator agreement score per emoji pair. The regex captures raw text,
# so each score is converted to float; left as strings, the scores would not
# be compared numerically when they are ranked.
annotator_similarity_score_508 = [
    float(raw_score.strip().strip('"'))
    for raw_score in re.findall(r'(?<=_Annotator_Agreement": )(.*?)(?=\})', corpus)
]

# glyph_pairs_1016: the decoded emoji glyphs in file order, so every two
# consecutive entries form one pair. The captured text is an escaped code
# point sequence (parts joined with '_'), which is unescaped and decoded here.
unicode_pairs_1016 = re.findall(r'(?<=unicodelong": "\\)(.*?)(?=")', corpus)
glyph_pairs_1016 = [
    codecs.decode(escaped.replace(str('\\\\'), str('\\')).replace('_', ''), 'unicode_escape')
    for escaped in unicode_pairs_1016
]

# Group the flat glyph list into (first, second) tuples, one per gold score.
emoji_pairs = list(zip(glyph_pairs_1016[0::2], glyph_pairs_1016[1::2]))
if len(emoji_pairs) != len(annotator_similarity_score_508):
    raise ValueError(
        'expected one annotator score per emoji pair, got {} pairs and {} scores'.format(
            len(emoji_pairs), len(annotator_similarity_score_508)
        )
    )

# computation of SpearRank
# Each pair is iterated together with its own gold score, so a pair that has
# to be skipped takes its score with it. (Consuming the scores only on success
# shifted every later similarity onto the wrong gold score.)
goldstandard = []
selftrained = []
for (emoji1, emoji2), gold_score in zip(emoji_pairs, annotator_similarity_score_508):
    try:
        cosineSimilarity = thrones2vec.wv.similarity(emoji1, emoji2)
    except KeyError:
        # At least one emoji of the pair is not in the model's vocabulary.
        print('the cosine similarity between ' + emoji1 + ' and ' + emoji2 + ' could not be computed.')
        continue

    selftrained.append(cosineSimilarity)
    goldstandard.append(gold_score)

spearmanRank = stats.spearmanr(goldstandard, selftrained)

print('Der Spearman Rank Correlation Coefficient is {}'.format(spearmanRank))

the cosine similarity between 🇬🇧 and 🇺🇸 could not be computed.
the cosine similarity between 🆓 and 💸 could not be computed.
the cosine similarity between 🏅 and 🇺🇸 could not be computed.
the cosine similarity between 🆓 and 💃 could not be computed.
the cosine similarity between 🇺🇸 and ❤ could not be computed.
the cosine similarity between 🌃 and 🕹 could not be computed.
the cosine similarity between 🆓 and 📍 could not be computed.
the cosine similarity between 🌃 and 🆓 could not be computed.
the cosine similarity between 🚫 and 🆓 could not be computed.
the cosine similarity between 😏 and 🕹 could not be computed.
the cosine similarity between 🇺🇸 and 💥 could not be computed.
the cosine similarity between 🎤 and 🇳🇬 could not be computed.
the cosine similarity between 🕹 and 💯 could not be computed.
the cosine similarity between 🇳🇬 and 📲 could not be computed.
the cosine similarity between 👇 and 🇳🇬 could not be computed.
the cosine similarity between 🎧 and 🇳🇬 could not be computed.
the cosine simi

  if np.issubdtype(vec.dtype, np.int):
