In [1]:
from __future__ import absolute_import, division, print_function

In [2]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [3]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [4]:
from numpy  import array
from scipy import stats

In [5]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [6]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
corpus_filename = '../data/extracted_emoji_sequences.txt'

In [8]:
corpus = open(corpus_filename).read().splitlines()


In [9]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

In [10]:
def onlyEmojiSequences(tokens):
    threshold_emojis = [x for x in tokens if len(x) > 1]
    return threshold_emojis

In [11]:
tokenized_corpus = tokenize_corpus(corpus)
emojiSequences = onlyEmojiSequences(tokenized_corpus)

In [12]:
token_count = sum([len(sentence) for sentence in emojiSequences])
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 610,256 tokens


# training the model

In [13]:
# Dimensionality of the resulting word vectors.
num_features = 300

# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 8

# Downsample setting for frequent words.
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
seed = 1

# think of how to set those variables so that variables from different tweets are not learned from together!


In [14]:
thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [15]:
thrones2vec.build_vocab(emojiSequences)

2019-01-31 17:54:26,827 : INFO : collecting all words and their counts
2019-01-31 17:54:26,828 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-31 17:54:26,835 : INFO : PROGRESS: at sentence #10000, processed 26070 words, keeping 818 word types
2019-01-31 17:54:26,843 : INFO : PROGRESS: at sentence #20000, processed 52738 words, keeping 935 word types
2019-01-31 17:54:26,852 : INFO : PROGRESS: at sentence #30000, processed 80617 words, keeping 993 word types
2019-01-31 17:54:26,859 : INFO : PROGRESS: at sentence #40000, processed 106997 words, keeping 1019 word types
2019-01-31 17:54:26,867 : INFO : PROGRESS: at sentence #50000, processed 134684 words, keeping 1038 word types
2019-01-31 17:54:26,874 : INFO : PROGRESS: at sentence #60000, processed 161638 words, keeping 1053 word types
2019-01-31 17:54:26,881 : INFO : PROGRESS: at sentence #70000, processed 189166 words, keeping 1065 word types
2019-01-31 17:54:26,888 : INFO : PROGRESS: at sentence #800

In [16]:
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))


Word2Vec vocabulary length: 1082


In [17]:
thrones2vec.wv.vocab

{'⚠': <gensim.models.keyedvectors.Vocab at 0x1a22aee2b0>,
 '😀': <gensim.models.keyedvectors.Vocab at 0x1a2c128710>,
 '🎷': <gensim.models.keyedvectors.Vocab at 0x1a2c128a90>,
 '🙌': <gensim.models.keyedvectors.Vocab at 0x1a2c128898>,
 '💐': <gensim.models.keyedvectors.Vocab at 0x1a2c128ba8>,
 '🌈': <gensim.models.keyedvectors.Vocab at 0x1a2c128be0>,
 '🍸': <gensim.models.keyedvectors.Vocab at 0x1a2c128c18>,
 '🎶': <gensim.models.keyedvectors.Vocab at 0x1a2c128c88>,
 '💜': <gensim.models.keyedvectors.Vocab at 0x1a2c128cc0>,
 '🌼': <gensim.models.keyedvectors.Vocab at 0x1a2c128c50>,
 '💛': <gensim.models.keyedvectors.Vocab at 0x1a2c128cf8>,
 '❤': <gensim.models.keyedvectors.Vocab at 0x1a2c128d30>,
 '☮': <gensim.models.keyedvectors.Vocab at 0x1a2c128d68>,
 '✨': <gensim.models.keyedvectors.Vocab at 0x1a2c128da0>,
 '🏕': <gensim.models.keyedvectors.Vocab at 0x1a2c128dd8>,
 '💖': <gensim.models.keyedvectors.Vocab at 0x1a2c128e10>,
 '🌤': <gensim.models.keyedvectors.Vocab at 0x1a2c128e48>,
 '🌸': <gensim.

In [18]:
thrones2vec.train(emojiSequences, total_examples=thrones2vec.corpus_count, epochs = 10)

2019-01-31 17:54:27,053 : INFO : training model with 12 workers on 1082 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=8
2019-01-31 17:54:27,352 : INFO : worker thread finished; awaiting finish of 11 more threads
2019-01-31 17:54:27,353 : INFO : worker thread finished; awaiting finish of 10 more threads
2019-01-31 17:54:27,354 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-01-31 17:54:27,355 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-01-31 17:54:27,356 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-01-31 17:54:27,356 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-01-31 17:54:27,359 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-01-31 17:54:27,365 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-01-31 17:54:27,366 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-01-31 17:54:27,368 : INF

(3889141, 6102560)

In [19]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [20]:
thrones2vec.save(os.path.join("trained", "123.w2v"))

2019-01-31 17:54:30,282 : INFO : saving Word2Vec object under trained/123.w2v, separately None
2019-01-31 17:54:30,284 : INFO : not storing attribute vectors_norm
2019-01-31 17:54:30,284 : INFO : not storing attribute cum_table
2019-01-31 17:54:30,309 : INFO : saved trained/123.w2v


# explore the trained model

In [21]:
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "123.w2v"))

2019-01-31 17:54:30,315 : INFO : loading Word2Vec object from trained/123.w2v
2019-01-31 17:54:30,335 : INFO : loading wv recursively from trained/123.w2v.wv.* with mmap=None
2019-01-31 17:54:30,336 : INFO : setting ignored attribute vectors_norm to None
2019-01-31 17:54:30,337 : INFO : loading vocabulary recursively from trained/123.w2v.vocabulary.* with mmap=None
2019-01-31 17:54:30,338 : INFO : loading trainables recursively from trained/123.w2v.trainables.* with mmap=None
2019-01-31 17:54:30,339 : INFO : setting ignored attribute cum_table to None
2019-01-31 17:54:30,340 : INFO : loaded trained/123.w2v


In [22]:
vector1 = thrones2vec.wv['😳']

In [23]:
vector2 = thrones2vec.wv['😍']

In [24]:
vector3 = thrones2vec.wv['❤️']

KeyError: "word '❤️' not in vocabulary"

In [None]:
thrones2vec.wv.similarity('🎶', '💜')

In [None]:
thrones2vec.wv.vocab

In [None]:
thrones2vec.wv.most_similar('😍')


In [25]:
# extracts the 508 Annotator Results as the Gold-Standard
corpus_filename = '../data/EmoSim508.json'
corpus = open(corpus_filename).read()
annotator_similarity_score_508 = list(array(re.findall('(?<=_Annotator_Agreement": )(.*?)(?=\})', corpus)))

# glyph_pairs_1016
unicode_pairs_1016 = re.findall('(?<=unicodelong": "\\\)(.*?)(?=")', corpus)    
glyph_pairs_1016 = [codecs.decode(unicode_pairs_1016[x].replace(str('\\\\'),str('\\')).replace('_',''), 'unicode_escape') for x in range(len(unicode_pairs_1016))]

# fdfdsfaf
goldstandard = []
selftrained = []
for x in range(len(annotator_similarity_score_508)):
    cosineSimilarity = None
    
    emoji1 = glyph_pairs_1016.pop(0)
    emoji2 = glyph_pairs_1016.pop(0)
    
    try:
        cosineSimilarity = thrones2vec.wv.similarity(emoji1, emoji2)
    except:
        print('the cosine similarity between ' + emoji1 + ' and ' + emoji2 + ' could not be computed.')
    
    if(cosineSimilarity is not None):
        goldstandard.append(cosineSimilarity)
        selftrained.append(annotator_similarity_score_508.pop(0))
print(len(goldstandard))
print(len(selftrained))

spearmanRank = stats.spearmanr(goldstandard, selftrained)

print('Der Spearman Rank Correlation Coefficient is {}'.format(spearmanRank))

the cosine similarity between 🇬🇧 and 🇺🇸 could not be computed.
the cosine similarity between 🏅 and 🇺🇸 could not be computed.
the cosine similarity between 🇺🇸 and ❤ could not be computed.
the cosine similarity between 🇺🇸 and 💥 could not be computed.
the cosine similarity between 🎤 and 🇳🇬 could not be computed.
the cosine similarity between 🇳🇬 and 📲 could not be computed.
the cosine similarity between 👇 and 🇳🇬 could not be computed.
the cosine similarity between 🎧 and 🇳🇬 could not be computed.
the cosine similarity between 🇳🇬 and 🎶 could not be computed.
the cosine similarity between 👏 and ↪ could not be computed.
498
498
Der Spearman Rank Correlation Coefficient is SpearmanrResult(correlation=0.5670391172266186, pvalue=1.041390648609959e-43)




In [26]:
goldstandard

[0.4554447095126947,
 0.4653041509014776,
 0.5107784652063863,
 0.307404959982498,
 0.3874710917320384,
 0.3757104390832654,
 0.4052640782098556,
 0.3191670239683293,
 0.36933443032082014,
 0.36306718720597575,
 0.43314122841582725,
 0.37948764192162526,
 0.3716206850232139,
 0.4136558156538691,
 0.3791055459433768,
 0.3099475646183907,
 0.32991150260512286,
 0.43136173991732435,
 0.42690563233602097,
 0.4101891074174304,
 0.3012748692703059,
 0.30987919067804054,
 0.411032381012099,
 0.42132665401025327,
 0.42909041896442623,
 0.43456883626219506,
 0.3507494695908241,
 0.313012528879552,
 0.6577179271348694,
 0.3889187932826977,
 0.5308508521317129,
 0.33636026413096565,
 0.26191562192416284,
 0.45560631603169277,
 0.3517402292445291,
 0.45433971786156535,
 0.49497135563967853,
 0.267301625897338,
 0.3724153912113354,
 0.3175736077920138,
 0.5178149073319843,
 0.3209088824335746,
 0.3777583194131311,
 0.5494227158412353,
 0.30169784551682644,
 0.47024557554620705,
 0.3701903890205816,