In [1]:
from __future__ import absolute_import, division, print_function

In [4]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [2]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [7]:
from numpy  import array
from scipy import stats

In [5]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [6]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
corpus_filename = '../data/extracted_emoji_sequences.txt'

In [8]:
corpus = open(corpus_filename).read().splitlines()


In [9]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

In [10]:
def onlyEmojiSequences(tokens):
    threshold_emojis = [x for x in tokens if len(x) > 1]
    return threshold_emojis

In [11]:
tokenized_corpus = tokenize_corpus(corpus)
emojiSequences = onlyEmojiSequences(tokenized_corpus)

In [12]:
token_count = sum([len(sentence) for sentence in emojiSequences])
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 610,256 tokens


# training the model

In [13]:
# Dimensionality of the resulting word vectors.
num_features = 300

# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 8

# Downsample setting for frequent words.
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
seed = 1

# think of how to set those variables so that variables from different tweets are not learned from together!


In [14]:
thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [15]:
thrones2vec.build_vocab(emojiSequences)

2019-01-31 17:54:26,827 : INFO : collecting all words and their counts
2019-01-31 17:54:26,828 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-31 17:54:26,835 : INFO : PROGRESS: at sentence #10000, processed 26070 words, keeping 818 word types
2019-01-31 17:54:26,843 : INFO : PROGRESS: at sentence #20000, processed 52738 words, keeping 935 word types
2019-01-31 17:54:26,852 : INFO : PROGRESS: at sentence #30000, processed 80617 words, keeping 993 word types
2019-01-31 17:54:26,859 : INFO : PROGRESS: at sentence #40000, processed 106997 words, keeping 1019 word types
2019-01-31 17:54:26,867 : INFO : PROGRESS: at sentence #50000, processed 134684 words, keeping 1038 word types
2019-01-31 17:54:26,874 : INFO : PROGRESS: at sentence #60000, processed 161638 words, keeping 1053 word types
2019-01-31 17:54:26,881 : INFO : PROGRESS: at sentence #70000, processed 189166 words, keeping 1065 word types
2019-01-31 17:54:26,888 : INFO : PROGRESS: at sentence #800

In [16]:
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))


Word2Vec vocabulary length: 1082


In [17]:
thrones2vec.wv.vocab

{'⚠': <gensim.models.keyedvectors.Vocab at 0x1a22aee2b0>,
 '😀': <gensim.models.keyedvectors.Vocab at 0x1a2c128710>,
 '🎷': <gensim.models.keyedvectors.Vocab at 0x1a2c128a90>,
 '🙌': <gensim.models.keyedvectors.Vocab at 0x1a2c128898>,
 '💐': <gensim.models.keyedvectors.Vocab at 0x1a2c128ba8>,
 '🌈': <gensim.models.keyedvectors.Vocab at 0x1a2c128be0>,
 '🍸': <gensim.models.keyedvectors.Vocab at 0x1a2c128c18>,
 '🎶': <gensim.models.keyedvectors.Vocab at 0x1a2c128c88>,
 '💜': <gensim.models.keyedvectors.Vocab at 0x1a2c128cc0>,
 '🌼': <gensim.models.keyedvectors.Vocab at 0x1a2c128c50>,
 '💛': <gensim.models.keyedvectors.Vocab at 0x1a2c128cf8>,
 '❤': <gensim.models.keyedvectors.Vocab at 0x1a2c128d30>,
 '☮': <gensim.models.keyedvectors.Vocab at 0x1a2c128d68>,
 '✨': <gensim.models.keyedvectors.Vocab at 0x1a2c128da0>,
 '🏕': <gensim.models.keyedvectors.Vocab at 0x1a2c128dd8>,
 '💖': <gensim.models.keyedvectors.Vocab at 0x1a2c128e10>,
 '🌤': <gensim.models.keyedvectors.Vocab at 0x1a2c128e48>,
 '🌸': <gensim.

In [18]:
thrones2vec.train(emojiSequences, total_examples=thrones2vec.corpus_count, epochs = 10)

2019-01-31 17:54:27,053 : INFO : training model with 12 workers on 1082 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=8
2019-01-31 17:54:27,352 : INFO : worker thread finished; awaiting finish of 11 more threads
2019-01-31 17:54:27,353 : INFO : worker thread finished; awaiting finish of 10 more threads
2019-01-31 17:54:27,354 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-01-31 17:54:27,355 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-01-31 17:54:27,356 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-01-31 17:54:27,356 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-01-31 17:54:27,359 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-01-31 17:54:27,365 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-01-31 17:54:27,366 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-01-31 17:54:27,368 : INF

(3889141, 6102560)

In [19]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [20]:
thrones2vec.save(os.path.join("trained", "123.w2v"))

2019-01-31 17:54:30,282 : INFO : saving Word2Vec object under trained/123.w2v, separately None
2019-01-31 17:54:30,284 : INFO : not storing attribute vectors_norm
2019-01-31 17:54:30,284 : INFO : not storing attribute cum_table
2019-01-31 17:54:30,309 : INFO : saved trained/123.w2v


# explore the trained model

In [5]:
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "123.w2v"))

In [22]:
vector1 = thrones2vec.wv['😳']

In [23]:
vector2 = thrones2vec.wv['😍']

In [24]:
vector3 = thrones2vec.wv['❤️']

KeyError: "word '❤️' not in vocabulary"

In [15]:
thrones2vec.wv.similarity('🎶', '💜')

  if np.issubdtype(vec.dtype, np.int):


-3.689349e+19

In [None]:
thrones2vec.wv.vocab

In [None]:
thrones2vec.wv.most_similar('😍')


In [11]:
thrones2vec.wv.get_vector('😍')

array([-0.19183026,  0.11375655, -0.16012314, -0.25849867,  0.17544834,
        0.21091254, -0.11747479, -0.19825502, -0.10193074, -0.03427677,
       -0.00736084, -0.19820458,  0.38984558, -0.13443092,  0.01620918,
        0.332893  , -0.260124  , -0.15286909,  0.12688816,  0.10063136,
       -0.46974108, -0.3566966 ,  0.4659378 , -0.180111  ,  0.00381013,
       -0.06182682, -0.14508878,  0.06792038,  0.08275423, -0.20010997,
       -0.01149292, -0.15443453,  0.38747087,  0.19510773, -0.33782205,
       -0.19734281,  0.08655534,  0.10305947, -0.289596  ,  0.23030576,
        0.05925154,  0.14172854,  0.26680222, -0.0452031 , -0.22016795,
       -0.31578308, -0.04127728, -0.07856978, -0.2547796 , -0.03179306,
       -0.47002825,  0.03738847, -0.06890521, -0.02896455, -0.1691807 ,
        0.12014451, -0.02129841, -0.1192428 ,  0.00738062, -0.46514976,
        0.00120973,  0.05123157, -0.15342242, -0.05351759, -0.05266422,
       -0.2839693 , -0.2132006 , -0.18690728, -0.10868052, -0.45

In [9]:
# extracts the 508 Annotator Results as the Gold-Standard
corpus_filename = '../data/EmoSim508.json'
corpus = open(corpus_filename).read()
annotator_similarity_score_508 = list(array(re.findall('(?<=_Annotator_Agreement": )(.*?)(?=\})', corpus)))

# glyph_pairs_1016
unicode_pairs_1016 = re.findall('(?<=unicodelong": "\\\)(.*?)(?=")', corpus)    
glyph_pairs_1016 = [codecs.decode(unicode_pairs_1016[x].replace(str('\\\\'),str('\\')).replace('_',''), 'unicode_escape') for x in range(len(unicode_pairs_1016))]

# fdfdsfaf
goldstandard = []
selftrained = []
for x in range(len(annotator_similarity_score_508)):
    cosineSimilarity = None
    
    emoji1 = glyph_pairs_1016.pop(0)
    emoji2 = glyph_pairs_1016.pop(0)
    
    try:
        cosineSimilarity = thrones2vec.wv.similarity(emoji1, emoji2)
    except:
        print('the cosine similarity between ' + emoji1 + ' and ' + emoji2 + ' could not be computed.')
    
    if(cosineSimilarity is not None):
        goldstandard.append(cosineSimilarity)
        selftrained.append(annotator_similarity_score_508.pop(0))
print(len(goldstandard))
print(len(selftrained))

spearmanRank = stats.spearmanr(goldstandard, selftrained)

print('Der Spearman Rank Correlation Coefficient is {}'.format(spearmanRank))

the cosine similarity between 🇬🇧 and 🇺🇸 could not be computed.
the cosine similarity between 🏅 and 🇺🇸 could not be computed.
the cosine similarity between 🇺🇸 and ❤ could not be computed.
the cosine similarity between 🇺🇸 and 💥 could not be computed.
the cosine similarity between 🎤 and 🇳🇬 could not be computed.
the cosine similarity between 🇳🇬 and 📲 could not be computed.
the cosine similarity between 👇 and 🇳🇬 could not be computed.
the cosine similarity between 🎧 and 🇳🇬 could not be computed.
the cosine similarity between 🇳🇬 and 🎶 could not be computed.
the cosine similarity between 👏 and ↪ could not be computed.
498
498
Der Spearman Rank Correlation Coefficient is SpearmanrResult(correlation=0.0380875629099537, pvalue=0.3963652227438895)


  if np.issubdtype(vec.dtype, np.int):


In [19]:
selftrained

['4.0',
 '3.95',
 '3.9',
 '3.85',
 '3.85',
 '3.85',
 '3.85',
 '3.8',
 '3.8',
 '3.8',
 '3.8',
 '3.8',
 '3.75',
 '3.75',
 '3.75',
 '3.7',
 '3.7',
 '3.7',
 '3.7',
 '3.7',
 '3.7',
 '3.7',
 '3.7',
 '3.7',
 '3.7',
 '3.65',
 '3.6',
 '3.55',
 '3.55',
 '3.55',
 '3.55',
 '3.5',
 '3.5',
 '3.45',
 '3.45',
 '3.4',
 '3.4',
 '3.4',
 '3.4',
 '3.4',
 '3.35',
 '3.35',
 '3.35',
 '3.35',
 '3.35',
 '3.3',
 '3.3',
 '3.25',
 '3.2',
 '3.2',
 '3.15',
 '3.15',
 '3.1',
 '3.1',
 '3.1',
 '3.1',
 '3.05',
 '3.05',
 '3.05',
 '3.05',
 '3.0',
 '3.0',
 '3.0',
 '2.95',
 '2.9',
 '2.9',
 '2.9',
 '2.85',
 '2.8',
 '2.8',
 '2.75',
 '2.75',
 '2.7',
 '2.7',
 '2.65',
 '2.6',
 '2.55',
 '2.55',
 '2.55',
 '2.5',
 '2.5',
 '2.5',
 '2.5',
 '2.5',
 '2.45',
 '2.45',
 '2.45',
 '2.45',
 '2.45',
 '2.45',
 '2.4',
 '2.4',
 '2.35',
 '2.35',
 '2.35',
 '2.3',
 '2.25',
 '2.25',
 '2.25',
 '2.2',
 '2.2',
 '2.2',
 '2.15',
 '2.15',
 '2.1',
 '2.05',
 '2.05',
 '2.05',
 '2.05',
 '2.05',
 '2.0',
 '2.0',
 '2.0',
 '1.95',
 '1.95',
 '1.95',
 '1.9',
 '1.9',

In [16]:
goldstandard

[-1.0842022e-19,
 0.0,
 1.0842022e-19,
 -3.689349e+19,
 -1.0842022e-19,
 -3.689349e+19,
 -2.0,
 0.0,
 -3.689349e+19,
 1.0842022e-19,
 1.0842022e-19,
 0.0,
 2.0,
 2.0,
 0.0,
 2.0,
 1.0842022e-19,
 2.0,
 1.0842022e-19,
 -2.0,
 3.689349e+19,
 0.0,
 -3.689349e+19,
 1.0842022e-19,
 -2.0,
 0.0,
 0.0,
 -1.0842022e-19,
 0.0,
 0.0,
 -3.689349e+19,
 1.0842022e-19,
 -3.689349e+19,
 3.689349e+19,
 3.689349e+19,
 -3.689349e+19,
 2.0,
 2.0,
 3.689349e+19,
 1.0842022e-19,
 0.0,
 3.689349e+19,
 2.0,
 0.0,
 3.689349e+19,
 -3.689349e+19,
 0.0,
 -3.689349e+19,
 1.0842022e-19,
 1.0842022e-19,
 2.0,
 0.0,
 0.0,
 0.0,
 -2.0,
 -3.689349e+19,
 2.0,
 0.0,
 0.0,
 0.0,
 -3.689349e+19,
 0.0,
 2.0,
 3.689349e+19,
 -3.689349e+19,
 0.0,
 -2.0,
 -2.0,
 -3.689349e+19,
 -2.0,
 -3.689349e+19,
 1.0842022e-19,
 0.0,
 2.0,
 0.0,
 0.0,
 1.0842022e-19,
 -1.0842022e-19,
 3.689349e+19,
 0.0,
 3.689349e+19,
 0.0,
 0.0,
 0.0,
 0.0,
 -2.0,
 -2.0,
 -1.0842022e-19,
 1.0842022e-19,
 1.0842022e-19,
 1.0842022e-19,
 -1.0842022e-19,
 2

In [14]:
thrones2vec.wv.similarity('😍', '💜')

  if np.issubdtype(vec.dtype, np.int):


-2.0