In [2]:
from __future__ import absolute_import, division, print_function

In [3]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [4]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [5]:
from numpy  import array
from scipy import stats

In [6]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [7]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [8]:
corpus_filename = '../data/extracted_emoji_sequences.txt'

In [9]:
corpus = open(corpus_filename).read().splitlines()


In [10]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

In [11]:
def onlyEmojiSequences(tokens):
    threshold_emojis = [x for x in tokens if len(x) > 1]
    return threshold_emojis

In [12]:
tokenized_corpus = tokenize_corpus(corpus)
emojiSequences = onlyEmojiSequences(tokenized_corpus)

In [13]:
token_count = sum([len(sentence) for sentence in emojiSequences])
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 610,256 tokens


# training the model

In [14]:
# Dimensionality of the resulting word vectors.
num_features = 300

# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
num_workers = multiprocessing.cpu_count()

# Context window length.
context_size = 2

# Downsample setting for frequent words.
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
seed = 1

# think of how to set those variables so that variables from different tweets are not learned from together!


In [15]:
thrones2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [16]:
thrones2vec.build_vocab(emojiSequences)

2019-01-14 10:15:48,439 : INFO : collecting all words and their counts
2019-01-14 10:15:48,440 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-14 10:15:48,447 : INFO : PROGRESS: at sentence #10000, processed 26070 words, keeping 818 word types
2019-01-14 10:15:48,455 : INFO : PROGRESS: at sentence #20000, processed 52738 words, keeping 935 word types
2019-01-14 10:15:48,463 : INFO : PROGRESS: at sentence #30000, processed 80617 words, keeping 993 word types
2019-01-14 10:15:48,472 : INFO : PROGRESS: at sentence #40000, processed 106997 words, keeping 1019 word types
2019-01-14 10:15:48,481 : INFO : PROGRESS: at sentence #50000, processed 134684 words, keeping 1038 word types
2019-01-14 10:15:48,490 : INFO : PROGRESS: at sentence #60000, processed 161638 words, keeping 1053 word types
2019-01-14 10:15:48,499 : INFO : PROGRESS: at sentence #70000, processed 189166 words, keeping 1065 word types
2019-01-14 10:15:48,513 : INFO : PROGRESS: at sentence #800

In [17]:
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))


Word2Vec vocabulary length: 1082


In [18]:
thrones2vec.wv.vocab

{'⚠': <gensim.models.keyedvectors.Vocab at 0x1a1db612b0>,
 '😀': <gensim.models.keyedvectors.Vocab at 0x1a1f1bbe48>,
 '🎷': <gensim.models.keyedvectors.Vocab at 0x1a1f1b59e8>,
 '🙌': <gensim.models.keyedvectors.Vocab at 0x1a1f2bacf8>,
 '💐': <gensim.models.keyedvectors.Vocab at 0x1a1f2bacc0>,
 '🌈': <gensim.models.keyedvectors.Vocab at 0x1a1f2bad30>,
 '🍸': <gensim.models.keyedvectors.Vocab at 0x1a1f2ba358>,
 '🎶': <gensim.models.keyedvectors.Vocab at 0x1a1f2ba828>,
 '💜': <gensim.models.keyedvectors.Vocab at 0x1a1f2ba668>,
 '🌼': <gensim.models.keyedvectors.Vocab at 0x1a1f2ba6d8>,
 '💛': <gensim.models.keyedvectors.Vocab at 0x1a1f2ba278>,
 '❤': <gensim.models.keyedvectors.Vocab at 0x1a1f2bad68>,
 '☮': <gensim.models.keyedvectors.Vocab at 0x1a1f2babe0>,
 '✨': <gensim.models.keyedvectors.Vocab at 0x1a1f2ba908>,
 '🏕': <gensim.models.keyedvectors.Vocab at 0x1a1f2bac88>,
 '💖': <gensim.models.keyedvectors.Vocab at 0x1a1f2baf28>,
 '🌤': <gensim.models.keyedvectors.Vocab at 0x1a1f2ba4e0>,
 '🌸': <gensim.

In [19]:
thrones2vec.train(emojiSequences, total_examples=thrones2vec.corpus_count, epochs = 1)

2019-01-14 10:15:50,195 : INFO : training model with 12 workers on 1082 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=2
2019-01-14 10:15:50,487 : INFO : worker thread finished; awaiting finish of 11 more threads
2019-01-14 10:15:50,490 : INFO : worker thread finished; awaiting finish of 10 more threads
2019-01-14 10:15:50,496 : INFO : worker thread finished; awaiting finish of 9 more threads
2019-01-14 10:15:50,497 : INFO : worker thread finished; awaiting finish of 8 more threads
2019-01-14 10:15:50,498 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-01-14 10:15:50,499 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-01-14 10:15:50,500 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-01-14 10:15:50,501 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-01-14 10:15:50,502 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-01-14 10:15:50,506 : INF

(388536, 610256)

In [20]:
if not os.path.exists("trained"):
    os.makedirs("trained")

In [21]:
thrones2vec.save(os.path.join("trained", "mostSimilarWorkButSimilarityComparisonDoesnt.w2v"))

2019-01-14 10:15:51,407 : INFO : saving Word2Vec object under trained/mostSimilarWorkButSimilarityComparisonDoesnt.w2v, separately None
2019-01-14 10:15:51,408 : INFO : not storing attribute vectors_norm
2019-01-14 10:15:51,409 : INFO : not storing attribute cum_table
2019-01-14 10:15:51,434 : INFO : saved trained/mostSimilarWorkButSimilarityComparisonDoesnt.w2v


# explore the trained model

In [32]:
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "mostSimilarWorkButSimilarityComparisonDoesnt.w2v"))

2019-01-14 10:16:55,093 : INFO : loading Word2Vec object from trained/mostSimilarWorkButSimilarityComparisonDoesnt.w2v
2019-01-14 10:16:55,111 : INFO : loading wv recursively from trained/mostSimilarWorkButSimilarityComparisonDoesnt.w2v.wv.* with mmap=None
2019-01-14 10:16:55,112 : INFO : setting ignored attribute vectors_norm to None
2019-01-14 10:16:55,112 : INFO : loading vocabulary recursively from trained/mostSimilarWorkButSimilarityComparisonDoesnt.w2v.vocabulary.* with mmap=None
2019-01-14 10:16:55,113 : INFO : loading trainables recursively from trained/mostSimilarWorkButSimilarityComparisonDoesnt.w2v.trainables.* with mmap=None
2019-01-14 10:16:55,113 : INFO : setting ignored attribute cum_table to None
2019-01-14 10:16:55,114 : INFO : loaded trained/mostSimilarWorkButSimilarityComparisonDoesnt.w2v


In [33]:
vector1 = thrones2vec.wv['😳']

In [34]:
vector2 = thrones2vec.wv['😍']

In [35]:
vector3 = thrones2vec.wv['❤️']

KeyError: "word '❤️' not in vocabulary"

In [39]:
thrones2vec.wv.similarity('🎶', '💜')

0.47109443432216425

In [37]:
thrones2vec.wv.vocab

{'⚠': <gensim.models.keyedvectors.Vocab at 0x1a28c4b390>,
 '😀': <gensim.models.keyedvectors.Vocab at 0x1a28c4ba90>,
 '🎷': <gensim.models.keyedvectors.Vocab at 0x1a28c4bbe0>,
 '🙌': <gensim.models.keyedvectors.Vocab at 0x1a28c4bac8>,
 '💐': <gensim.models.keyedvectors.Vocab at 0x1a28c4bc88>,
 '🌈': <gensim.models.keyedvectors.Vocab at 0x1a28c4b080>,
 '🍸': <gensim.models.keyedvectors.Vocab at 0x1a28c4bb70>,
 '🎶': <gensim.models.keyedvectors.Vocab at 0x1a28c4bb00>,
 '💜': <gensim.models.keyedvectors.Vocab at 0x1a28c4b9e8>,
 '🌼': <gensim.models.keyedvectors.Vocab at 0x1a28c4bd30>,
 '💛': <gensim.models.keyedvectors.Vocab at 0x1a28c4bcc0>,
 '❤': <gensim.models.keyedvectors.Vocab at 0x1a28c4bc50>,
 '☮': <gensim.models.keyedvectors.Vocab at 0x1a28c4bd68>,
 '✨': <gensim.models.keyedvectors.Vocab at 0x1a28c4bda0>,
 '🏕': <gensim.models.keyedvectors.Vocab at 0x1a28c4bdd8>,
 '💖': <gensim.models.keyedvectors.Vocab at 0x1a28c4be10>,
 '🌤': <gensim.models.keyedvectors.Vocab at 0x1a28c4be48>,
 '🌸': <gensim.

In [36]:
thrones2vec.wv.most_similar('😍')


  if np.issubdtype(vec.dtype, np.int):


[('😘', 0.9771696925163269),
 ('❤️', 0.9631988406181335),
 ('😢', 0.1335771232843399),
 ('🙁', 0.11866120994091034),
 ('😳', 0.10258054733276367),
 ('🤔', 0.09710533171892166)]

In [682]:
# extracts the 508 Annotator Results as the Gold-Standard
corpus_filename = '../data/EmoSim508.json'
corpus = open(corpus_filename).read()
annotator_similarity_score_508 = list(array(re.findall('(?<=_Annotator_Agreement": )(.*?)(?=\})', corpus)))

# glyph_pairs_1016
unicode_pairs_1016 = re.findall('(?<=unicodelong": "\\\)(.*?)(?=")', corpus)    
glyph_pairs_1016 = [codecs.decode(unicode_pairs_1016[x].replace(str('\\\\'),str('\\')).replace('_',''), 'unicode_escape') for x in range(len(unicode_pairs_1016))]

# fdfdsfaf
goldstandard = []
selftrained = []
for x in range(len(annotator_similarity_score_508)):
    cosineSimilarity = None
    
    emoji1 = glyph_pairs_1016.pop(0)
    emoji2 = glyph_pairs_1016.pop(0)
    
    try:
        cosineSimilarity = thrones2vec.wv.similarity(emoji1, emoji2)
    except:
        print('the cosine similarity between ' + emoji1 + ' and ' + emoji2 + ' could not be computed.')
    
    if(cosineSimilarity is not None):
        goldstandard.append(cosineSimilarity)
        selftrained.append(annotator_similarity_score_508.pop(0))
print(len(goldstandard))
print(len(selftrained))

spearmanRank = stats.spearmanr(goldstandard, selftrained)

print('Der Spearman Rank Correlation Coefficient is {}'.format(spearmanRank))

the cosine similarity between 🎵 and 🎶 could not be computed.
the cosine similarity between 🎊 and 🎉 could not be computed.
the cosine similarity between ☺ and 😊 could not be computed.
the cosine similarity between ❤ and 💞 could not be computed.
the cosine similarity between 💕 and ❤ could not be computed.
the cosine similarity between 💞 and 💕 could not be computed.
the cosine similarity between 💘 and 💕 could not be computed.
the cosine similarity between 💕 and 😍 could not be computed.
the cosine similarity between 💜 and 💙 could not be computed.
the cosine similarity between 💗 and ❤ could not be computed.
the cosine similarity between 💗 and 💕 could not be computed.
the cosine similarity between ❤ and 💓 could not be computed.
the cosine similarity between 🎤 and 🎶 could not be computed.
the cosine similarity between 💖 and 💕 could not be computed.
the cosine similarity between 💕 and 💓 could not be computed.
the cosine similarity between ❤ and 💙 could not be computed.
the cosine similarity be

  if np.issubdtype(vec.dtype, np.int):


In [673]:
goldstandard

[0.97716975]