In [7]:
from __future__ import absolute_import, division, print_function

In [8]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [9]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [676]:
from numpy  import array
from scipy import stats

In [5]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [6]:
# Show INFO-level log records with timestamp and level; the gensim progress
# messages captured in this notebook's outputs are emitted through `logging`.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [7]:
# Path (relative to the notebook) of the text file holding the extracted emoji
# sequences; presumably one tweet's emojis per line — TODO confirm.
corpus_filename = '../data/extracted_emoji_sequences.txt'

In [8]:
# Read the emoji-sequence file into a list with one string per line.
# The context manager closes the file handle, and the explicit encoding keeps
# the emoji text from being decoded with a platform-dependent default.
with open(corpus_filename, encoding='utf-8') as corpus_file:
    corpus = corpus_file.read().splitlines()


In [9]:
def tokenize_corpus(corpus):
    """Split every entry of ``corpus`` on whitespace.

    Args:
        corpus: iterable of strings.

    Returns:
        A list with one token list per entry, in the original order.
    """
    tokenized = []
    for line in corpus:
        tokenized.append(line.split())
    return tokenized

In [10]:
def onlyEmojiSequences(tokens):
    """Keep only the token lists that contain more than one token.

    Args:
        tokens: iterable of token lists.

    Returns:
        A new list holding, in order, every entry of ``tokens`` whose
        length is greater than one.
    """
    sequences = []
    for sequence in tokens:
        if len(sequence) <= 1:
            continue
        sequences.append(sequence)
    return sequences

In [11]:
# Tokenize every corpus line, then drop the lines with fewer than two tokens;
# the remaining sequences are what the model is trained on.
tokenized_corpus = tokenize_corpus(corpus)
emojiSequences = onlyEmojiSequences(tokenized_corpus)

In [12]:
# Total number of tokens across all retained emoji sequences.
token_count = sum(map(len, emojiSequences))
print("The book corpus contains {0:,} tokens".format(token_count))

The book corpus contains 292,320 tokens


# training the model

In [13]:
# Hyperparameters passed to the Word2Vec constructor.

# Dimensionality of the resulting word vectors.
num_features = 300

# Minimum word count threshold (handed to Word2Vec as `min_count`).
min_word_count = 3

# Number of worker threads: one per CPU reported by the OS.
num_workers = multiprocessing.cpu_count()

# Context window length (handed to Word2Vec as `window`).
context_size = 2

# Downsample setting for frequent words.
downsampling = 1e-3

# Seed for the RNG.
# NOTE(review): per the gensim documentation, a fully reproducible run also
# needs a single worker thread (and a fixed PYTHONHASHSEED on Python 3); with
# one worker per CPU the results may differ between runs — confirm intent.
seed = 1

# TODO: confirm that emojis from different tweets are never learned from
# together, i.e. that no context window spans two tweets (presumably each
# corpus line is one tweet and forms its own training sentence — verify).


In [14]:
# Untrained Word2Vec model configured from the hyperparameter variables;
# the vocabulary is built and training is run in separate steps.
thrones2vec = w2v.Word2Vec(
    # vector shape and context
    size=num_features,
    window=context_size,
    # vocabulary pruning and frequent-token downsampling
    min_count=min_word_count,
    sample=downsampling,
    # training algorithm (the training log reports this as sg=1)
    sg=1,
    # execution
    workers=num_workers,
    seed=seed,
)

In [15]:
# Scan the emoji sequences to build the model's vocabulary before training.
thrones2vec.build_vocab(emojiSequences)

2019-01-10 14:12:41,317 : INFO : collecting all words and their counts
2019-01-10 14:12:41,317 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-01-10 14:12:41,324 : INFO : PROGRESS: at sentence #10000, processed 35000 words, keeping 7 word types
2019-01-10 14:12:41,330 : INFO : PROGRESS: at sentence #20000, processed 70000 words, keeping 7 word types
2019-01-10 14:12:41,337 : INFO : PROGRESS: at sentence #30000, processed 105000 words, keeping 7 word types
2019-01-10 14:12:41,346 : INFO : PROGRESS: at sentence #40000, processed 140000 words, keeping 7 word types
2019-01-10 14:12:41,354 : INFO : PROGRESS: at sentence #50000, processed 175000 words, keeping 7 word types
2019-01-10 14:12:41,361 : INFO : PROGRESS: at sentence #60000, processed 210000 words, keeping 7 word types
2019-01-10 14:12:41,370 : INFO : PROGRESS: at sentence #70000, processed 245000 words, keeping 7 word types
2019-01-10 14:12:41,379 : INFO : PROGRESS: at sentence #80000, processed 280

In [16]:
# Report the number of entries in the model's vocabulary.
print("Word2Vec vocabulary length:", len(thrones2vec.wv.vocab))


Word2Vec vocabulary length: 7


In [17]:
# Display the vocabulary mapping (token -> gensim Vocab entry).
thrones2vec.wv.vocab

{'😳': <gensim.models.keyedvectors.Vocab at 0x1a1bc7b1d0>,
 '😢': <gensim.models.keyedvectors.Vocab at 0x1a1cad9438>,
 '🙁': <gensim.models.keyedvectors.Vocab at 0x1a1cad9358>,
 '🤔': <gensim.models.keyedvectors.Vocab at 0x1a1cad9550>,
 '😘': <gensim.models.keyedvectors.Vocab at 0x1a1cad9390>,
 '😍': <gensim.models.keyedvectors.Vocab at 0x1a1cad93c8>,
 '❤️': <gensim.models.keyedvectors.Vocab at 0x1a1cad9470>}

In [18]:
# Train the model on the emoji sequences for a single epoch.
# NOTE(review): one pass over a corpus of this size may be too little for
# stable vectors — consider more epochs; confirm that epochs = 1 is intended.
thrones2vec.train(emojiSequences, total_examples=thrones2vec.corpus_count, epochs = 1)

2019-01-10 14:12:42,746 : INFO : training model with 8 workers on 7 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=2
2019-01-10 14:12:42,835 : INFO : worker thread finished; awaiting finish of 7 more threads
2019-01-10 14:12:42,840 : INFO : worker thread finished; awaiting finish of 6 more threads
2019-01-10 14:12:42,841 : INFO : worker thread finished; awaiting finish of 5 more threads
2019-01-10 14:12:42,842 : INFO : worker thread finished; awaiting finish of 4 more threads
2019-01-10 14:12:42,842 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-01-10 14:12:42,842 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-01-10 14:12:42,843 : INFO : worker thread finished; awaiting finish of 1 more threads
2019-01-10 14:12:42,843 : INFO : worker thread finished; awaiting finish of 0 more threads
2019-01-10 14:12:42,843 : INFO : EPOCH - 1 : training on 292320 raw words (26312 effective words) took 0.1s, 326814 effective 

(26312, 292320)

In [19]:
# Make sure the output directory for saved models exists. `exist_ok=True`
# replaces the separate existence check (which could race with another process
# creating the directory) and keeps the cell safe to re-run.
os.makedirs("trained", exist_ok=True)

In [37]:
# Persist the current model inside the "trained" directory.
thrones2vec.save(os.path.join("trained", "mostSimilarWorkButSimilarityComparisonDoesnt.w2v"))

2019-01-10 14:15:18,635 : INFO : saving Word2Vec object under trained/mostSimilarWorkButSimilarityComparisonDoesnt.w2v, separately None
2019-01-10 14:15:18,636 : INFO : not storing attribute vectors_norm
2019-01-10 14:15:18,636 : INFO : not storing attribute cum_table
2019-01-10 14:15:18,638 : INFO : saved trained/mostSimilarWorkButSimilarityComparisonDoesnt.w2v


# explore the trained model

In [21]:
# Replace the in-memory model with one loaded from disk.
# NOTE(review): this loads "testEmoji.w2v", not the
# "mostSimilarWorkButSimilarityComparisonDoesnt.w2v" file written by the save
# cell, so everything below runs against whatever model that file holds —
# confirm this is the intended model.
thrones2vec = w2v.Word2Vec.load(os.path.join("trained", "testEmoji.w2v"))

2019-01-10 14:12:44,647 : INFO : loading Word2Vec object from trained/testEmoji.w2v
2019-01-10 14:12:44,648 : INFO : loading wv recursively from trained/testEmoji.w2v.wv.* with mmap=None
2019-01-10 14:12:44,649 : INFO : setting ignored attribute vectors_norm to None
2019-01-10 14:12:44,649 : INFO : loading vocabulary recursively from trained/testEmoji.w2v.vocabulary.* with mmap=None
2019-01-10 14:12:44,650 : INFO : loading trainables recursively from trained/testEmoji.w2v.trainables.* with mmap=None
2019-01-10 14:12:44,651 : INFO : setting ignored attribute cum_table to None
2019-01-10 14:12:44,651 : INFO : loaded trained/testEmoji.w2v


In [22]:
# Look up the embedding vector stored for this emoji.
vector1 = thrones2vec.wv['😳']

In [23]:
# Look up the embedding vector stored for this emoji.
vector2 = thrones2vec.wv['😍']

In [23]:
# Look up the embedding vector stored for this emoji.
vector3 = thrones2vec.wv['❤️']

In [24]:
# Sanity check: similarity of an emoji with itself.
# NOTE(review): the recorded output is 0.0, whereas the cosine similarity of a
# non-zero vector with itself is 1.0; presumably the loaded model holds
# all-zero vectors for this token — verify the loaded model.
thrones2vec.wv.similarity('😳', '😳')

  if np.issubdtype(vec.dtype, np.int):


0.0

In [25]:
# Display the vocabulary of the model loaded from disk.
thrones2vec.wv.vocab

{'😳': <gensim.models.keyedvectors.Vocab at 0x1a1cad9cf8>,
 '😢': <gensim.models.keyedvectors.Vocab at 0x1a1cad9e10>,
 '🙁': <gensim.models.keyedvectors.Vocab at 0x1a1cad9e48>,
 '🤔': <gensim.models.keyedvectors.Vocab at 0x1a1cad9e80>,
 '😘': <gensim.models.keyedvectors.Vocab at 0x1a1cad9eb8>,
 '😍': <gensim.models.keyedvectors.Vocab at 0x1a1cad9ef0>,
 '❤️': <gensim.models.keyedvectors.Vocab at 0x1a1cad9f28>}

In [36]:
# List the vocabulary entries most similar to this emoji, with their scores.
thrones2vec.wv.most_similar('😍')


  if np.issubdtype(vec.dtype, np.int):


[('😘', 0.9771696925163269),
 ('❤️', 0.9631988406181335),
 ('😢', 0.1335771232843399),
 ('🙁', 0.11866120994091034),
 ('😳', 0.10258054733276367),
 ('🤔', 0.09710533171892166)]

In [682]:
# Evaluate the model against EmoSim508: extract the annotator similarity
# scores and the emoji pairs from the JSON text, compute the model's cosine
# similarity for every pair it can handle, and rank-correlate the two lists.
# NOTE(review): `corpus_filename` and `corpus` are rebound here from the
# emoji-sequence corpus to the EmoSim508 JSON; the names are kept in case later
# cells rely on them, but earlier cells re-run after this one will see the JSON.
corpus_filename = '../data/EmoSim508.json'
with open(corpus_filename, encoding='utf-8') as emosim_file:
    corpus = emosim_file.read()

# Annotator scores, converted to floats: ranking the raw regex matches would
# order them as strings (lexicographically) instead of numerically.
annotator_similarity_score_508 = [
    float(score.strip().strip('"'))
    for score in re.findall(r'(?<=_Annotator_Agreement": )(.*?)(?=\})', corpus)
]

# glyph_pairs_1016: the "unicodelong" escape sequences decoded into the actual
# emoji characters; consecutive entries (0 and 1, 2 and 3, ...) form one pair.
unicode_pairs_1016 = re.findall(r'(?<=unicodelong": "\\)(.*?)(?=")', corpus)
glyph_pairs_1016 = [
    codecs.decode(escaped.replace('\\\\', '\\').replace('_', ''), 'unicode_escape')
    for escaped in unicode_pairs_1016
]

# Collect (model similarity, annotator score) for every pair the model knows.
# NOTE(review): despite their names, `goldstandard` receives the model's cosine
# similarities and `selftrained` the annotator scores; the names are kept
# because a later cell displays `goldstandard`.
# NOTE(review): the logged failures include a heart emoji that also appears in
# the vocabulary listing; presumably the two differ by a variation selector —
# verify, and normalise the glyphs if so.
goldstandard = []
selftrained = []
emoji_pairs = zip(glyph_pairs_1016[0::2], glyph_pairs_1016[1::2])
for (emoji1, emoji2), annotator_score in zip(emoji_pairs, annotator_similarity_score_508):
    try:
        cosineSimilarity = thrones2vec.wv.similarity(emoji1, emoji2)
    except KeyError:
        # At least one emoji is missing from the model's vocabulary. The score
        # of this pair is skipped together with the pair, so the remaining
        # scores stay aligned with their own emoji pairs.
        print('the cosine similarity between ' + emoji1 + ' and ' + emoji2 + ' could not be computed.')
        continue
    goldstandard.append(cosineSimilarity)
    selftrained.append(annotator_score)
print(len(goldstandard))
print(len(selftrained))

spearmanRank = stats.spearmanr(goldstandard, selftrained)

print('Der Spearman Rank Correlation Coefficient is {}'.format(spearmanRank))

the cosine similarity between 🎵 and 🎶 could not be computed.
the cosine similarity between 🎊 and 🎉 could not be computed.
the cosine similarity between ☺ and 😊 could not be computed.
the cosine similarity between ❤ and 💞 could not be computed.
the cosine similarity between 💕 and ❤ could not be computed.
the cosine similarity between 💞 and 💕 could not be computed.
the cosine similarity between 💘 and 💕 could not be computed.
the cosine similarity between 💕 and 😍 could not be computed.
the cosine similarity between 💜 and 💙 could not be computed.
the cosine similarity between 💗 and ❤ could not be computed.
the cosine similarity between 💗 and 💕 could not be computed.
the cosine similarity between ❤ and 💓 could not be computed.
the cosine similarity between 🎤 and 🎶 could not be computed.
the cosine similarity between 💖 and 💕 could not be computed.
the cosine similarity between 💕 and 💓 could not be computed.
the cosine similarity between ❤ and 💙 could not be computed.
the cosine similarity be

  if np.issubdtype(vec.dtype, np.int):


In [673]:
# Inspect the collected values. Despite its name, this list holds the model's
# cosine similarities gathered in the evaluation cell, not the annotator scores.
goldstandard

[0.97716975]