In [23]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from numpy  import array
from scipy import stats
from gensim.models.callbacks import CallbackAny2Vec
from sklearn import preprocessing

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity



# PREPROCESSING

In [24]:
corpus_filename = '../data/extracted_emoji_sequences.txt'

In [25]:
corpus = open(corpus_filename).read().splitlines()


In [26]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

In [27]:
def onlyEmojiSequences(tokens):
    threshold_emojis = [x for x in tokens if len(x) > 1]
    return threshold_emojis

In [28]:
tokenized_corpus = tokenize_corpus(corpus)
emojiSequences = onlyEmojiSequences(tokenized_corpus)

In [29]:
token_count = sum([len(sentence) for sentence in emojiSequences])
print("The corpus contains {0:,} emoji points.".format(token_count))

The corpus contains 610,256 emoji points.


# SETTING VARIABLES

In [30]:
# Dimensionality of the resulting word vectors.
num_features = 300

# Minimum word count threshold.
min_word_count = 3

# Number of threads to run in parallel.
num_workers = multiprocessing.cpu_count()
# num_workers = 1


# Context window length.
context_size = 4

# Downsample setting for frequent words.
downsampling = 1e-3

# Seed for the RNG, to make the results reproducible.
# remove later
seed = 1

emoji2vec = w2v.Word2Vec(
    sg=1,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    def __init__(self):
        self.epoch = 0
        self.tempLoss = 0
    def on_epoch_end(self, model):
        print("Epoch #{}  Loss-Value: {}".format(self.epoch, (emoji2vec.get_latest_training_loss() - self.tempLoss)))
        self.epoch += 1
        self.tempLoss = emoji2vec.get_latest_training_loss()
        
epoch_logger = EpochLogger()
emoji2vec.build_vocab(emojiSequences, progress_per=2)

# TRAINING

In [31]:
emoji2vec.train(emojiSequences, total_examples=emoji2vec.corpus_count, epochs = 10, compute_loss=True, callbacks=[epoch_logger])

Epoch #0  Loss-Value: 136419.6875
Epoch #1  Loss-Value: 127308.78125
Epoch #2  Loss-Value: 86842.3125
Epoch #3  Loss-Value: 73771.375
Epoch #4  Loss-Value: 88124.03125
Epoch #5  Loss-Value: 82460.3125
Epoch #6  Loss-Value: 97178.375
Epoch #7  Loss-Value: 79529.5
Epoch #8  Loss-Value: 71072.6875
Epoch #9  Loss-Value: 82730.5625


(3889461, 6102560)

In [32]:
if not os.path.exists("trained"):
    os.makedirs("trained")
emoji2vec.save(os.path.join("trained", "canIseed.w2v"))

# EVALUATING TRAINED MODEL

In [33]:
emoji2vec = w2v.Word2Vec.load(os.path.join("trained", "canIseed.w2v"))

In [38]:
# extracts the 508 Annotator Results as the Gold-Standard
corpus_filename = '../data/EmoSim508.json'
corpus = open(corpus_filename).read()
annotator_similarity_score_508 = list(array(re.findall('(?<=_Annotator_Agreement": )(.*?)(?=\})', corpus)))

# extract Wijeratne's Cosine_Similarities of the model which was trained on Google_Sense_Labels
google_sense_labels_score_508 = list(array(re.findall('(?<=Google_Sense_Label": )(.*?)(?=\,)', corpus)))

# glyph_pairs_1016
unicode_pairs_1016 = re.findall('(?<=unicodelong": "\\\)(.*?)(?=")', corpus)    
glyph_pairs_1016 = [codecs.decode(unicode_pairs_1016[x].replace(str('\\\\'),str('\\')).replace('_',''), 'unicode_escape') for x in range(len(unicode_pairs_1016))]

# computation of Cosine Similarity
goldstandard = []
selftrained = []
google_sense_labels = []
for x in range(len(annotator_similarity_score_508)):
    cosineSimilarity = None
    
    emoji1 = glyph_pairs_1016.pop(0)
    emoji2 = glyph_pairs_1016.pop(0)

    try:
#         cosineSimilarity = emoji2vec.wv.similarity(emoji1, emoji2)
        cosineSimilarity = cosine_similarity(emoji2vec.wv.get_vector(emoji1).reshape(-1, 300), emoji2vec.wv.get_vector(emoji2).reshape(-1, 300))
    except Exception as error:
        print(error)
#         print('the cosine similarity between ' + emoji1 + ' and ' + emoji2 + ' could not be computed.')
    
    if(cosineSimilarity is not None):
        goldstandard.append(annotator_similarity_score_508.pop(0))
        selftrained.append(cosineSimilarity)
        google_sense_labels.append(float(google_sense_labels_score_508.pop(0)))
        

# skalierter GoldStandard
# min_max_scaler = preprocessing.MinMaxScaler()
# scaled_goldstandard = min_max_scaler.fit_transform(np.asarray(goldstandard).reshape(-1, 1))

print()

# computation of SPEARRANK CORRELATION COEFFICIENT
meinSPEARMAN = stats.spearmanr(goldstandard, np.reshape(selftrained, (-1,1)))
seinSPEARMAN = stats.spearmanr(goldstandard, google_sense_labels)
print('mein Spearman: {}'.format(meinSPEARMAN.correlation))
print('sein Spearman: {}'.format(seinSPEARMAN.correlation))


# # computation of MAE
# meinMAE = mean_absolute_error(scaled_goldstandard, selftrained)
# seinMAE = mean_absolute_error(scaled_goldstandard, google_sense_labels)
# print('mein MAE ist {}'.format(meinMAE))
# print('sein MAE ist {}'.format(seinMAE))


# # computation of MSE
# meinMSE = mean_squared_error(scaled_goldstandard, selftrained)
# seinMSE = mean_squared_error(scaled_goldstandard, google_sense_labels)
# print('mein MSE ist {}'.format(meinMSE))
# print('sein MSE ist {}'.format(seinMSE))




"word '🇬🇧' not in vocabulary"
"word '🇺🇸' not in vocabulary"
"word '🇺🇸' not in vocabulary"
"word '🇺🇸' not in vocabulary"
"word '🇳🇬' not in vocabulary"
"word '🇳🇬' not in vocabulary"
"word '🇳🇬' not in vocabulary"
"word '🇳🇬' not in vocabulary"
"word '🇳🇬' not in vocabulary"
"word '↪' not in vocabulary"

mein Spearman: 0.5729216737445608
sein Spearman: 0.7609726910462977




In [None]:
print(type(goldstandard[4]))

In [43]:
emoji2vec.wv.get_vector(emoji1).reshape(-1, 300)

array([[ 0.0113763 , -0.09432855, -0.01759987,  0.03411459,  0.08257309,
        -0.16001141, -0.06705479, -0.00603727,  0.06634685,  0.07260861,
        -0.04643961,  0.13102265, -0.03373604,  0.0737769 ,  0.0411712 ,
         0.04800228, -0.04882775, -0.00764793,  0.03592939, -0.02060673,
         0.10502496,  0.0054965 , -0.00784748, -0.036717  , -0.01199365,
         0.06115178,  0.01366161, -0.00773022, -0.09187113,  0.02630426,
         0.06331287,  0.08376262,  0.03039989, -0.09481341, -0.0909183 ,
         0.0226414 ,  0.06365262, -0.15650192,  0.026419  , -0.02922664,
        -0.03428204, -0.03727794,  0.03174405, -0.18580683,  0.00160092,
        -0.12204134, -0.07231367, -0.02418649, -0.05882958, -0.15579382,
         0.06736479,  0.01461748, -0.04428886, -0.01539759, -0.07367848,
        -0.06376503,  0.07014366,  0.10850102, -0.03635043,  0.0810407 ,
         0.0008184 , -0.06686612, -0.05080912,  0.02183636, -0.19613698,
         0.02239405, -0.01577152,  0.03986926,  0.0