In [1]:
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as A
import pandas as pd
from sklearn.manifold import TSNE
import sklearn.manifold 
from matplotlib import pyplot as plt
import re
import codecs
from numpy  import array
from scipy import stats
# from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity
import os
from tensorboardX import SummaryWriter
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader


from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity

# PREPROCESSING

In [2]:
corpus_filename = '../data/extracted_emoji_sequences.txt' 

In [3]:
corpus = open(corpus_filename).read().splitlines()

In [4]:
def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

In [None]:
def onlyEmojiSequences(tokens):
    threshold_emojis = [x for x in tokens if len(x) > 1]
    return threshold_emojis

In [None]:
tokenized_corpus = tokenize_corpus(corpus)
emojiSequences = onlyEmojiSequences(tokenized_corpus)

In [None]:
vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

# this is just the very basic translation both ways plus the length of vocabulary

In [None]:
window_size = 8
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array

# sole purpose of this is to have pairs! of target and context word
# super simple once you have figured out the code!

# TRAINING

In [None]:
testWriter = SummaryWriter()



In [None]:
class LoadedDataSet(Dataset):

    def __init__(self, pairs):
        self.data = pairs

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        return context, target

In [None]:
def get_input_layer(indexes):
    x = torch.zeros(vocabulary_size, len(indexes)).float()
    for column in range(len(indexes)):
        for i in indexes:
            x[i][column] = 1.0
            return x

# this is a one hot encoded something

In [None]:
dimensionSize = 40
num_epochs = 20
lr = 0.001
batchSize = 150

inputLayer = torch.randn(dimensionSize, vocabulary_size, requires_grad=True)
outputLayer = torch.randn(vocabulary_size, dimensionSize, requires_grad=True)

loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([inputLayer, outputLayer], lr)

# print('total amount of batches {}'.format(len(idx_pairs) / batchSize))
for epo in range(num_epochs):
    priorLoss = 0
    
    dataset = LoadedDataSet(idx_pairs)
    loader = DataLoader(dataset, batchSize, shuffle=True)
    
    pbar = tqdm(loader)
#     pbar.set_description("[Epoch {}]".format(epo))

    for data, target in pbar:
        # one hot encoded tensor
        x = get_input_layer(data)

        # target word 
        y_true = target    

        # Hidden Layer: gradient magic happening ...
        z1 = torch.matmul(inputLayer, x)
        z2 = torch.matmul(outputLayer, z1)

        output = loss(torch.t(z2), y_true)  
        testWriter.add_scalar('lossvalue', output, epo)


#         priorLoss += output.item() 
        output.backward()

        optimizer.step()       
        optimizer.zero_grad()

        pbar.set_postfix(loss=output.item())
        
#     print(priorLoss)
    
    
testwriter.export_scalars_to_json("./all_scalars.json")
testWriter.close

100%|██████████| 9742/9742 [00:50<00:00, 192.49it/s, loss=7.49]
100%|██████████| 9742/9742 [00:56<00:00, 172.06it/s, loss=6.95]
 87%|████████▋ | 8429/9742 [00:50<00:07, 165.31it/s, loss=7.04]

In [None]:
if not os.path.exists("SOFTMAX_trained"):
    os.makedirs("SOFTMAX_trained")

In [None]:
torch.save(outputLayer, os.path.join("SOFTMAX_trained", "test#2.w2v"))

# EVALUATING TRAINED MODEL

In [None]:
loadedLayer = torch.load(os.path.join("SOFTMAX_trained", "test#2.w2v"))

In [None]:
# extracts the 508 Annotator Results as the Gold-Standard
corpus_filename = '../data/EmoSim508.json'
corpus = open(corpus_filename).read()
annotator_similarity_score_508 = list(array(re.findall('(?<=_Annotator_Agreement": )(.*?)(?=\})', corpus)))

# extract Wijeratne's Cosine_Similarities of the model which was trained on Google_Sense_Labels
google_sense_labels_score_508 = list(array(re.findall('(?<=Google_Sense_Label": )(.*?)(?=\,)', corpus)))

# glyph_pairs_1016
unicode_pairs_1016 = re.findall('(?<=unicodelong": "\\\)(.*?)(?=")', corpus)    
glyph_pairs_1016 = [codecs.decode(unicode_pairs_1016[x].replace(str('\\\\'),str('\\')).replace('_',''), 'unicode_escape') for x in range(len(unicode_pairs_1016))]

# computation of Cosine Similarity
goldstandard = []
selftrained = []
google_sense_labels = []
for x in range(len(annotator_similarity_score_508)):
    cosineSimilarity = None
    
    emoji1 = glyph_pairs_1016.pop(0)
    emoji2 = glyph_pairs_1016.pop(0)
    
    try:
        cosineSimilarity = cosine_similarity(loadedLayer.detach().cpu().numpy()[word2idx[emoji1]].reshape(-1,dimensionSize), loadedLayer.detach().cpu().numpy()[word2idx[emoji2]].reshape(-1,dimensionSize))[0][0]
    except:
        print('the cosine similarity between ' + emoji1 + ' and ' + emoji2 + ' could not be computed.')
    
    if(cosineSimilarity is not None):
        goldstandard.append(annotator_similarity_score_508.pop(0))
        selftrained.append(cosineSimilarity)
        google_sense_labels.append(float(google_sense_labels_score_508.pop(0)))
        

# skalierter GoldStandard
min_max_scaler = preprocessing.MinMaxScaler()
scaled_goldstandard = min_max_scaler.fit_transform(np.asarray(goldstandard).reshape(-1, 1))

print()

# computation of SPEARRANK CORRELATION COEFFICIENT
meinSPEARMAN = stats.spearmanr(goldstandard, selftrained)
seinSPEARMAN = stats.spearmanr(goldstandard, google_sense_labels)
print('mein Spearman: {}'.format(meinSPEARMAN.correlation))
print('sein Spearman: {}'.format(seinSPEARMAN.correlation))


# computation of MAE
meinMAE = mean_absolute_error(scaled_goldstandard, min_max_scaler.fit_transform(np.asarray(selftrained).reshape(-1, 1)))
seinMAE = mean_absolute_error(scaled_goldstandard, google_sense_labels)
print('mein MAE ist {}'.format(meinMAE))
print('sein MAE ist {}'.format(seinMAE))


# computation of MSE
meinMSE = mean_squared_error(scaled_goldstandard, min_max_scaler.fit_transform(np.asarray(selftrained).reshape(-1, 1)))
seinMSE = mean_squared_error(scaled_goldstandard, google_sense_labels)
print('mein MSE ist {}'.format(meinMSE))
print('sein MSE ist {}'.format(seinMSE))
