In [1]:
import csv
import itertools
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.initializers import Zeros
from tensorflow.keras.layers import Layer, Dot
from tensorflow.keras.losses import cosine_similarity, Loss
from tensorflow.keras.regularizers import Regularizer
from web.embedding import Embedding
from web.evaluate import evaluate_on_all


In [2]:
# Calculates cosine similarity between words based on synonyms, antonyms, hypernyms, hyponyms, and meronyms.
class SemanticSim(object):
    """Semantic loss term computed on randomly sampled, semantically related word pairs.

    Five relation files (``synonyms``, ``antonyms``, ``hypernyms``, ``hyponyms``,
    ``meronyms``) are read from ``<sem_info_path>/<name>.txt``. Every line is a
    whitespace-separated group of words; words missing from ``voc2id`` are skipped
    and every unordered pair of the remaining ids becomes one training pair.

    Calling the object with an embedding matrix samples pairs from each relation
    (with replacement), evaluates ``cosine_similarity`` on each pair, and returns
    the sampled indices together with the scaled loss.

    NOTE(review): ``cosine_similarity`` here is ``tf.keras.losses.cosine_similarity``,
    which is documented as the *negative* cosine similarity in TF 2.x - confirm for the
    installed version. The same formula is applied to all five relations, antonyms
    included.

    @param sem_info_path (str): Directory holding the five ``<name>.txt`` files.
    @param voc2id (dict): Maps a word to its row index in the embedding matrix.
    @param relu_syn, relu_ant, relu_hyper, relu_hypo, relu_mer (bool): If true, the
        per-pair value of that relation is passed through ReLU before summing.
    @param gamma (float): Multiplier applied to the final loss.
    @param sample_size (int): Requested number of pairs per call; capped at the number
        of available pairs and then replaced by the sum of the per-relation quotas.
    @raises ValueError: If none of the relation files yields a single pair.
    """

    def __init__(self, sem_info_path, voc2id,
                 relu_syn=False, relu_ant=False,
                 relu_hyper=False, relu_hypo=False,
                 relu_mer=False, gamma=0.1, sample_size=1000):

        # Initialize variables.
        self.sem_info_path = sem_info_path
        self.voc2id = voc2id
        self.relu_syn = relu_syn
        self.relu_ant = relu_ant
        self.relu_hyper = relu_hyper
        self.relu_hypo = relu_hypo
        self.relu_mer = relu_mer
        self.gamma = gamma
        self.sample_size = sample_size

        # Load semantic_info files.
        self.syn = self._load_file("synonyms")
        self.ant = self._load_file("antonyms")
        self.hyper = self._load_file("hypernyms")
        self.hypo = self._load_file("hyponyms")
        self.mer = self._load_file("meronyms")

        # Calculate sample size for each word class.
        self.n_samples = len(self.syn) + len(self.ant) + len(self.hyper) + len(self.hypo) + len(self.mer)
        if self.n_samples == 0:
            # Without this guard the quota computation below divides by zero.
            raise ValueError(f"No usable word pairs found in {self.sem_info_path}.")
        if self.sample_size > self.n_samples:
            self.sample_size = self.n_samples
        # All quotas are derived from the (capped) requested size before it is overwritten.
        self.n_syn = self._quota(self.syn)
        self.n_ant = self._quota(self.ant)
        self.n_hyper = self._quota(self.hyper)
        self.n_hypo = self._quota(self.hypo)
        self.n_mer = self._quota(self.mer)
        self.sample_size = self.n_syn + self.n_ant + self.n_hyper + self.n_hypo + self.n_mer

    def _quota(self, pairs):
        """Number of pairs to draw per call from one relation: its proportional share plus one.

        A relation without any pairs gets a quota of zero so it is never sampled.
        """
        if len(pairs) == 0:
            return 0
        return int(len(pairs) / self.n_samples * self.sample_size) + 1

    def _load_file(self, name):
        """Read ``<sem_info_path>/<name>.txt`` and return an (n_pairs, 2) integer array of id pairs."""
        pairs = list()
        with open(f"{self.sem_info_path}/{name}.txt", 'r') as file:
            for line in file:
                # Out-of-vocabulary words are dropped on purpose.
                inds = [self.voc2id[word] for word in line.split() if word in self.voc2id]
                pairs.extend(itertools.combinations(inds, 2))
        # The reshape keeps an empty relation two-dimensional, shape (0, 2).
        return np.asarray(pairs, dtype=np.int64).reshape(-1, 2)

    def _sampled_similarity(self, x, pairs, n_pairs, use_relu):
        """Sample ``n_pairs`` rows of ``pairs`` and sum their pairwise ``cosine_similarity``.

        Returns ``(sampled_pairs, summed_value)``. For an empty relation nothing is
        sampled and ``(empty (0, 2) array, 0.0)`` is returned.
        """
        if n_pairs == 0 or len(pairs) == 0:
            return np.empty((0, 2), dtype=np.int64), 0.0
        ind = np.random.randint(0, len(pairs), size=(n_pairs,))
        sampled = pairs[ind]
        # y has one row per sampled pair: y[:, 0] and y[:, 1] are the two word vectors.
        y = tf.gather(x, sampled, axis=0)
        sim = cosine_similarity(y[:, 0], y[:, 1])
        # With use_relu only pairs with a positive value contribute - i.e. adjust outliers.
        if use_relu:
            sim = tf.nn.relu(sim)
        return sampled, tf.reduce_sum(sim)

    def __call__(self, x):
        '''Sample pairs from every relation and return ``(inds, gamma * loss)``.

        @param x (tf.Variable): Word embeddings of dimension (N x K).
        @return inds (list): Five arrays of sampled id pairs, in the order synonyms,
            antonyms, hypernyms, hyponyms, meronyms.
        @return loss: ``gamma`` times the mean per-pair value, offset by 1 unless
            ``relu_syn`` is set.
        '''
        relations = (
            (self.syn, self.n_syn, self.relu_syn),
            (self.ant, self.n_ant, self.relu_ant),
            (self.hyper, self.n_hyper, self.relu_hyper),
            (self.hypo, self.n_hypo, self.relu_hypo),
            (self.mer, self.n_mer, self.relu_mer),
        )

        inds = list()
        sim_sum = 0.0
        for pairs, n_pairs, use_relu in relations:
            sampled, sim = self._sampled_similarity(x, pairs, n_pairs, use_relu)
            inds.append(sampled)
            sim_sum = sim_sum + sim

        # SEMANTIC LOSS:
        # NOTE(review): the +1 offset is keyed on relu_syn alone, whatever the other relu_* flags are.
        mean_sim = sim_sum / self.sample_size
        total = mean_sim if self.relu_syn else 1 + mean_sim

        return inds, self.gamma * total

    def get_config(self):
        """Return the scalar hyperparameters (``gamma`` and the effective ``sample_size``)."""
        return {'gamma': float(self.gamma),
                'sample_size': int(self.sample_size)}

In [3]:
# Custom embedding layer, which takes pre-trained embeddings and does not limit size of vocabulary.
class WordEmbedding(Layer):
    """Trainable embedding table initialised from a pre-trained embedding text file.

    ``init_file`` is read as a space-separated table without header: column 0 is the
    word, the remaining columns are the vector. Row ``i`` of the layer is the vector
    of ``id2voc[i]``; ids whose word is not in the file get the vector of ``"UNK"``.

    Two copies are kept: ``init_emb`` (frozen copy of the pre-trained vectors) and
    ``w`` (trainable copy that ``call`` looks up).

    @param vocab (int): Number of rows; must match ``len(id2voc)``.
    @param dims (int): Vector dimensionality; must match the file.
    @param init_file (str): Path to the pre-trained embedding file (required).
    @param id2voc (dict): Maps row id to word (required).
    @param kernel_regularizer: Optional regularizer passed to ``add_weight``.
    @param name (str): Layer name.
    @raises ValueError: If ``init_file`` or ``id2voc`` is missing.
    @raises KeyError: If a word has to fall back to ``"UNK"`` but the file has no such row.
    """

    def __init__(self, vocab, dims, init_file=None, id2voc=None, kernel_regularizer=None, name=None):

        super(WordEmbedding, self).__init__(name=name)
        if init_file is None or id2voc is None:
            # Both default to None in the signature but the layer cannot be built without them.
            raise ValueError("WordEmbedding requires both init_file and id2voc.")
        self.vocab = vocab
        self.dims = dims
        self.id2voc = id2voc
        self.init_file = init_file
        self.init_emb = tf.Variable(Zeros()(shape=(self.vocab, self.dims), dtype=tf.float32), trainable=False)
        self.w = self.add_weight(shape=(self.vocab, self.dims),
                                 initializer="glorot_uniform",
                                 trainable=True,
                                 regularizer=kernel_regularizer,
                                 dtype=tf.float32)

        # Read the words verbatim: with pandas' defaults, vocabulary entries such as
        # "null", "nan" or "NA" are parsed as missing values and a '"' token opens a
        # quoted field, so those words would silently end up with the UNK vector.
        table = pd.read_csv(self.init_file, sep=' ', header=None, dtype={0: str},
                            keep_default_na=False, quoting=csv.QUOTE_NONE)
        tokens = table.iloc[:, 0].to_numpy()
        vectors = table.iloc[:, 1:].to_numpy(dtype=np.float32)

        # For duplicated words the last row wins.
        word2row = {word: row for row, word in enumerate(tokens)}
        unk_row = word2row.get("UNK")
        order = list()
        for i in range(len(self.id2voc)):
            try:
                order.append(word2row[self.id2voc[i]])
            except KeyError:
                # Either id i has no word or the word has no row in the file.
                if unk_row is None:
                    raise KeyError(f"id {i} has no pre-trained vector and "
                                   f"{self.init_file} contains no 'UNK' row") from None
                order.append(unk_row)
        pretrained = vectors[order]
        self.init_emb.assign(pretrained)
        self.w.assign(pretrained)

    def call(self, inputs):
        """Return the rows of the trainable table ``w`` selected by the integer ids in ``inputs``."""
        return tf.nn.embedding_lookup(self.w, inputs)

In [4]:
# Semantic correction model.
class SemCorrect(Model):
    """Fine-tunes pre-trained word embeddings with a semantic and a syntactic loss term.

    The semantic term comes from ``SemanticSim`` (sampled related word pairs). The
    syntactic term compares, for the sampled words, the dot products of their current
    vectors with all pre-trained vectors against the same dot products of their
    pre-trained vectors.

    @param vocab_size (int): Number of words / embedding rows.
    @param dim (int): Embedding dimensionality.
    @param voc2id (dict): Word -> row id.
    @param id2voc (dict): Row id -> word.
    @param emb_init_file (str): Pre-trained embedding file passed to ``WordEmbedding``.
    @param sem_info_path (str): Directory with the semantic relation files.
    @param relu_syn, relu_ant, relu_hyper, relu_hypo, relu_mer (bool): Forwarded to ``SemanticSim``.
    @param batch_size (int): Requested number of sampled pairs per step.
    @param num_batches (int): Optimisation steps per epoch.
    @param gamma (float): Regularization hyperparameter forwarded to ``SemanticSim``.
    @param output_name (str): Base name of the saved embedding file and of the logger.
    @param log_dir, config_dir (str): Forwarded to ``get_logger``.
    """

    def __init__(self, vocab_size, dim, voc2id, id2voc, emb_init_file, sem_info_path,
                 relu_syn=False, relu_ant=False, relu_hyper=False, relu_hypo=False, relu_mer=False,
                 batch_size=128, num_batches=100, gamma=0.1, output_name=None,
                 log_dir="./log/", config_dir="./config/"):

        super(SemCorrect, self).__init__()

        self.vocab = vocab_size
        self.dim = dim
        self.voc2id = voc2id
        self.id2voc = id2voc
        self.sem_info_path = sem_info_path
        self.emb_init_file = emb_init_file
        self.batch_size = batch_size
        self.num_batches = num_batches
        # Regularization hyperparameter.
        self.gamma = gamma
        self.output_name = output_name
        self.log_dir = log_dir
        self.config_dir = config_dir
        # If true, these will only correct outliers in each class.
        self.relu_syn = relu_syn
        self.relu_ant = relu_ant
        self.relu_hyper = relu_hyper
        self.relu_hypo = relu_hypo
        self.relu_mer = relu_mer

        self.best_results = 0
        self.best_int_avg = 0

        # NOTE(review): get_logger is neither defined nor imported in the visible
        # notebook cells - it must be provided before this class is instantiated.
        logger = get_logger(self.output_name, self.log_dir, self.config_dir)
        logger.setLevel("ERROR")

        self.sem_sim = SemanticSim(self.sem_info_path, self.voc2id, relu_syn=self.relu_syn,
                                   relu_ant=self.relu_ant, relu_hyper=self.relu_hyper,
                                   relu_hypo=self.relu_hypo, relu_mer=self.relu_mer,
                                   gamma=self.gamma, sample_size=self.batch_size)
        self.embeddings = WordEmbedding(self.vocab, self.dim, init_file=self.emb_init_file,
                                        id2voc=self.id2voc, name="word_embeddings")
        self.dot = Dot(axes=-1)

    def call(self, inputs):
        """Compute the total loss for one freshly sampled batch of word pairs.

        ``inputs`` is ignored: the batch is drawn inside ``SemanticSim``. Returns the
        sum of the semantic and the syntactic loss.
        """
        # Operate on this instance rather than on a notebook-global called ``model``.
        inds, sem_loss = self.sem_sim(self.embeddings.weights[0])
        # Flatten all sampled pairs, row by row, into one vector of word ids.
        sample_inds = np.concatenate([np.reshape(k, -1) for k in inds])

        # SYNTACTIC LOSS:
        # Pre-trained embeddings.
        init_emb = tf.expand_dims(self.embeddings.init_emb, 0)
        # Pre-trained and current samples.
        init_sample = tf.expand_dims(tf.nn.embedding_lookup(self.embeddings.init_emb, sample_inds), 0)
        curr_sample = tf.expand_dims(self.embeddings(sample_inds), 0)

        # Cosine similarity between dot(current (sample) embeddings and all initial embeddings)
        # and dot(initial (sample) embeddings and all initial embeddings).
        syn_loss = 1 + tf.reduce_mean(cosine_similarity(self.dot([curr_sample, init_emb]),
                                                        self.dot([init_sample, init_emb])))
        # TOTAL LOSS
        loss = sem_loss + syn_loss
        return loss

    def checkpoint(self):
        """Evaluate the current embeddings and save them if they are the best so far.

        The L2-normalised embeddings are scored with ``evaluate_on_all``; when the mean
        score is at least ``best_int_avg`` they are written to
        ``<cwd>/embeddings/<output_name>.txt`` (one ``word v1 v2 ...`` line per word).
        Returns the dict of rounded per-task scores.
        """
        embed_matrix = tf.math.l2_normalize(self.embeddings.weights[0], axis=1)
        words = [self.id2voc[i] for i in range(len(self.id2voc))]
        voc2vec = dict(zip(words, iter(embed_matrix.numpy())))
        embedding = Embedding.from_dict(voc2vec)
        results = evaluate_on_all(embedding)
        results = {key: round(val[0], 4) for key, val in results.items()}
        curr_int = np.mean(list(results.values()))

        if curr_int >= self.best_int_avg:
            out_dir = f"{os.getcwd()}/embeddings"
            os.makedirs(out_dir, exist_ok=True)
            with open(f"{out_dir}/{self.output_name}.txt", 'w') as file:
                for key, values in voc2vec.items():
                    file.write(key + "".join(f" {v}" for v in values) + '\n')
            self.best_results = results
            self.best_int_avg = curr_int
        return results

    def train(self, epochs):
        """Run ``epochs`` epochs of ``num_batches`` Adam steps each.

        A checkpoint is taken before every epoch and once more after the last one, so
        the weights produced by the final epoch are evaluated (and saved if best) too.
        """
        self.best_int_avg = 0
        for epoch in range(epochs):
            print("\nCreating checkpoint:")
            results = self.checkpoint()
            print(results)
            start_time = time.time()
            print(f"Epoch {epoch + 1}/{epochs}:\n")
            # NOTE(review): a fresh Adam per epoch discards its moment estimates every
            # num_batches steps - confirm this restart is intended.
            optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

            # Iterate over the batches of the dataset.
            for step in range(self.num_batches):
                with tf.GradientTape() as tape:
                    # The argument is a dummy; call() samples its own batch.
                    loss = self(0)
                grads = tape.gradient(loss, self.trainable_weights)
                optimizer.apply_gradients(zip(grads, self.trainable_weights))
                print(f"\rStep: {step + 1}/{self.num_batches}; Elapsed time: {time.time() - start_time:.2f}; \
                      Training loss: {loss:.4f}", end='\r')

        if epochs > 0:
            # Without this the last epoch's weights would never be scored or written.
            print("\nCreating checkpoint:")
            print(self.checkpoint())



In [5]:
# Input locations, resolved against the current working directory.
sem_info_path = f"{os.getcwd()}/semantic_info"
emb_init_file = f"{os.getcwd()}/embeddings/init_avg_emb.txt"

# Vocabulary: the first space-separated token of every line in the embedding file,
# mapped to that line's 0-based position (a repeated word keeps its last position).
with open(emb_init_file, 'r') as file:
    voc2id = {line.strip().split(' ')[0]: row for row, line in enumerate(file)}
id2voc = dict(zip(voc2id.values(), voc2id.keys()))
vocab_size = len(voc2id)
embed_dim = 300

# Build the model with every relu_* switch off.
model = SemCorrect(
    vocab_size, embed_dim, voc2id, id2voc, emb_init_file, sem_info_path,
    relu_syn=False, relu_ant=False, relu_hyper=False, relu_hypo=False, relu_mer=False,
    batch_size=1000, num_batches=50, gamma=0.1,
    output_name="fin_avg_emb_False_01",
)

In [7]:
# Install a blanket "ignore" filter so that Python warnings raised from here on
# (the evaluation task emits them) do not clutter the training output below.
# NOTE(review): the original note says the warnings still reach the logger - confirm.
# This filter is not limited to the evaluation code; it hides every warning category.
import warnings
warnings.filterwarnings("ignore")

In [8]:
# Train for 25 epochs; checkpoint scores and per-step progress are printed as it runs.
model.train(epochs=25)


Creating checkpoint:
{'AP': 0.2562, 'BLESS': 0.425, 'Battig': 0.2128, 'ESSLI_2c': 0.4444, 'ESSLI_2b': 0.675, 'ESSLI_1a': 0.4318, 'WS353': 0.2488, 'WS353R': 0.1394, 'WS353S': 0.3727, 'MEN': 0.1796, 'SimLex999': 0.1637, 'RW': 0.0871, 'RG65': 0.3842, 'MTurk': 0.3246, 'MSR': 0.0072, 'Google': 0.003, 'SemEval2012_2': 0.1001}
Epoch 1/25:

Step: 50/50; Elapsed time: 225.47;                       Training loss: 0.0856
Creating checkpoint:
{'AP': 0.2239, 'BLESS': 0.285, 'Battig': 0.1667, 'ESSLI_2c': 0.3556, 'ESSLI_2b': 0.575, 'ESSLI_1a': 0.4545, 'WS353': 0.1124, 'WS353R': 0.0799, 'WS353S': 0.1418, 'MEN': 0.1174, 'SimLex999': 0.1179, 'RW': 0.0497, 'RG65': 0.1763, 'MTurk': 0.2254, 'MSR': 0.0025, 'Google': 0.0017, 'SemEval2012_2': 0.0709}
Epoch 2/25:

Step: 50/50; Elapsed time: 224.79;                       Training loss: 0.0737
Creating checkpoint:
{'AP': 0.2114, 'BLESS': 0.275, 'Battig': 0.1648, 'ESSLI_2c': 0.4222, 'ESSLI_2b': 0.625, 'ESSLI_1a': 0.3636, 'WS353': 0.1137, 'WS353R': 0.0847, 'WS353