In [1]:
from models import Model
from helper import *
import tensorflow as tf, time, ctypes
from sparse import COO
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer, Softmax, ReLU, Flatten, Dense, Dot
from tensorflow.keras.regularizers import Regularizer
import pandas as pd
from tensorflow.python.keras import backend as K
from tensorflow.keras.losses import cosine_similarity, Loss
from web.embedding import Embedding
from web.evaluate import evaluate_on_all
from tensorflow.keras.initializers import Zeros


In [2]:
class Args(object):
    def __init__(self):
        self.gpu = '0'
        self.name = 'test_run'
        self.embed_loc = None
        self.embed_dim = 300
        self.total_sents = 56974869
        self.lr = 0.001
        self.batch_size = 128
        self.max_epochs = 50
        self.l2 = 0.00001
        self.seed = 1234
        self.sample = 1e-4
        self.num_neg = 100
        self.side_int = 10000
        self.gcn_layer = 1
        self.dropout = 1.0
        self.opt = 'adam'
        self.onlyDump = 'store_true'
        self.context = False
        self.restore ='store_true'
        self.emb_dir = './embeddings/'
        self.log_dir = './log/'
        self.config_dir ='./config/'
        self.max_sent_len = 50
        self.max_dep_len = 800
        self.gamma = 0.1

p = Args()

In [3]:
class SemanticSim(object):
   
    def __init__(self, sem_info_path, voc2id,
                 gamma=0.1, sample_size=1000):
        
        # super(SemanticSim, self).__init__()
        
        self.sem_info_path = sem_info_path
        self.voc2id = voc2id
        self.gamma = gamma
        self.sample_size = sample_size
        
        self.syn = self._load_file("synonyms")
        self.ant = self._load_file("antonyms")
        self.hyper = self._load_file("hypernyms")
        self.hypo = self._load_file("hyponyms")
        self.mer = self._load_file("meronyms")

        self.n_samples = len(self.syn) + len(self.ant) + len(self.hyper) + len(self.hypo) + len(self.mer)
        if self.sample_size > self.n_samples: 
            self.sample_size = self.n_samples
        self.n_syn = int(len(self.syn) / self.n_samples * self.sample_size) + 1
        self.n_ant = int(len(self.ant) / self.n_samples * self.sample_size) + 1
        self.n_hyper = int(len(self.hyper) / self.n_samples * self.sample_size) + 1
        self.n_hypo = int(len(self.hypo) / self.n_samples * self.sample_size) + 1
        self.n_mer = int(len(self.mer) / self.n_samples * self.sample_size) + 1
        self.sample_size = self.n_syn + self.n_ant + self.n_hyper + self.n_hypo + self.n_mer
           
    def _load_file(self, name):
        nym = list()
        with open(f"{self.sem_info_path}/{name}.txt", 'r') as file:
            for line in file:
                x = line.strip('\n').split()
                inds = list()
                for i in x:
                    try: inds.append(self.voc2id[i])
                    except KeyError: pass
                for i in itertools.combinations(inds, 2):
                    nym.append(i)
            return np.asarray(nym)        
            
    def __call__(self, x):
        
        inds = list()
        
        # Synonyms
        ind = np.random.randint(0, len(self.syn), size=(self.n_syn,))
        y = tf.gather(x, self.syn[ind], axis=0)
        syn_sim = tf.reduce_sum(tf.math.reduce_euclidean_norm(y[:, 0] - y[:, 1], axis=1))
        
        # Antonyms
        ind = np.random.randint(0, len(self.ant), size=(self.n_ant,))
        y = tf.gather(x, self.ant[ind], axis=0)
        ant_sim = tf.reduce_sum(tf.math.reduce_euclidean_norm(y[:, 0] - y[:, 1], axis=1))
        
        # Hypernyms
        ind = np.random.randint(0, len(self.hyper), size=(self.n_hyper,))
        y = tf.gather(x, self.hyper[ind], axis=0)
        hyper_sim = tf.reduce_sum(tf.math.reduce_euclidean_norm(y[:, 0] - y[:, 1], axis=1))
        #inds.append(self.hyper[ind])
        
        # Hyponyms
        ind = np.random.randint(0, len(self.hypo), size=(self.n_hypo,))
        y = tf.gather(x, self.hypo[ind], axis=0)
        hypo_sim = tf.reduce_sum(tf.math.reduce_euclidean_norm(y[:, 0] - y[:, 1], axis=1))
        
        # Meronyms
        ind = np.random.randint(0, len(self.mer), size=(self.n_mer,))
        y = tf.gather(x, self.mer[ind], axis=0)
        mer_sim = tf.reduce_sum(tf.math.reduce_euclidean_norm(y[:, 0] - y[:, 1], axis=1))

        total = (syn_sim - ant_sim + hyper_sim + hypo_sim + mer_sim)
        
        return self.gamma * total

    def get_config(self):
        return {'gamma': float(self.gamma), 
                'sample_size': int(self.sample_size)}

In [4]:
class WordEmbedding(Layer):
    def __init__(self, vocab, dims, init_file=None, id2voc=None, kernel_regularizer=None, name=None):
        
        super(WordEmbedding, self).__init__(name=name)
        self.vocab = vocab
        self.dims = dims
        self.id2voc = id2voc
        self.init_file = init_file        
        self.init_emb = tf.Variable(Zeros()(shape=(self.vocab, self.dims), dtype=tf.float32), trainable=False)        
        self.w = self.add_weight(shape=(self.vocab, self.dims),
                                 initializer="glorot_uniform",
                                 trainable=True, 
                                 regularizer=kernel_regularizer,
                                 dtype=tf.float32)

        x = np.asarray(pd.read_csv(self.init_file, sep=' ', header=None))
        word2row = {word: int(i) for i, word in enumerate(x[:, 0])}
        order = list()
        for i in range(len(self.id2voc)):
            try:
                order.append(word2row[self.id2voc[i]])
            except KeyError:
                order.append(word2row["UNK"])
        x = x[order, 1:]                
        self.init_emb.assign(x)
        self.w.assign(x)        
        
    def call(self, inputs):
        return tf.nn.embedding_lookup(self.w, inputs)

In [5]:
class SemCorrect(Model):

    def __init__(self, vocab_size, dim, voc2id, id2voc, emb_init_file, sem_info_path,
                 batch_size=128, num_batches=100, gamma=0.1, output_name=None, 
                 log_dir="./log/", config_dir="./config/"):
                
        super(SemCorrect, self).__init__()
        
        self.vocab = vocab_size
        self.dim = dim
        self.voc2id = voc2id
        self.id2voc = id2voc
        self.sem_info_path = sem_info_path
        self.emb_init_file = emb_init_file
        self.batch_size = batch_size
        self.num_batches = num_batches
        self.gamma = gamma
        self.output_name = output_name
        self.log_dir = log_dir
        self.config_dir = config_dir

        self.best_results = 0
        self.best_int_avg = 0
        
        logger = get_logger(self.output_name, self.log_dir, self.config_dir)
        logger.setLevel("ERROR")
        
        self.sem_sim = SemanticSim(self.sem_info_path, self.voc2id, 
                                   gamma=self.gamma, sample_size=self.batch_size)
        self.embeddings = WordEmbedding(self.vocab, self.dim, init_file=self.emb_init_file, 
                                        id2voc=self.id2voc, name="word_embeddings")
        self.dot = Dot(axes=-1)
        
    def call(self, inputs):
        
        sem_dist = self.sem_sim(self.embeddings.weights[0])
        
        emb = self.embeddings.weights[0]
        # Space Preservation        
        syn_score = tf.reduce_mean(self.dot([emb, self.embeddings.init_emb])) * self.sem_sim.sample_size
        
        syn_dist = tf.reduce_mean(tf.math.sqrt(emb_sq - 
                                  2 * tf.matmul(self.embeddings.weights[0], 
                                                tf.transpose(self.embeddings.init_emb)) + 
                                  tf.transpose(self.embeddings.init_emb_sq))) * self.sem_sim.sample_size
        # Calculate loss.
        loss = sem_dist + syn_dist
        
        print(sem_dist, syn_dist)
        
        return loss

    def checkpoint(self):
        embed_matrix = tf.math.l2_normalize(self.embeddings.weights[0], axis=1)
        words = [self.id2voc[i] for i in range(len(self.id2voc))]
        voc2vec = dict(zip(words, iter(embed_matrix.numpy())))
        embedding = Embedding.from_dict(voc2vec)
        results = evaluate_on_all(embedding)
        results = {key: round(val[0], 4) for key, val in results.items()}
        curr_int = np.mean(list(results.values()))

        if curr_int >= self.best_int_avg:
            with open(f"{os.getcwd()}/embeddings/{self.output_name}.txt", 'w') as file:
                for key, values in voc2vec.items():
                    file.write(key)
                    [file.write(f" {v}") for v in values]
                    file.write('\n')
            self.best_results = results
            self.best_int_avg = curr_int
        return results
        
        
    def train(self, epochs):

        self.best_int_avg = 0
        for epoch in range(epochs):
            start_time = time.time()
            print(f"\nEpoch {epoch + 1}/{epochs}:\n")
            optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

            # Iterate over the batches of the dataset.
            for step in range(self.num_batches):
                with tf.GradientTape() as tape:
                    loss = model(0)
                grads = tape.gradient(loss, self.trainable_weights)
                optimizer.apply_gradients(zip(grads, self.trainable_weights))
                print(f"\rStep: {step + 1}/{self.num_batches}; Elapsed time: {time.time() - start_time:.2f}; \
                      Training loss: {loss:.4f}", end='\r') 

            print("Checkpoint:")
            results = self.checkpoint()
            print(results)


In [6]:
sem_info_path = f"{os.getcwd()}/semantic_info"
emb_init_file = f"{os.getcwd()}/embeddings/init_syngcn_emb.txt"

voc2id = dict()
i = 0
with open(emb_init_file, 'r') as file:
    for line in file:
        voc2id[line.strip().split(' ')[0]] = i
        i += 1
id2voc = {v: k for k, v in voc2id.items()}
vocab_size = len(voc2id)

num_batches = 100
batch_size=128

In [7]:
model = SemCorrect(vocab_size, 300, voc2id, id2voc, emb_init_file, sem_info_path,
                   batch_size=10000, num_batches=50, gamma=1, output_name="fin_syngcn_emb_euc")

In [8]:
import warnings
warnings.filterwarnings("ignore")

In [14]:
model.dot([model.embeddings.weights[0], model.embeddings.init_emb]).shape

TensorShape([150000, 1])

ResourceExhaustedError: OOM when allocating tensor with shape[1,150000,150000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:BatchMatMulV2]

In [11]:
model.embeddings.init_emb

<tf.Variable 'Variable:0' shape=(150000, 300) dtype=float32, numpy=
array([[ 1.8015e-01, -1.7162e-01,  4.4809e-02, ..., -2.1986e-01,
         4.2269e-02,  6.4632e-02],
       [-8.3958e-02,  3.5026e-01,  8.1890e-03, ..., -4.6744e-02,
         7.7000e-02, -7.9842e-02],
       [-1.7330e-03,  3.7910e-03, -5.3470e-03, ..., -2.7700e-04,
         1.2932e-02,  4.5100e-04],
       ...,
       [ 1.4008e-01,  7.1165e-01,  7.9465e-01, ..., -3.1664e-01,
        -1.7053e-01, -2.4140e-01],
       [-4.1891e-01,  6.6092e-01, -4.1997e-01, ..., -6.7517e-01,
        -9.1229e-01, -3.1517e-01],
       [-2.5545e-01,  1.4126e-01,  3.7910e-03, ..., -1.0351e-02,
        -2.9522e-01,  5.2476e-01]], dtype=float32)>

In [9]:
model.train(5)


Epoch 1/5:



ResourceExhaustedError: OOM when allocating tensor with shape[150000,150000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:MatMul]

In [11]:
emb = model.embeddings.weights[0]