In [1]:
# Standard library. `itertools` and `os` are used further down in the notebook
# and were previously only reachable through `from helper import *`.
import ctypes
import itertools
import os
import time

# NOTE: the relative order of the imports below is deliberately left unchanged:
# `tensorflow.keras.Model` has to be imported after `models.Model` so that the
# Keras class is the one bound to the name `Model`.
from models import Model
from helper import *
import tensorflow as tf, time, ctypes
from sparse import COO
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer, Softmax, ReLU, Flatten, Dense
from tensorflow.keras.regularizers import Regularizer
import pandas as pd
from tensorflow.python.keras import backend as K
from tensorflow.keras.losses import cosine_similarity, Loss
from web.embedding import Embedding
from web.evaluate import evaluate_on_all


In [2]:
class Args(object):
    """Container for the run configuration, exposed as plain attributes.

    Within this notebook the following attributes are read directly:
    `batch_size`, `max_sent_len`, `max_dep_len` and `num_neg` size the batch
    buffers, `context` decides whether `win_size` is forced to 0, and
    `embed_dim` is handed to the model constructor.

    NOTE(review): `win_size` is not defined here; it is only assigned by the
    data-loading cell when `context` is False.
    """

    def __init__(self):
        self.gpu = '0'
        self.name = 'test_run'
        self.embed_loc = None
        self.embed_dim = 300
        self.total_sents = 56974869
        self.lr = 0.001
        self.batch_size = 128
        self.max_epochs = 50
        self.l2 = 0.00001
        self.seed = 1234
        self.sample = 1e-4
        self.num_neg = 100
        self.side_int = 10000
        self.gcn_layer = 1
        self.dropout = 1.0
        self.opt = 'adam'
        # Boolean switches. These used to hold the string 'store_true' (the
        # name of an argparse action), which is always truthy, so any
        # `if p.onlyDump` / `if p.restore` check would have been taken.
        self.onlyDump = False
        self.context = False
        self.restore = False
        self.emb_dir = './embeddings/'
        self.log_dir = './log/'
        self.config_dir = './config/'
        self.max_sent_len = 50
        self.max_dep_len = 800
        self.gamma = 0.1

p = Args()

In [3]:
# Compile the C++ batch generator into the shared library `batchGen.so`, which the data-loading cell opens through ctypes.
! g++ batch_generator.cpp -o batchGen.so -fPIC -shared -pthread -O3 -march=native -std=c++11

In [4]:
# ---- Vocabulary, frequency and dependency-label mappings ---- #

# word -> integer id
voc2id = dict((word, int(idx)) for word, idx in read_mappings('./data/voc2id.txt').items())
# integer id -> corpus frequency
id2freq = dict((int(idx), int(freq)) for idx, freq in read_mappings('./data/id2freq.txt').items())
# integer id -> word (inverse of voc2id)
id2voc = dict((idx, word) for word, idx in voc2id.items())
vocab_size = len(voc2id)
wrd_list = [id2voc[idx] for idx in range(vocab_size)]
# dependency label -> integer id
de2id = dict((label, int(idx)) for label, idx in read_mappings('./data/de2id.txt').items())
num_deLabel = len(de2id)

vocab = len(voc2id)
corpus_size = np.sum(list(id2freq.values()))
# Frequencies listed in id order.
voc2freq = [id2freq[idx] for idx in range(len(voc2id))]

# Without context prediction the window size passed to the batch generator is 0.
if not p.context:
    p.win_size = 0

# ---- Native batch generator ---- #

# Load the compiled C++ batching code and run its initialisation routine.
lib = ctypes.cdll.LoadLibrary('./batchGen.so')
lib.init()

# Flat int32 buffers handed to the native code when a batch is requested.
edges = np.zeros(p.max_dep_len * p.batch_size * 3, np.int32)
wrds = np.zeros(p.max_sent_len * p.batch_size, np.int32)
samp = np.zeros(p.max_sent_len * p.batch_size, np.int32)
negs = np.zeros(p.max_sent_len * p.num_neg * p.batch_size, np.int32)
wlen = np.zeros(p.batch_size, np.int32)
elen = np.zeros(p.batch_size, np.int32)

# C `int*` views of the buffers above.
_INT_PTR = ctypes.POINTER(ctypes.c_int)
edges_addr = edges.ctypes.data_as(_INT_PTR)
wrds_addr = wrds.ctypes.data_as(_INT_PTR)
negs_addr = negs.ctypes.data_as(_INT_PTR)
samp_addr = samp.ctypes.data_as(_INT_PTR)
wlen_addr = wlen.ctypes.data_as(_INT_PTR)
elen_addr = elen.ctypes.data_as(_INT_PTR)



In [5]:
def get_batch():
    """Generator over the batches produced by the native batch generator.

    Resets the generator, then repeatedly calls `lib.getBatch`. Iteration ends
    as soon as `getBatch` returns 1; otherwise a `(batch, 0)` pair is yielded,
    where `batch` maps 'edges', 'wrds', 'elen' and 'wlen' to the module-level
    buffers. The same array objects are yielded every time, so a batch has to
    be consumed (or copied) before the next one is requested.
    """
    lib.reset()
    while True:
        epoch_over = lib.getBatch(edges_addr, wrds_addr, negs_addr, samp_addr, elen_addr, wlen_addr,
                                  p.win_size, 0, p.batch_size, ctypes.c_float(0))
        if epoch_over == 1:
            return
        yield ({'edges': edges, 'wrds': wrds, 'elen': elen, 'wlen': wlen}, 0)


In [6]:
# Fetch one batch for interactive inspection: reset the generator, request a
# single batch and keep it in `batch` unless the generator reports 1.
lib.reset()
eph_ovr = lib.getBatch(edges_addr, wrds_addr, negs_addr, samp_addr, elen_addr, wlen_addr,
                       p.win_size, 0, p.batch_size, ctypes.c_float(0))
if eph_ovr != 1:
    batch = {'edges': edges, 'wrds': wrds, 'elen': elen, 'wlen': wlen}

In [7]:
def get_adj(batch, seq_len):
    """
    Build the dense, per-label adjacency tensor of a batch.

    Parameters
    ----------
    batch: Dict with the flat 'edges' buffer (three ints per edge) and 'elen',
           the number of edges of every sentence
    seq_len: Size used for both node axes of the result

    Returns
    -------
    Dense array of shape [num_deLabel, p.batch_size, seq_len, seq_len] that
    holds 1.0 at every (label, sentence, first, second) position listed in the
    batch. The third int of every edge triple is used as the label index.
    """
    edge_counts = batch['elen']
    total_edges = np.sum(edge_counts)

    # One column holding, for every edge, the index of the sentence it belongs to.
    sent_col = np.repeat(np.arange(p.batch_size), edge_counts)[:, np.newaxis]
    # Only the first `total_edges` triples of the buffer are valid.
    triples = np.reshape(batch['edges'], [-1, 3])[:total_edges]

    # (sentence, first, second, label) -> (label, sentence, first, second)
    coords = np.concatenate([sent_col, triples], axis=1)[:, [3, 0, 1, 2]]
    # Every edge gets weight 1.
    weights = np.ones(total_edges, dtype=np.float32)

    sparse_adj = COO(coords.T, weights,
                     shape=(num_deLabel, p.batch_size, seq_len, seq_len))
    return sparse_adj.todense()

In [8]:
def pad_data(data, dlen, sub_sample=None):

    """
    Pads a given batch

    Parameters
    ----------
    data: Flat sequence holding the tokens of all sentences back to back
    dlen: Number of tokens of each sentence, in the order they appear in `data`
    sub_sample: Optional flat sequence aligned with `data`. When given and
                non-empty, the mask of every real token is multiplied by the
                matching entry. `None` (default) or an empty sequence leaves
                the mask untouched.

    Returns
    -------
    data_pad: int32 array [len(dlen), max_len] with the sentences, zero padded
    data_mask: float32 array of the same shape, 0 on padded positions
    max_len: Maximum length of sentence in the batch (0 for an empty batch)
    """

    num_sents = len(dlen)
    if num_sents == 0:
        # np.max raises on an empty sequence; an empty batch pads to nothing.
        return (np.zeros([0, 0], dtype=np.int32),
                np.zeros([0, 0], dtype=np.float32),
                0)

    # Decide once whether the mask has to be scaled (None replaces the former
    # mutable `[]` default and behaves the same way).
    use_sub_sample = sub_sample is not None and len(sub_sample) != 0

    max_len   = np.max(dlen)
    data_pad  = np.zeros([num_sents, max_len], dtype=np.int32)
    data_mask = np.zeros([num_sents, max_len], dtype=np.float32)

    offset = 0
    for i in range(num_sents):
        length = dlen[i]
        data_pad [i, :length] = data[offset: offset + length]
        data_mask[i, :length] = 1
        if use_sub_sample:
            data_mask[i, :length] *= sub_sample[offset: offset + length]
        offset += length

    return data_pad, data_mask, max_len

In [9]:
class GCN(Layer):
    """Label-wise graph convolution over a batch of sentence graphs.

    For every label the layer owns two Dense projections ("in" and "out") and
    two scalar sigmoid gates. `call` sums, over all labels, the projected node
    features propagated along the transposed adjacency ("in") and along the
    adjacency itself ("out").

    Parameters
    ----------
    out_dim: Width of every Dense projection, i.e. the output feature size
    n_labels: Number of labels (size of the first axis of the adjacency)
    batch_size: Kept for backward compatibility and stored on the layer; the
                output is sized from the adjacency tensor at call time
    gating: When True, projections are scaled by the per-node sigmoid gates
    reg: Kernel regularizer applied to the projections (not to the gates)
    name: Layer name
    """

    def __init__(self, out_dim, n_labels, batch_size=128, gating=False, reg=None, name=None):

        super(GCN, self).__init__(name=name)

        self.out_dim = out_dim
        self.n_labels = n_labels
        self.batch_size = batch_size
        self.gating = gating
        self.reg = reg

        self.in_layers = list()
        self.out_layers = list()
        self.in_gates = list()
        self.out_gates = list()

        for i in range(self.n_labels):
            self.in_layers.append(Dense(self.out_dim, kernel_regularizer=self.reg, name=f"in_{i}"))
            self.out_layers.append(Dense(self.out_dim, kernel_regularizer=self.reg, name=f"out_{i}"))
            self.in_gates.append(Dense(1, activation="sigmoid", name=f"in_gate_{i}"))
            self.out_gates.append(Dense(1, activation="sigmoid", name=f"out_gate_{i}"))


    def call(self, gcn_input, adj_mat):
        """Apply the convolution.

        Parameters
        ----------
        gcn_input: Node features, one row of features per node and sentence
        adj_mat: Adjacency of rank 4 indexed as [label, sentence, node, node]

        Returns
        -------
        Tensor [sentences, nodes, out_dim] with the summed messages.
        """

        adj_mat = tf.cast(adj_mat, dtype=tf.float32)

        # Size the accumulator from the adjacency itself instead of the
        # batch size given at construction time, so that a batch of any size
        # can be processed.
        adj_shape = tf.shape(adj_mat)
        out = tf.zeros(tf.stack([adj_shape[1], adj_shape[3], self.out_dim]))

        for i in range(self.n_labels):
            # Messages along the transposed adjacency.
            xW_in = self.in_layers[i](gcn_input)
            A_in = tf.transpose(adj_mat[i], [0, 2, 1])
            if self.gating:
                xW_in = xW_in * self.in_gates[i](gcn_input)
            h_in = tf.matmul(A_in, xW_in)

            # Messages along the adjacency as given.
            xW_out = self.out_layers[i](gcn_input)
            A_out = adj_mat[i]
            if self.gating:
                xW_out = xW_out * self.out_gates[i](gcn_input)
            h_out = tf.matmul(A_out, xW_out)

            # Running sum over labels.
            out += (h_in + h_out)

        return out
        
     
        
        


In [10]:
class L2MeanRegularizer(tf.keras.regularizers.Regularizer):
    """Regularizer whose penalty is `l2` times the mean squared entry of a tensor."""

    def __init__(self, l2=0.01):
        # Scale applied to the mean of the squared entries.
        self.l2 = l2

    def __call__(self, x):
        """Return `l2 * mean(x ** 2)` for the tensor `x`."""
        mean_square = tf.reduce_mean(tf.square(x))
        return self.l2 * mean_square

    def get_config(self):
        """Return the constructor argument as a plain dict."""
        return dict(l2=float(self.l2))


In [11]:
class SemRegularizer(Regularizer):
    """Regularizer built from word-pair relation files.

    Five files (`synonyms`, `antonyms`, `hypernyms`, `hyponyms`, `meronyms`,
    each `<sem_info_path>/<name>.txt`) are read at construction time. Every
    line is split on whitespace, words missing from the vocabulary are
    dropped, and every 2-combination of the remaining word ids becomes a pair.

    On every call a random subset of pairs is drawn per relation (sized
    proportionally to the relation's share of all pairs) and the penalty

        gamma * (syn + ant + hyper + hypo + mer) / sample_size

    is returned, where each term is `n + s` (`n - s` for antonyms) and `s` is
    the sum of `tf.keras.losses.cosine_similarity` over the sampled pairs.
    That Keras function returns the *negative* cosine similarity, so `n + s`
    is the summed cosine distance of the pairs.

    Parameters
    ----------
    sem_info_path: Directory containing the five relation files
    voc2id_path: Tab separated `word<TAB>id` file
    gamma: Weight of the penalty
    sample_size: Total number of pairs drawn per call; capped at the number
                 of available pairs
    """

    def __init__(self, sem_info_path, voc2id_path,
                 gamma=0.1, sample_size=1000):

        self.sem_info_path = sem_info_path
        self.voc2id_path = voc2id_path
        self.gamma = gamma
        self.sample_size = sample_size

        self.voc2id = dict()
        self._load_voc2id()
        self.syn = self._load_file("synonyms")
        self.ant = self._load_file("antonyms")
        self.hyper = self._load_file("hypernyms")
        self.hypo = self._load_file("hyponyms")
        self.mer = self._load_file("meronyms")

        self.n_samples = len(self.syn) + len(self.ant) + len(self.hyper) + len(self.hypo) + len(self.mer)
        if self.sample_size > self.n_samples:
            self.sample_size = self.n_samples
        self.n_syn = self._quota(self.syn)
        self.n_ant = self._quota(self.ant)
        self.n_hyper = self._quota(self.hyper)
        self.n_hypo = self._quota(self.hypo)
        self.n_mer = self._quota(self.mer)

    def _quota(self, pairs):
        """Number of pairs to draw from `pairs`, proportional to its share of all pairs."""
        if self.n_samples == 0:
            # Nothing was loaded at all; avoids a division by zero.
            return 0
        return int(len(pairs) / self.n_samples * self.sample_size)

    def _load_voc2id(self):
        """Fill `self.voc2id` from the tab separated vocabulary file."""
        with open(self.voc2id_path, 'r') as file:
            for line in file:
                x = line.strip('\n').split('\t')
                if len(x) < 2:
                    # Blank or malformed line: nothing to register.
                    continue
                self.voc2id[x[0]] = int(x[1])

    def _load_file(self, name):
        """Return an array [n_pairs, 2] with the id pairs of relation `name`."""
        nym = list()
        with open(f"{self.sem_info_path}/{name}.txt", 'r') as file:
            for line in file:
                # Out-of-vocabulary words are skipped on purpose.
                inds = [self.voc2id[w] for w in line.strip('\n').split() if w in self.voc2id]
                nym.extend(itertools.combinations(inds, 2))
        # reshape keeps the array two-dimensional even when no pair was found.
        return np.asarray(nym).reshape(-1, 2)

    def _relation_term(self, x, pairs, n_pairs, attract):
        """Penalty term of one relation for the embedding matrix `x`.

        Draws `n_pairs` random rows of `pairs` (with replacement) and returns
        `n_pairs + s` when `attract` is True and `n_pairs - s` otherwise, with
        `s` the summed (negative) Keras cosine similarity of the drawn pairs.
        An empty relation contributes 0.
        """
        if len(pairs) == 0:
            # np.random.randint(0, 0, ...) would raise.
            return 0.0
        ind = np.random.randint(0, len(pairs), size=(n_pairs,))
        y = tf.gather(x, pairs[ind], axis=0)
        sim_sum = tf.reduce_sum(cosine_similarity(y[:, 0], y[:, 1]))
        if attract:
            return n_pairs + sim_sum
        return n_pairs - sim_sum

    def __call__(self, x):
        """Return the weighted penalty for the embedding matrix `x`."""
        if self.sample_size == 0:
            # No pairs available (or none requested): no penalty.
            return tf.zeros([], dtype=x.dtype)

        # The relations are evaluated in a fixed order so that the sequence of
        # random draws is the same from run to run for a given seed.
        syn_sim = self._relation_term(x, self.syn, self.n_syn, attract=True)
        ant_sim = self._relation_term(x, self.ant, self.n_ant, attract=False)
        hyper_sim = self._relation_term(x, self.hyper, self.n_hyper, attract=True)
        hypo_sim = self._relation_term(x, self.hypo, self.n_hypo, attract=True)
        mer_sim = self._relation_term(x, self.mer, self.n_mer, attract=True)

        total = (syn_sim + ant_sim + hyper_sim + hypo_sim + mer_sim) / self.sample_size

        return self.gamma * total

    def get_config(self):
        """Return every constructor argument so the object can be rebuilt."""
        return {'sem_info_path': self.sem_info_path,
                'voc2id_path': self.voc2id_path,
                'gamma': float(self.gamma),
                'sample_size': int(self.sample_size)}

In [12]:
class WordEmbedding(Layer):
    """Trainable embedding table with optional initialisation from a text file.

    Parameters
    ----------
    vocab: Number of rows of the table
    dims: Number of columns of the table
    init_file: Optional space separated file, one `word v1 v2 ...` row per
               line, used to initialise the table
    id2voc: Mapping from row index to word; required for `init_file` to be used
    kernel_regularizer: Regularizer attached to the table
    name: Layer name
    """

    def __init__(self, vocab, dims, init_file=None, id2voc=None, kernel_regularizer=None, name=None):

        super(WordEmbedding, self).__init__(name=name)
        self.vocab = vocab
        self.dims = dims
        self.id2voc = id2voc
        self.init_file = init_file

        self.w = self.add_weight(shape=(self.vocab, self.dims),
                                 initializer="glorot_uniform",
                                 trainable=True,
                                 regularizer=kernel_regularizer)

        if self.init_file is not None and self.id2voc is not None:
            self.w.assign(self._load_pretrained())

    def _load_pretrained(self):
        """Return a float32 matrix with the rows of `init_file` ordered by word id.

        A word that is missing from the file gets the vector of "UNK"; if the
        file has no "UNK" row either, the row keeps its random initial value.
        """
        # dtype/na_filter/quoting keep tokens such as "null", "nan", "123" or
        # a lone quote character as literal words (quoting=3 is csv.QUOTE_NONE).
        table = pd.read_csv(self.init_file, sep=' ', header=None,
                            dtype={0: str}, na_filter=False, quoting=3)
        words = table.iloc[:, 0].tolist()
        # Explicit float32: the mixed word/number table is otherwise object typed.
        vectors = np.asarray(table.iloc[:, 1:], dtype=np.float32)

        word2row = {word: row for row, word in enumerate(words)}
        unk_row = word2row.get("UNK")

        weights = self.w.numpy()
        for i in range(len(self.id2voc)):
            row = word2row.get(self.id2voc[i], unk_row)
            if row is not None:
                weights[i] = vectors[row]
        return weights

    def call(self, inputs):
        """Look up the rows of the table selected by the integer tensor `inputs`."""
        return tf.nn.embedding_lookup(self.w, inputs)

In [13]:
class SynGCN(Model):
    """Word-embedding model trained with a graph convolution over sentence graphs.

    The forward pass embeds the words of a batch, runs them through a `GCN`
    layer and scores, for every position, the true word plus `num_neg` sampled
    words against a second ("context") embedding table. The model output is
    the binary cross-entropy of those scores; `train` minimises it together
    with the regularisation losses.

    Parameters
    ----------
    vocab: Vocabulary size
    dim: Embedding size
    num_deLabel: Number of labels handled by the GCN layer
    id2voc: Mapping from word id to word
    voc2freq: Word frequencies in id order, used to sample negatives
    sem_info_path, voc2id_path: Passed to `SemRegularizer`
    batch_size: Number of sentences per batch; must match the batches fed in
    emb_init_file: Optional file used to initialise the word embeddings
    gcn_gating: Whether the GCN layer uses its gates
    gamma, reg_sample_size: Weight / sample size of `SemRegularizer`
    num_neg: Negative samples per position
    name: Base name of the embeddings file written by `checkpoint`
    log_dir, config_dir: Passed to `get_logger`
    """

    def __init__(self, vocab, dim, num_deLabel, id2voc, voc2freq, sem_info_path, voc2id_path,
                 batch_size=128, emb_init_file=None, gcn_gating=True,
                 gamma=0.1, reg_sample_size=1000, num_neg=100, name=None,
                 log_dir="./log/", config_dir="./config/"):

        super(SynGCN, self).__init__()

        self.vocab = vocab
        self.dim = dim
        self.sem_info_path = sem_info_path
        self.voc2id_path = voc2id_path
        self.emb_init_file = emb_init_file
        self.batch_size = batch_size
        self.gamma = gamma
        self.reg_sample_size = reg_sample_size
        self.num_deLabel = num_deLabel
        self.gcn_gating=gcn_gating
        self.voc2freq = voc2freq
        self.num_neg = num_neg
        self.id2voc = id2voc
        self.best_results = 0
        self.best_int_avg = 0
        self.output_name = name
        self.log_dir = log_dir
        self.config_dir = config_dir

        logger = get_logger(self.output_name, self.log_dir, self.config_dir)
        logger.setLevel("ERROR")

        self.l2_reg = L2MeanRegularizer(l2=0.1)
        self.sem_reg = SemRegularizer(self.sem_info_path, self.voc2id_path,
                                      gamma=self.gamma, sample_size=self.reg_sample_size)
        self.embeddings = WordEmbedding(self.vocab, self.dim, init_file=self.emb_init_file,
                                        id2voc=self.id2voc, kernel_regularizer=self.sem_reg,
                                        name="word_embeddings")
        self.gcn = GCN(self.dim, self.num_deLabel, batch_size=self.batch_size, gating=self.gcn_gating,
                      reg=self.l2_reg)
        self.relu = ReLU()
        self.context = WordEmbedding(self.vocab, self.dim, init_file=None, id2voc=None,
                                 name="context_embeddings", kernel_regularizer=self.l2_reg)
        self.bin_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    def call(self, batch):
        """Return the negative-sampling loss of one batch.

        `batch` is a dict with the flat 'wrds' / 'edges' buffers and the
        per-sentence 'wlen' / 'elen' counts, as produced by `get_batch`.
        """

        # -- Preprocess batch data -- #
        # Pad sentences and build the padding mask.
        words_pad, words_mask, seq_len = pad_data(batch['wrds'], batch['wlen'])
        # Gold label indices.
        target_words = tf.cast(tf.reshape(words_pad, [-1, 1]), tf.int64)
        # Get adjacency matrix.
        adj_mat = get_adj(batch, seq_len)
        # Draw num_neg * batch_size distinct negatives; every sentence gets its
        # own num_neg ids, shared by all positions of that sentence.
        neg_ids, _, _ = tf.nn.fixed_unigram_candidate_sampler(true_classes=target_words, num_true=1,
                                                              num_sampled=self.num_neg * self.batch_size,
                                                              unique=True, distortion=0.75,
                                                              range_max=self.vocab, unigrams=self.voc2freq)
        neg_ids = tf.cast(neg_ids, dtype=tf.int32)
        neg_ids = tf.reshape(neg_ids, [self.batch_size, self.num_neg])
        neg_ids = tf.reshape(tf.tile(neg_ids, [1, seq_len]), [self.batch_size, seq_len, self.num_neg])
        # Candidate 0 is the true word, candidates 1.. are the negatives.
        target_ind = tf.concat([tf.expand_dims(words_pad, axis=2), neg_ids], axis=2)
        # Label 1 for the true word, 0 for negatives and for padded positions.
        target_labels = tf.concat([tf.ones([self.batch_size, seq_len, 1], dtype=tf.float32),
                                   tf.zeros([self.batch_size, seq_len, self.num_neg], dtype=tf.float32)],
                                  axis=2) * tf.expand_dims(words_mask, -1)

        # -- Run model -- #
        gcn_out = self.gcn(self.embeddings(words_pad), adj_mat)
        target_embed = self.context(target_ind)
        # Dot product of every position with each of its candidates; logits of
        # padded positions are forced to 0.
        pred = tf.reduce_sum(tf.expand_dims(gcn_out, axis=2) *
                             target_embed, axis=3) * tf.expand_dims(words_mask, -1)
        # NOTE(review): padded positions still enter the mean below with
        # label 0 and logit 0 - confirm that this is intended.
        loss = self.bin_loss(tf.reshape(target_labels, -1), tf.reshape(pred, -1))

        return loss

    def checkpoint(self):
        """Evaluate the current word embeddings and save them if they are the best so far.

        The L2-normalised rows of the word-embedding table are evaluated with
        `evaluate_on_all`. When the mean score is at least as high as the best
        one seen, the vectors are written to
        `<cwd>/embeddings/<name>.txt` (one `word v1 v2 ...` line per word).

        Returns the dict of rounded scores.
        """
        embed_matrix = tf.math.l2_normalize(self.embeddings.weights[0], axis=1)
        words = [self.id2voc[i] for i in range(len(self.id2voc))]
        voc2vec = dict(zip(words, iter(embed_matrix.numpy())))
        embedding = Embedding.from_dict(voc2vec)
        results = evaluate_on_all(embedding)
        results = {key: round(val[0], 4) for key, val in results.items()}
        curr_int = np.mean(list(results.values()))

        if curr_int >= self.best_int_avg:
            with open(f"{os.getcwd()}/embeddings/{self.output_name}.txt", 'w') as file:
                for key, values in voc2vec.items():
                    file.write(key)
                    for v in values:
                        file.write(f" {v}")
                    file.write('\n')
            self.best_results = results
            self.best_int_avg = curr_int
        return results


    def train(self, epochs):
        """Train for `epochs` passes over the batches of `get_batch`.

        The embeddings are evaluated (and possibly saved) every 100 steps and
        at the end of every epoch.
        """

        self.best_int_avg = 0
        # Created once so that Adam's moment estimates carry over from one
        # epoch to the next instead of being reset.
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

        for epoch in range(epochs):
            start_time = time.time()
            num_batches = get_corpus_len() // self.batch_size
            print(f"\nEpoch {epoch + 1}/{epochs}:\n")

            # Iterate over the batches of the dataset.
            for step, (X, y) in enumerate(get_batch()):
                # Checkpoint every 100 batches.
                if (step) % 100 == 0:
                    results = self.checkpoint()
                    print(results)

                with tf.GradientTape() as tape:
                    # Use this instance, not whatever the global `model` is bound to.
                    loss = self(X)
                    loss += tf.reduce_sum(self.losses)

                grads = tape.gradient(loss, self.trainable_weights)
                optimizer.apply_gradients(zip(grads, self.trainable_weights))
                print(f"\rStep: {step + 1}/{num_batches}; Elapsed time: {time.time() - start_time:.2f}; \
                      Training loss: {loss:.4f}", end='\r') 

            # At end of each epoch.
            self.checkpoint()


In [None]:
# Locations of the relation files, the vocabulary file and the initial embeddings.
sem_info_path = f"{os.getcwd()}/semantic_info"
voc2id_path = f"{os.getcwd()}/data/voc2id.txt"
emb_init_file = f"{os.getcwd()}/embeddings/init_syngcn_emb.txt"
# batch_size is taken from `p` because the batch buffers are sized with
# p.batch_size and the model reshapes its inputs with its own batch_size;
# the two values have to agree. gamma, reg_sample_size and num_neg are
# set explicitly for this run.
model = SynGCN(vocab, p.embed_dim, num_deLabel, id2voc, voc2freq, sem_info_path, voc2id_path,
                 batch_size=p.batch_size, emb_init_file=emb_init_file, gcn_gating=True,
                 gamma=0.0, reg_sample_size=100000, num_neg=200, name="syngcn_emb")

In [None]:
import warnings
# Install a catch-all "ignore" filter: no warning is shown from here on.
warnings.filterwarnings("ignore")

In [None]:
# Train the model for a single epoch.
model.train(1)