In [2]:
import os
import numpy as np
import itertools
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer, Embedding, Conv2D, BatchNormalization, Lambda
from tensorflow.keras.layers import Bidirectional, LSTM, Dense
from tensorflow.keras.regularizers import Regularizer
from tensorflow.keras.metrics import CosineSimilarity
import pandas as pd
from tensorflow.python.keras import backend as K

In [3]:
# Input locations, all resolved against the current working directory.
voc2id_path = os.getcwd() + "/data/voc2id.txt"
sem_path = os.getcwd() + "/semantic_info"
# NOTE(review): this points at antonyms.txt although the variable (and the
# `synonyms` array built from it) is named after synonyms -- confirm the file.
syn_path = sem_path + "/antonyms.txt"

In [24]:
# Sanity check over data.txt: print every token id that is larger than 149999.
# NOTE(review): 149999 is presumably the last valid row of the 150000-row
# embedding table created further down -- confirm and share one constant.
# The loop variables (`line`, `x`, `y`, `i`) stay bound after this cell and
# later cells read `y` and `line`, so they must keep these names.
with open(f"{os.getcwd()}/data/data.txt", 'r') as file:
    for line in file:
        # Fields are separated by single spaces.
        x = line.strip('\n').split(' ')
        # x[0] is used as the number of ids; the ids themselves start at x[2].
        y = x[2: int(x[0]) + 2]
        for i in y:
            # Ids are still strings here, so convert before comparing.
            if int(i) > 149999:
                print(i)


In [15]:
line

'15 14 24351 24351 10 7 436 2083 26 8385 121958 4986 215 13 6932 2293 2 1|0|26 5|1|11 5|2|23 5|3|34 5|4|7 7|6|11 5|7|9 9|8|7 7|9|38 9|10|13 13|11|2 13|12|7 10|13|16 5|14|10\n'

In [19]:
# Reverse vocabulary: id -> token, read from the tab-separated voc2id file.
# Ids are deliberately kept as the raw strings found in the file.
id2voc = {}
with open(voc2id_path, 'r') as vocab_file:
    for record in vocab_file:
        fields = record.strip('\n').split('\t')
        token, token_id = fields[0], fields[1]
        id2voc[token_id] = token
        

In [20]:
# Forward vocabulary: token -> integer id, read from the tab-separated voc2id file.
voc2id = {}
with open(voc2id_path, 'r') as vocab_file:
    for record in vocab_file:
        fields = record.strip('\n').split('\t')
        token_id = int(fields[1])
        voc2id[fields[0]] = token_id
        

In [21]:
# Decode the ids currently held in `y` back to tokens, one per line.
for token_id in y:
    token = id2voc[token_id]
    print(token)

anarchism
anarchism
is
a
political
philosophy
that
advocates
self-governed
societies
based
on
voluntary
institutions
.


In [65]:
# Build every unordered pair of vocabulary ids that share a line of syn_path.
synonyms = []
with open(syn_path, 'r') as relation_file:
    for entry in relation_file:
        # Words that are missing from the vocabulary are silently skipped.
        known_ids = [voc2id[word]
                     for word in entry.strip('\n').split()
                     if word in voc2id]
        synonyms.extend(itertools.combinations(known_ids, 2))
synonyms = np.asarray(synonyms)


In [66]:
synonyms

array([[62598, 23042],
       [16438, 23325],
       [12452,  1281],
       ...,
       [ 4169, 15863],
       [ 2514, 72354],
       [ 1394,   270]])

In [5]:
class SemanticRegularizer(Regularizer):
    """Regularizer whose penalty is ``gamma`` times the sum of all entries of the input.

    Args:
        gamma: scale applied to the summed entries (default 0.1).
    """

    def __init__(self, gamma=0.1):
        self.gamma = gamma

    def __call__(self, x):
        """Return the scalar penalty for tensor ``x``."""
        total = tf.math.reduce_sum(x)
        return self.gamma * total

    def get_config(self):
        """Return the constructor arguments as a plain dict."""
        return dict(gamma=float(self.gamma))

In [6]:
class Embedding(Layer):
    """Trainable embedding table with optional initialisation from a text file.

    Note: this class shadows the ``Embedding`` layer imported from
    ``tensorflow.keras.layers`` at the top of the notebook.

    Args:
        vocab: number of rows of the table.
        dims: number of columns of the table (embedding width).
        init_file: optional path to a space-separated file without a header
            row. The first column of every row is dropped (presumably the
            token -- confirm) and the remaining columns become the initial
            values of the table.

    Raises:
        ValueError: if the matrix loaded from ``init_file`` does not have
            shape ``(vocab, dims)``.
    """

    def __init__(self, vocab, dims, init_file=None):
        super(Embedding, self).__init__()
        self.vocab = vocab
        self.dims = dims
        self.init_file = init_file

        # The whole table is regularised through SemanticRegularizer.
        self.w = self.add_weight(shape=(self.vocab, self.dims),
                                 initializer="glorot_uniform",
                                 trainable=True, regularizer=SemanticRegularizer())

        if self.init_file is not None:
            pretrained = np.asarray(
                pd.read_csv(self.init_file, sep=' ', header=None).iloc[:, 1:],
                dtype="float32")
            # Fail early with a readable message instead of a low-level
            # shape error coming out of the assignment.
            if pretrained.shape != (self.vocab, self.dims):
                raise ValueError(
                    f"init_file provides a matrix of shape {pretrained.shape}, "
                    f"expected {(self.vocab, self.dims)}")
            # Assign the array directly; wrapping it in a second trainable
            # tf.Variable only duplicated the full table in memory.
            self.w.assign(pretrained)

    def call(self, inputs):
        """Look up the rows of the table selected by the integer ids in ``inputs``."""
        return tf.nn.embedding_lookup(self.w, inputs)

In [44]:
# Instantiate the custom embedding table and load its initial values from disk.
emb = Embedding(
    vocab=150000,
    dims=300,
    init_file=f"{os.getcwd()}/embeddings/init_rand_emb.txt",
)

In [45]:
# Draw 1000 random row positions into `synonyms` (with replacement, unseeded).
inds = np.random.randint(low=0, high=len(synonyms), size=(1000,))

In [46]:
# Fetch the embedding rows of both members of every sampled pair.
# Note: this rebinds `y`, which an earlier cell used for a list of token ids.
y = tf.gather(params=emb.w, indices=synonyms[inds], axis=0)

In [47]:
# Keras' cosine_similarity loss is the negated cosine similarity, so negating
# its mean yields the average cosine similarity of the sampled pairs.
sim = tf.negative(
    tf.reduce_mean(
        tf.keras.losses.cosine_similarity(y_true=y[:, 0], y_pred=y[:, 1])
    )
)

In [48]:
sim

<tf.Tensor: shape=(), dtype=float32, numpy=0.0004963443>

TensorShape([1000, 300])