In [None]:
import gensim
from numpy import linalg
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from collections import defaultdict
import random
import os

from gensim.scripts.glove2word2vec import glove2word2vec


# Load the embeddings and normalize them

In [None]:
# One-off conversion of the raw GloVe text file into the word2vec text file that
# the next cell loads (needed for GloVe embeddings only). `model_glove` is rebound
# to the loaded vectors in the next cell.
model_glove = glove2word2vec(
    glove_input_file='../data/embeddings/glove.6B/glove.6B.300d.txt',
    word2vec_output_file='gensim_glove_300d.txt',
)

In [None]:
# Load the converted GloVe vectors from the text-format file (needed for GloVe only).
model_glove = gensim.models.KeyedVectors.load_word2vec_format(
    fname="gensim_glove_300d.txt",
    binary=False,
)

In [None]:
def generate_norm_embedding(model, output_path):
    """Write a unit-length copy of every vector in ``model`` to ``output_path``.

    The file starts with a "<vocab size> <vector size>" header line. Each word
    then follows as: the encoded word, a space, the raw float32 bytes of its
    normalised vector, and a newline. This is the layout the notebook reads
    back with ``load_word2vec_format(..., binary=True)``.

    Args:
        model: object exposing ``vocab`` (sized iterable of words),
            ``vector_size`` and ``model[word]`` vector lookup.
        output_path: destination file; overwritten if it already exists.
    """
    # `with` closes the handle even when a lookup or write raises.
    with open(output_path, 'wb') as out_file:
        out_file.write(str.encode(str(len(model.vocab)) + ' ' + str(model.vector_size) + '\n'))

        for each_word in tqdm(model.vocab):
            vector = np.asarray(model[each_word])  # single lookup per word
            norm = linalg.norm(vector)
            # An all-zero vector has no direction: keep it as zeros rather than
            # writing the NaNs that 0/0 would produce.
            unit_vector = vector / norm if norm > 0 else vector

            out_file.write(str.encode(each_word + ' '))
            # Force float32 so every record is exactly 4 * vector_size bytes,
            # whatever dtype the model hands back.
            out_file.write(np.asarray(unit_vector, dtype=np.float32).tobytes())
            out_file.write(str.encode('\n'))

In [None]:
# Write the unit-normalised GloVe vectors to disk.
generate_norm_embedding(
    model=model_glove,
    output_path='glove_norm_300.mod',
)

### Load the required word-embeddings

In [None]:
# Read the normalised vectors back from the binary-format file.
model_gn = gensim.models.KeyedVectors.load_word2vec_format(
    fname='glove_norm_300.mod',
    binary=True,
)

In [None]:
# Alias for the embedding model that the cells below operate on.
current_model = model_gn 

# Loading the antonyms

In [None]:
# Dataset files that list word relations; only rows tagged 'antonym' are used.
ANTONYM_FILES = [
    'Antonym_sets/LenciBenotto.val',
    'Antonym_sets/LenciBenotto.test',
    'Antonym_sets/EVALution.val',
    'Antonym_sets/EVALution.test',
]


def _read_antonym_pairs(dataset_path, model):
    """Return the antonym pairs of one dataset file whose words are both in ``model``.

    Every line is split on whitespace. A line is used when it has at least
    four fields and the fourth is 'antonym'. The first two fields are the
    words; anything from the first '-' onwards is dropped, and the remainder
    is stripped and lowercased.

    Args:
        dataset_path: path of the relation file to read.
        model: object supporting ``word in model`` membership tests.

    Returns:
        List of ``(word1, word2)`` tuples in file order (may contain duplicates).
    """
    pairs = []
    with open(dataset_path) as fp:
        for line in fp:
            parts = line.split()
            # Skip blank/truncated lines instead of raising IndexError on parts[3].
            if len(parts) < 4 or parts[3] != 'antonym':
                continue
            # Normalise BEFORE the vocabulary check, so the word that is tested
            # is exactly the word that is stored and later looked up.
            word1 = parts[0].split('-')[0].strip().lower()
            word2 = parts[1].split('-')[0].strip().lower()
            if word1 in model and word2 in model:
                pairs.append((word1, word2))
    return pairs


list_antonym = []
for antonym_file in ANTONYM_FILES:
    list_antonym.extend(_read_antonym_pairs(antonym_file, current_model))

# dict.fromkeys drops duplicate pairs while keeping first-seen order.
list_antonym = list(dict.fromkeys(list_antonym))

In [None]:
# Group every antonym pair under its alphabetically smaller word.
similarity_matrix = defaultdict(list)
for first_word, second_word in tqdm(list_antonym):
    if first_word < second_word:
        key_word, partner_word = first_word, second_word
    else:
        key_word, partner_word = second_word, first_word
    similarity_matrix[key_word].append(partner_word)

# Absolute cosine similarity between each key word and each of its partners.
all_similarity = defaultdict(dict)
for key_word in tqdm(similarity_matrix):
    key_vector = [current_model[key_word]]
    for partner_word in similarity_matrix[key_word]:
        pair_similarity = cosine_similarity(key_vector, [current_model[partner_word]])[0][0]
        all_similarity[key_word][partner_word] = abs(pair_similarity)

# For every key word keep only the partner with the lowest absolute similarity.
final_antonym_list = []
for key_word in tqdm(all_similarity):
    ranked_partners = sorted(all_similarity[key_word].items(), key=lambda item: item[1])
    least_similar_partner = ranked_partners[0][0]
    final_antonym_list.append((key_word, least_similar_partner))
print(len(final_antonym_list))

list_antonym = final_antonym_list

## Decide on the size of the antonym vector

In [None]:
# NOTE(review): within the cells shown, this value is reassigned (to 1443) in the
# ORTHOGONAL DIMENSION cell before anything reads it.
num_antonym = 1468

In [None]:
## Build one difference vector per antonym pair: vector(first word) - vector(second word)
antonymy_vector = np.array([
    current_model[first_word] - current_model[second_word]
    for first_word, second_word in list_antonym
])
print(antonymy_vector.shape)

## Subset Dimension Selection Method

In [None]:
import random

from scipy.spatial.distance import cosine as scipy_cosine
random.seed(42)

# Pairwise |cosine similarity| between all antonym difference vectors, stored as
# dimension_similarity_matrix[i][j] for row indices i and j.
# scipy_cosine returns a distance, hence the `1 - ...`.
t1 = np.array(antonymy_vector)
dimension_similarity_matrix = defaultdict(dict)
for row_index, row_vector in enumerate(tqdm(t1)):
    dimension_similarity_matrix[row_index] = {
        col_index: abs(1 - scipy_cosine(row_vector, col_vector))
        for col_index, col_vector in enumerate(t1)
    }
        
        
def get_set_score(final_list, each_dim):
    """Mean precomputed similarity between ``each_dim`` and the members of ``final_list``.

    Looks every pair up in the module-level ``dimension_similarity_matrix`` as
    ``dimension_similarity_matrix[member][each_dim]``.

    Args:
        final_list: indices already selected; must be non-empty.
        each_dim: candidate index to score against them.

    Returns:
        The average of the looked-up similarities.

    Raises:
        ZeroDivisionError: if ``final_list`` is empty.
    """
    similarity_total = sum(
        (dimension_similarity_matrix[member][each_dim] for member in final_list),
        0.0,
    )
    return similarity_total / len(final_list)
        
def select_subset_dimension(dim_vector, num_dim):
    """Greedily order row indices of ``dim_vector`` by dissimilarity.

    The first index is drawn with ``random.randrange``. Each further pick is
    the remaining index whose mean similarity to all already-picked indices
    is lowest; ties go to the lowest remaining index. Similarities are read
    from the module-level ``dimension_similarity_matrix[i][j]``.

    Args:
        dim_vector: sequence/array with one row per candidate; only its
            first-axis length is used here.
        num_dim: number of indices wanted. Values above the number of rows
            are capped at the number of rows.

    Returns:
        List of selected row indices in selection order. Empty when
        ``num_dim <= 0`` or ``dim_vector`` has no rows.
    """
    working_shape = np.asarray(dim_vector).shape
    print('working list is ready, shape', working_shape)
    available = working_shape[0]

    # Asking for more picks than there are rows would leave the search without
    # any candidate, so cap the request; nothing to do for empty requests.
    target = min(num_dim, available)
    if target <= 0:
        return []

    working_position_index = list(range(available))
    sel_dim = random.randrange(0, available)
    final_position_index = [sel_dim]
    working_position_index.remove(sel_dim)

    # score_sums[c] accumulates, in selection order, the similarity between
    # candidate c and every selected index. Dividing by the number selected
    # gives the same mean as re-summing from scratch, without the extra loop.
    score_sums = [0.0] * available
    newest_pick = sel_dim

    for _ in tqdm(range(target - 1)):
        newest_row = dimension_similarity_matrix[newest_pick]
        selected_count = len(final_position_index)

        min_dim = None
        min_score = float('inf')
        for each_dim in working_position_index:
            score_sums[each_dim] += newest_row[each_dim]
            temp_score = score_sums[each_dim] / selected_count
            if temp_score < min_score:
                min_score = temp_score
                min_dim = each_dim

        # Only reachable when every remaining score is NaN (NaN never compares
        # lower); fall back to the first remaining index so selection continues.
        if min_dim is None:
            min_dim = working_position_index[0]

        final_position_index.append(min_dim)
        working_position_index.remove(min_dim)
        newest_pick = min_dim

    return final_position_index

## Generate the ORTHOGONAL DIMENSION Order

In [None]:
# There cannot be more picks than antonym difference vectors, so cap the
# requested count at the number of rows actually available.
num_antonym = min(1443, len(antonymy_vector))
orthogonal_antonymy_vector = np.array(select_subset_dimension(antonymy_vector, num_antonym))
print(orthogonal_antonymy_vector.shape)

## Generate the RANDOM DIMENSION Order

In [None]:
# Random baseline: every antonym-vector index, shuffled with the `random` module's RNG.
random_antonym_vector = list(range(len(antonymy_vector)))
random.shuffle(random_antonym_vector)
print(len(random_antonym_vector))

## Generate the MAXIMUM VARIANCE DIMENSION Order

In [None]:
embedding_size = antonymy_vector.shape[0]
print('The embedding size is', embedding_size)

# Pseudo-inverse of the transposed antonym matrix; multiplying it with a word
# vector yields one value per antonym pair.
variance_antonymy_vector_inverse = np.linalg.pinv(antonymy_vector.T)

# Project every vocabulary word; rows are collected in vocabulary iteration order.
embedding_matrix = []
total_words = 0
for each_word in tqdm(current_model.vocab):
    total_words += 1
    new_vector = variance_antonymy_vector_inverse @ current_model[each_word]
    embedding_matrix.append(new_vector)


In [None]:
# Drop the loop variable left over from the projection loop in the previous cell.
del new_vector

In [None]:
embedding_matrix = np.array(embedding_matrix)

# Variance of each column of the projected matrix, taken over all rows (words).
variance_list = [
    np.var(embedding_matrix[:, each_dimension])
    for each_dimension in tqdm(range(embedding_matrix.shape[1]))
]

In [None]:
# Column indices ordered from highest to lowest variance.
variance_antonymy_vector = sorted(
    range(len(variance_list)),
    key=variance_list.__getitem__,
    reverse=True,
)

In [None]:
# Release the projected matrix; the module-level name is not read again in the cells shown.
del embedding_matrix

In [None]:
# The ordering derived from it is already stored in variance_antonymy_vector.
del variance_list

# Transformation to polar space

In [None]:
def transform_to_antonym_space(current_model, output_file_path, binary, current_antonymy_vector_inverse):
    """Project every word vector with the given matrix, unit-normalise it and save it.

    The output starts with a "<vocab size> <new dimension>" header line.
    In binary mode each word is written as: encoded word, space, raw float32
    vector bytes, newline. In text mode each word is one line: the word
    followed by its space-separated values.

    Args:
        current_model: object exposing ``vocab`` (sized iterable of words) and
            ``current_model[word]`` vector lookup.
        output_file_path: destination file; overwritten if it already exists.
        binary: True for the binary layout, False for the text layout.
        current_antonymy_vector_inverse: 2-D array applied to each word vector
            with ``np.matmul``; its first-axis length is the new dimension.
    """
    embedding_size = current_antonymy_vector_inverse.shape[0]
    vocab_size = len(current_model.vocab)
    print('New model size is', vocab_size, embedding_size)

    header = str(vocab_size) + ' ' + str(embedding_size) + '\n'

    if binary:
        out_file = open(output_file_path, 'wb')
    else:
        # UTF-8 so text output encodes words the same way the binary branch does.
        out_file = open(output_file_path, 'w', encoding='utf-8')

    # `with` closes the handle even when a lookup or write raises.
    with out_file:
        out_file.write(str.encode(header) if binary else header)

        for each_word in tqdm(current_model.vocab):
            new_vector = np.matmul(current_antonymy_vector_inverse, current_model[each_word])

            norm = linalg.norm(new_vector)
            # A zero projection has no direction: leave it as zeros instead of
            # dividing by zero and writing NaNs.
            if norm > 0:
                new_vector = new_vector / norm

            if binary:
                out_file.write(str.encode(each_word + ' '))
                # Force float32 so each record is exactly 4 * embedding_size
                # bytes regardless of the dtype of the projection matrix.
                out_file.write(np.asarray(new_vector, dtype=np.float32).tobytes())
                out_file.write(str.encode('\n'))
            else:
                # str(ndarray) would add brackets, wrap lines and abbreviate long
                # vectors with '...'; write every value explicitly instead.
                values = ' '.join(str(value) for value in new_vector)
                out_file.write(each_word + ' ' + values + '\n')



# Standard normal transform 

In [None]:
def standard_normal_dist_model(model, new_filename):
    """Standardise each embedding column to zero mean / unit standard deviation and save.

    All vectors are stacked in vocabulary iteration order, each column has its
    mean subtracted and is divided by its standard deviation, and the result
    is written as: a "<rows> <cols>" header line (taken from
    ``model.vectors.shape``), then per word the encoded word, a space, the raw
    float32 vector bytes and a newline.

    Args:
        model: object exposing ``vocab`` (iterable of words), ``vectors``
            (2-D array, used for the header) and ``model[word]`` lookup.
        new_filename: destination file; overwritten if it already exists.
    """
    embedding_vocab = list(model.vocab)
    embedding_matrix = np.array([model[each_word] for each_word in tqdm(embedding_vocab)])

    print('The shape of embedding matrix is {}'.format(embedding_matrix.shape))

    column_std = embedding_matrix.std(0)
    # A constant column has zero spread; dividing by 1 leaves it at 0 instead
    # of turning the whole column into NaN/inf.
    column_std = np.where(column_std == 0, 1, column_std)
    norm_embedding_matrix = (embedding_matrix - embedding_matrix.mean(0)) / column_std
    # Force float32 so every record is exactly 4 * columns bytes.
    norm_embedding_matrix = norm_embedding_matrix.astype(np.float32, copy=False)

    # Open the output only once the data is ready; `with` closes it on error too.
    with open(new_filename, 'wb') as out_file:
        out_file.write(str.encode(str(model.vectors.shape[0]) + ' ' + str(model.vectors.shape[1]) + '\n'))

        for word_counter, each_word in enumerate(tqdm(embedding_vocab)):
            out_file.write(str.encode(each_word + ' '))
            out_file.write(norm_embedding_matrix[word_counter].tobytes())
            out_file.write(str.encode('\n'))

## Compute the task score for different dimension sizes

In [None]:
def generate_embedding_path(current_model, embedding_path, binary, antonym_vector, curr_dim):
    """Build the projection for the first ``curr_dim`` ordered antonym rows and write the embedding.

    Takes the first ``curr_dim`` entries of ``antonym_vector`` as row indices
    into the module-level ``antonymy_vector``, computes the pseudo-inverse of
    the transposed selection, and hands it to ``transform_to_antonym_space``
    together with the model, output path and ``binary`` flag.

    Args:
        current_model: model passed through to ``transform_to_antonym_space``.
        embedding_path: output file path passed through unchanged.
        binary: output-format flag passed through unchanged.
        antonym_vector: ordered sequence of row indices into ``antonymy_vector``.
        curr_dim: how many leading indices of ``antonym_vector`` to use.
    """
    selected_rows = antonym_vector[:curr_dim]
    selected_basis = antonymy_vector[selected_rows]
    selected_basis_inverse = np.linalg.pinv(selected_basis.T)
    transform_to_antonym_space(current_model, embedding_path, binary, selected_basis_inverse)

In [None]:
# Create the output directory for the generated embeddings if it is missing.
os.makedirs('../output/polar_glove_embeddings', exist_ok=True)

In [None]:
curr_dim = 500  # Number of POLAR dimensions

# Index ordering produced by each selection strategy, keyed by output-file prefix.
ordering_by_prefix = {
    'rand_antonym_': random_antonym_vector,
    'orthogonal_antonymy_': orthogonal_antonymy_vector,
    'variance_antonymy_': variance_antonymy_vector,
}

for method_name, antonym_vector_method in ordering_by_prefix.items():
    # Step 1: write the embedding built from the first `curr_dim` indices of this ordering.
    embedding_path = f'../output/polar_glove_embeddings/{method_name}'+str(curr_dim)+'.bin'
    generate_embedding_path(current_model, embedding_path, True, antonym_vector_method, curr_dim)

    # Step 2: load it back as a gensim model.
    print('loading the model')
    temp_model = gensim.models.KeyedVectors.load_word2vec_format(embedding_path, binary=True)
    print('loading done..')

    # Step 3: write the standardised version next to it.
    std_nrml_embedding_path = f'../output/polar_glove_embeddings/{method_name}gl_'+str(curr_dim)+'_StdNrml.bin'
    standard_normal_dist_model(temp_model, std_nrml_embedding_path)