In [None]:
# Importing Libraries
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
from collections import namedtuple
from tabulate import tabulate

import torch
import os

from adversarial_debiasing import AdversarialDebiasing
from load_data import load_data, transform_data, Datapoint

from load_vectors import load_pretrained_vectors, load_vectors
import config
import utility_functions

import gensim
import gzip

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits made to the
# project's other Python scripts are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# Relative path to the gzip-compressed word2vec binary; it is opened with gzip
# and parsed by load_word2vec_format in the cells below
WORD2VEC_FILE = "data/GoogleNews-vectors-negative300.bin.gz"

In [None]:
def load_word2vec_format(f, max_num_words=None):
    """Loads word2vec data from a binary file handle.

    Similar to gensim.models.keyedvectors.KeyedVectors.load_word2vec_format
    but takes an already-open binary file handle rather than a filename, so
    any file-like object (e.g. a gzip.GzipFile) can be used. Only the binary
    word2vec format is accepted.

    Args:
      f: binary file handle positioned at the start of the word2vec data.
      max_num_words: number of words to load. If None (or not a positive
        number smaller than the vocabulary size), load all.

    Returns:
      Word2vec data as keyedvectors.EuclideanKeyedVectors, with `vocab`,
      `index2word`, `vector_size` and `syn0` populated.

    Raises:
      ValueError: if the header line is not "<vocab_size> <vector_size>".
      EOFError: if the input ends in the middle of a word or of a vector.
    """
    header = f.readline()
    # throws for invalid file format
    vocab_size, vector_size = (int(x) for x in header.rstrip().split())
    print("vector_size", vector_size)

    if max_num_words and 0 < max_num_words < vocab_size:
        num_embeddings = max_num_words
    else:
        num_embeddings = vocab_size

    result = gensim.models.keyedvectors.EuclideanKeyedVectors()
    result.vector_size = vector_size
    # Allocate only the rows that will actually be read, not the whole vocabulary.
    result.syn0 = np.zeros((num_embeddings, vector_size), dtype=np.float32)

    def add_word(word, weights):
        """Registers `word` with its vector unless it was already seen."""
        if word in result.vocab:
            print("duplicate word '%s', ignoring all but first" % word)
            return
        word_id = len(result.vocab)
        # Earlier words get a larger pseudo-count, mirroring gensim's loader.
        result.vocab[word] = gensim.models.keyedvectors.Vocab(
            index=word_id, count=vocab_size - word_id)
        result.syn0[word_id] = weights
        result.index2word.append(word)

    print("Loading ", num_embeddings, " embeddings")

    binary_len = np.dtype(np.float32).itemsize * vector_size
    for _ in range(num_embeddings):
        # mixed text and binary: read text first, then binary
        word_bytes = []
        while True:
            ch = f.read(1)
            if ch == b' ':
                break
            if ch == b'':
                raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
            if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                word_bytes.append(ch)
        word = gensim.utils.to_unicode(b''.join(word_bytes), encoding='utf-8', errors='strict')

        raw_vector = f.read(binary_len)
        if len(raw_vector) != binary_len:
            # A short read means the file ended inside a vector.
            raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
        add_word(word, np.frombuffer(raw_vector, dtype=np.float32))

    # Rows reserved for duplicate words were never filled; drop them.
    if result.syn0.shape[0] != len(result.vocab):
        print(
            "duplicate words detected, shrinking matrix size from %i to %i"
            % (result.syn0.shape[0], len(result.vocab)))
    result.syn0 = np.ascontiguousarray(result.syn0[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.syn0.shape

    print("loaded %s matrix" % (result.syn0.shape,))
    return result

In [None]:
# Stream-decompress the word2vec binary and parse it. gzip.open owns the
# underlying file and closes it on exit; a file object passed via `fileobj=`
# is NOT closed by GzipFile.close(), which previously leaked the handle.
with gzip.open(WORD2VEC_FILE, "rb") as f:
    word_vectors = load_word2vec_format(f, max_num_words=2000000)

In [None]:
# Load the word-vector dictionary using the paths/flags from the project config.
# NOTE(review): this rebinds `word_vectors`, replacing the word2vec vectors
# loaded in the previous cell — confirm which embedding source is intended.
word_vectors = load_pretrained_vectors(
    config.wiki_embedding_data_path,
    config.save_dir,
    config.wiki_save_file,
    config.use_glove,
)

In [None]:
# Load the Google analogies training dataset through the project's load_data helper
analogy_dataset = load_data()
# Bare last expression: the notebook renders the loaded dataset as the cell output
analogy_dataset

In [None]:
# Attach the word embeddings to every analogy; transform_data also returns the
# gender subspace that later cells project the model weights onto.
transformed_analogy_dataset, gender_subspace = transform_data(
    word_vectors,
    analogy_dataset,
    use_boluk=False,
)

In [None]:
# Sanity check: print the array shapes stored on the first transformed datapoint
first_datapoint = transformed_analogy_dataset[0]
for field_name in ("analogy_embeddings", "gt_embedding", "protected"):
    print(getattr(first_datapoint, field_name).shape)

In [None]:
# Baseline: train the model with debiasing switched off
non_debiased_model = AdversarialDebiasing(
    debias=False,
    num_epochs=50,
)
non_debiased_model.fit(dataset=transformed_analogy_dataset)

In [None]:
# Dot product of the non-debiased model's (transposed) weights with the
# transposed gender subspace
W1 = non_debiased_model.get_model_weights()
non_debiased_weights = W1.detach().numpy()
print(np.dot(non_debiased_weights.T, gender_subspace.T))

In [None]:
# Train the debiasing variant (the constructor's default debias setting is used)
debiased_model = AdversarialDebiasing(
    num_epochs=500,
)
debiased_model.fit(dataset=transformed_analogy_dataset)

In [None]:
# Dot product of the debiased model's (transposed) weights with the
# transposed gender subspace
W1 = debiased_model.get_model_weights()
debiased_weights = W1.detach().numpy()
print(np.dot(debiased_weights.T, gender_subspace.T))

In [None]:
# Examples to test the models upon: every usable line of the file contributes
# its word list to `test_analogies` and one flattened embedding row to `datapoints`
datapoints, test_analogies = [], []
with open(os.path.join('data', 'sexism-traps.txt'), 'r') as f:
    # Iterate the file lazily instead of materialising it with readlines()
    for line in f:
        words = line.split()
        # Skip blank lines (split() returns [] and words[0] would raise
        # IndexError) as well as lines whose first token is ':'
        if not words or words[0] == ':':
            continue
        test_analogies.append(words)
        # Look up the embeddings of all words on the line and flatten them into one row
        word_embeddings = np.reshape(word_vectors[words], (1, -1))
        datapoints.append(word_embeddings)
datapoints = np.vstack(datapoints)
print(datapoints.shape)

In [None]:
# Qualitative evaluation of the non-debiased model: predict on the test rows,
# then look up the most similar entries for each prediction
non_debiased_predictions = non_debiased_model.predict(datapoints)
non_debiased_most_similar_list = utility_functions.obtain_most_similar(
    non_debiased_predictions,
    word_vectors,
)

In [None]:
# One (Neighbor, Similarity) table per test analogy for the non-debiased model;
# the first entry of every result list is left out
non_debiased_most_similar_list_data_frames = [
    pd.DataFrame(
        non_debiased_most_similar_list[position][1:],
        columns=['Neighbor', 'Similarity'],
    )
    for position in range(len(non_debiased_most_similar_list))
]

In [None]:
# Qualitative evaluation of the debiased model: predict on the test rows,
# then look up the most similar entries for each prediction
debiased_predictions = debiased_model.predict(datapoints)
debiased_most_similar_list = utility_functions.obtain_most_similar(
    debiased_predictions,
    word_vectors,
)

In [None]:
# One (Neighbor, Similarity) table per test analogy for the debiased model;
# the first entry of every result list is left out
debiased_most_similar_list_data_frames = [
    pd.DataFrame(
        debiased_most_similar_list[position][1:],
        columns=['Neighbor', 'Similarity'],
    )
    for position in range(len(debiased_most_similar_list))
]

In [None]:
# Combining the results of both model variants into one side-by-side table per
# test analogy, printed under its "a : b :: c :" header
iterables = [['Biased', 'Debiased'], ['Neighbour', 'Similarity']]
index = pd.MultiIndex.from_product(iterables)
overall_data_frames_list = []
for i in range(len(non_debiased_most_similar_list)):
    print("{} : {} :: {} : ".format(test_analogies[i][0], test_analogies[i][1], test_analogies[i][2]))
    # Pair the entries of both lists after skipping the first one, matching the
    # per-model tables built from `[1:]`. The previous loop took its length
    # from the sliced list but indexed the unsliced one, so it showed entry 0
    # and silently dropped the last entry.
    overall_list = [
        [biased[0], round(biased[1], 3), debiased[0], round(debiased[1], 3)]
        for biased, debiased in zip(
            non_debiased_most_similar_list[i][1:],
            debiased_most_similar_list[i][1:],
        )
    ]
    temp_df = pd.DataFrame(overall_list, columns = index)
    print(tabulate(temp_df, headers = ['Biased\nNeighbour', 'Biased\nSimilarity', 'Debiased\nNeighbour', 'Debiased\nSimilarity'], tablefmt = 'psql', showindex = False))
    overall_data_frames_list.append(temp_df)

In [None]:
# Fake dataset for testing purposes

# embedding_dim = 100
# analogy_dataset = [
#     Datapoint(
#     analogy_embeddings=np.random.normal(0, 1, size=(3 * embedding_dim, 1)), 
#     gt_embedding=np.random.normal(0, 1, size=(embedding_dim, 1)),
#     protected_embedding=np.random.uniform(0, 1, size=(1))) for n in range(0, 1000)
# ]