#### Importing Libraries

In [9]:
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
from collections import namedtuple
from tabulate import tabulate

import torch
import os

from adversarial_debiasing import AdversarialDebiasing
from load_data import load_data, transform_data, Datapoint

from load_vectors import load_pretrained_vectors, load_vectors
import config
import utility_functions
import qualitative_evaluation

import gensim
import gzip
import pickle

#### Autoreloading changes made in other python scripts

In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Loading the desired word vectors

In [24]:
# Select which pre-trained embedding set to load by passing its name:
# For Wikipedia2Vec - pass "Wikipedia2Vec"
# For Glove - pass "Glove"
# For GoogleNews (Word2Vec) - pass "GoogleNews"

# NOTE(review): presumably this choice should correspond to the
# `word_embedding_type` selected further down for the pre-trained weights
# ("GoogleNews" here vs. "GNews" there) - confirm the two are kept in sync.
word_vectors = load_pretrained_vectors("GoogleNews")

#### Loading the Google Analogies Training Dataset

In [25]:
# Read the analogy dataset, then preview its first six entries
analogy_dataset = load_data()
analogy_dataset[:6]

[Raw_Datapoint(x1='Athens', x2='Greece', x3='Baghdad', y='Iraq', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Bangkok', y='Thailand', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Beijing', y='China', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Berlin', y='Germany', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Bern', y='Switzerland', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Cairo', y='Egypt', task='capital-common-countries')]

#### Transforming the above dataset to include the respective word embeddings

In [26]:
# Convert the raw analogies using the loaded word vectors; the call's result is
# unpacked into the transformed dataset and `gender_subspace`.
transformed_analogy_dataset, gender_subspace = transform_data(
    word_vectors,
    analogy_dataset,
    use_boluk=False,
)

# Preview the first transformed datapoint
transformed_analogy_dataset[0]

Datapoint(analogy_embeddings=array([-6.68945312e-02, -3.49121094e-02,  6.05468750e-02,  1.48437500e-01,
        1.80664062e-02,  1.34887695e-02, -2.09960938e-01, -2.50000000e-01,
       -1.62109375e-01,  1.82617188e-01, -1.88476562e-01, -1.99218750e-01,
       -1.69921875e-01, -7.86132812e-02, -1.85394287e-03,  2.64892578e-02,
       -1.18652344e-01,  6.93359375e-02,  5.44433594e-02, -1.07421875e-01,
       -2.80761719e-02,  3.11279297e-02,  2.21679688e-01, -2.41210938e-01,
       -1.60156250e-01, -5.15747070e-03, -9.61914062e-02,  1.77734375e-01,
       -3.85742188e-02,  1.15722656e-01,  1.61132812e-01, -1.25976562e-01,
       -4.90722656e-02, -2.38281250e-01, -1.10351562e-01, -1.59912109e-02,
       -1.00097656e-01,  1.10473633e-02,  7.17773438e-02,  1.21093750e-01,
       -7.42187500e-02,  3.23486328e-03,  1.25000000e-01,  4.17480469e-02,
        1.10839844e-01,  4.32128906e-02, -1.08032227e-02, -1.89453125e-01,
       -9.71679688e-02,  1.46484375e-01, -1.13769531e-01,  3.04687500e-

#### Obtaining the dimensionality of the word embeddings

In [27]:
# Size of the first axis of the first datapoint's ground-truth embedding;
# used below as the dimensionality of a single word embedding.
word_embedding_dim = transformed_analogy_dataset[0].gt_embedding.shape[0]

#### Test the dimensions of the transformed analogy dataset components

In [28]:
# Sanity checks on the first transformed datapoint:
#   - analogy_embeddings has three times the single-embedding length
#   - gt_embedding has the single-embedding length
#   - protected has exactly one entry
assert transformed_analogy_dataset[0].analogy_embeddings.shape[0] == 3 * word_embedding_dim
assert transformed_analogy_dataset[0].gt_embedding.shape[0] == word_embedding_dim
assert transformed_analogy_dataset[0].protected.shape[0] == 1

# Show the three shapes, one per line
print(
    transformed_analogy_dataset[0].analogy_embeddings.shape,
    transformed_analogy_dataset[0].gt_embedding.shape,
    transformed_analogy_dataset[0].protected.shape,
    sep="\n",
)

(900,)
(300,)
(1,)


In [None]:
# Optional step, disabled by default: every line in this cell is commented out.
# Uncomment the statements below to run the hyper-parameter grid search.
# # To run the grid-search and obtain the np.dot(w.T, g) values
# learning_rate_list = [2 ** -12, 2 ** -6, 2 ** -3]
# adversary_loss_weight_list = [1.0, 0.5, 0.1]

# # For the saved model checkpoints pertaining to the word embedding type
# word_embedding_type = 'GNews'

# # Performing the grid search
# utility_functions.grid_search(learning_rate_list, adversary_loss_weight_list, word_embedding_dim, gender_subspace, transformed_analogy_dataset, word_embedding_type, 'models')

#### Flag indicating whether to use a pre-trained model or to train a model from scratch

In [29]:
# True  -> the `if use_pretrained:` cell below loads saved W1 weights.
# False -> the `if not use_pretrained:` cell below trains both models instead.
use_pretrained = True

#### If you are using a pre-trained model, specify the type of word embeddings on which that model was trained

In [30]:
# Identifier of the embeddings the saved model was trained on:
# "GNews" for Google News (Word2Vec)
# "WikipediaVec" for Wikipedia2Vec
# The value is used as a dictionary key when looking up the saved "W1" weights
# in the pre-trained cell, so it must match a key present in that dictionary.

word_embedding_type = "GNews"

#### In case of using a pre-trained model

In [55]:
if use_pretrained:

    # Fetch the dictionary of saved parameters (called with the 'models' argument)
    pretrained_parameters = utility_functions.obtain_trained_parameters('models')

    # --- Non-debiased model ---
    # Look up its stored W1 for the selected embedding type, build the model
    # with debias switched off, and plug the stored weights in.
    non_debiased_W1 = pretrained_parameters['non_debiased'][word_embedding_type]["W1"]
    non_debiased_model = AdversarialDebiasing(
        word_embedding_dim=word_embedding_dim,
        debias=False,
    )
    non_debiased_model.W1 = non_debiased_W1

    # --- Debiased model ---
    # Same procedure, using the 'debiased' entry and debias switched on.
    debiased_W1 = pretrained_parameters['debiased'][word_embedding_type]["W1"]
    debiased_model = AdversarialDebiasing(
        word_embedding_dim=word_embedding_dim,
        debias=True,
    )
    debiased_model.W1 = debiased_W1

#### In case of using a model trained from scratch

In [None]:
if not use_pretrained:

    # Best parameters from grid search
    best_adversary_loss_weight = 0.1
    best_learning_rate = 2 ** -6

    # Hyper-parameters shared by both models. Defining them once guarantees the
    # non-debiased and debiased models are trained under identical settings and
    # differ only in the `debias` flag (previously the list was copy-pasted).
    shared_hyperparameters = dict(
        word_embedding_dim=word_embedding_dim,
        num_epochs=500,
        gender_subspace=gender_subspace,
        batch_size=256,
        adversary_loss_weight=best_adversary_loss_weight,
        classifier_learning_rate=best_learning_rate,
        adversary_learning_rate=best_learning_rate,
    )

    # Creating an instance of the non-debiased model
    non_debiased_model = AdversarialDebiasing(debias=False, **shared_hyperparameters)

    # Fitting the non-debiased model to the training dataset
    print("****************** Training the non-debiased model ********************")
    non_debiased_model.fit(dataset=transformed_analogy_dataset)

    # Creating an instance of the debiased model
    debiased_model = AdversarialDebiasing(debias=True, **shared_hyperparameters)

    # Fitting the debiased model to the training dataset
    print("****************** Training the debiased model ********************")
    debiased_model.fit(dataset=transformed_analogy_dataset)

#### Qualitative Evaluation

In [59]:
# Qualitative comparison of the non-debiased and debiased predictions on the
# first test analogy only (`[datapoints[0]]`).
# NOTE(review): the stored output of this cell is a KeyboardInterrupt, i.e. the
# recorded run was interrupted before finishing - re-run to obtain the table.

# Get sexism traps as word embeddings and words
datapoints, test_analogies = qualitative_evaluation.get_datapoints(word_vectors)

# Handling memory issues
# NOTE(review): this is called after `datapoints` were extracted and before the
# similarity look-ups, so statement order matters here. Per the gensim docs,
# `replace=True` overwrites the stored vectors with their normalised versions -
# confirm against the installed gensim version (newer releases deprecate it).
word_vectors.init_sims(replace = True)

# Predictions of the non debiased model
# NOTE(review): these come from a helper in `qualitative_evaluation`, not from
# the `non_debiased_model` instance created earlier - presumably intentional;
# confirm.
non_debiased_predictions = qualitative_evaluation.get_non_debiased_predictions([datapoints[0]], word_embedding_dim)
non_debiased_most_similar_list = utility_functions.obtain_most_similar(non_debiased_predictions, word_vectors)

# Predictions of the debiased model
debiased_predictions = debiased_model.predict([datapoints[0]])
debiased_most_similar_list = utility_functions.obtain_most_similar(debiased_predictions, word_vectors)

# Print similarity results for both models
qualitative_evaluation.print_combined_table(non_debiased_most_similar_list, debiased_most_similar_list, test_analogies)

KeyboardInterrupt: 