#### Importing Libraries

In [1]:
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
from collections import namedtuple
from tabulate import tabulate

import torch
import os

from adversarial_debiasing import AdversarialDebiasing
from load_data import load_data, transform_data, Datapoint

from load_vectors import load_pretrained_vectors, load_vectors
import config
import utility_functions
import qualitative_evaluation

import gensim
import gzip
import pickle

#### Autoreloading changes made in other python scripts

In [2]:
%load_ext autoreload
%autoreload 2

#### Loading the desired word vectors

In [3]:
# For Wikipedia2Vec - pass "Wikipedia2Vec"
# For Glove - pass "Glove"
# For GoogleNews (Word2Vec) - pass "GoogleNews"

word_vectors = load_pretrained_vectors("Glove")

Loading from saved file.


#### Loading the Google Analogies Training Dataset

In [4]:
analogy_dataset = load_data()
print("Total Number of Analogies : {}".format(len(analogy_dataset)))
print("The first 5 analogies:")
analogy_dataset[0:6]

Total Number of Analogies : 19544
The first 5 analogies:


[Raw_Datapoint(x1='Athens', x2='Greece', x3='Baghdad', y='Iraq', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Bangkok', y='Thailand', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Beijing', y='China', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Berlin', y='Germany', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Bern', y='Switzerland', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Cairo', y='Egypt', task='capital-common-countries')]

#### Transforming the above dataset to include the respective word embeddings

In [5]:
transformed_analogy_dataset, gender_subspace = transform_data(word_vectors, analogy_dataset, use_boluk = False)
print("Transformed datapoint for the first analogy")
transformed_analogy_dataset[0]

Transformed datapoint for the first analogy


Datapoint(analogy_embeddings=array([ 1.27120003e-01,  2.20400006e-01,  3.34540009e-01, -5.78999996e-01,
       -6.81469977e-01,  2.51949996e-01, -7.63480008e-01, -5.78790009e-01,
       -2.91530013e-01,  3.86209995e-01, -3.96910012e-01,  3.20870012e-01,
       -9.76779982e-02,  3.21809985e-02,  1.18600003e-01,  3.22279990e-01,
       -3.06719989e-01, -4.63580012e-01, -1.31380007e-01, -5.71169972e-01,
       -2.21819997e-01,  3.22770000e-01, -6.74659982e-02, -4.50760007e-01,
       -2.54599988e-01, -8.35819989e-02,  2.44670004e-01,  4.41489995e-01,
       -2.03150004e-01, -1.37189999e-01, -5.03970027e-01, -9.37030017e-02,
        9.36819986e-02,  3.27230006e-01,  5.13209999e-01, -5.35640001e-01,
        2.68350005e-01, -2.15599999e-01,  1.25379995e-01, -1.73810005e-01,
        5.55570006e-01, -1.00089997e-01,  7.84850001e-01, -1.97459996e-01,
       -1.11900002e-01, -8.64040013e-03,  6.45210028e-01,  2.02199996e-01,
        2.98460007e-01, -3.47479992e-02,  2.13060006e-01,  4.60249990e-

#### Obtaining the dimensionality of the word embeddings

In [6]:
word_embedding_dim = transformed_analogy_dataset[0].gt_embedding.shape[0]
print("Dimensions of the word embedding : {}".format(word_embedding_dim))

Dimensions of the word embedding : 300


#### Test the dimensions of the transformed analogy dataset components

In [7]:
# Testing the transformed analogy dataset
assert transformed_analogy_dataset[0].analogy_embeddings.shape[0] == word_embedding_dim * 3
assert transformed_analogy_dataset[0].gt_embedding.shape[0] == word_embedding_dim
assert transformed_analogy_dataset[0].protected.shape[0] == 1

print("Dimensions of the network input : {}".format(transformed_analogy_dataset[0].analogy_embeddings.shape))
print("Dimensions of the ground-truth embedding : {}".format(transformed_analogy_dataset[0].gt_embedding.shape))
print("Dimensions of the ground-truth protected variable : {}".format(transformed_analogy_dataset[0].protected.shape))

Dimensions of the network input : (900,)
Dimensions of the ground-truth embedding : (300,)
Dimensions of the ground-truth protected variable : (1,)


#### Flag to indicate whether you want to use a pre-trained model or you want to train a model from scratch

In [8]:
use_pretrained = True

#### In case you want to use a pre-trained model, then specify the type of word embeddings upon which the model was trained

In [9]:
# "GNews" for Google News (Word2Vec)
# "WikipediaVec" for Wikipedia2Vec
# "Glove" for Glove vectors

word_embedding_type = "Glove"

#### In case of using a pre-trained model

In [10]:
if use_pretrained:
    
    # Obtaining the saved parameters dictionary
    pretrained_parameters = utility_functions.obtain_trained_parameters('models')
    
    # Obtaining the best weights for the non-debiased model
    non_debiased_W1 = pretrained_parameters['non_debiased'][word_embedding_type]["W1"]
    
    # Creating an instance of the non-debiased model
    non_debiased_model = AdversarialDebiasing(word_embedding_dim = word_embedding_dim, debias = False)
    non_debiased_model.W1 = non_debiased_W1
    
    # Obtaining the best weights for the debiased model
    debiased_W1 = pretrained_parameters['debiased'][word_embedding_type]["W1"]
    
    # Creating an instance of the debiased model
    debiased_model = AdversarialDebiasing(word_embedding_dim = word_embedding_dim, debias = True)
    debiased_model.W1 = debiased_W1

#### In case of using a model trained from scratch

In [None]:
if not use_pretrained:
    
    # Best parameters from grid search
    best_adversary_loss_weight = 1.0
    best_learning_rate = 2 ** -6
    
    # Creating an instance of the non-debiased model
    non_debiased_model = AdversarialDebiasing(word_embedding_dim = word_embedding_dim, num_epochs = 500, debias = False, \
                                             gender_subspace = gender_subspace, batch_size = 256, \
                                              adversary_loss_weight = best_adversary_loss_weight, \
                                             classifier_learning_rate = best_learning_rate, \
                                             adversary_learning_rate = best_learning_rate)
    
    # Fitting the non-debiased model to the training dataset
    print("****************** Training the non-debiased model ********************")
    non_debiased_model.fit(dataset = transformed_analogy_dataset)
    
    # Creating an instance of the debiased model
    debiased_model = AdversarialDebiasing(word_embedding_dim = word_embedding_dim, num_epochs = 500, debias = True, \
                                             gender_subspace = gender_subspace, batch_size = 256, \
                                              adversary_loss_weight = best_adversary_loss_weight, \
                                             classifier_learning_rate = best_learning_rate, \
                                             adversary_learning_rate = best_learning_rate)
    
    Fitting the non-debiased model to the training dataset
    print("****************** Training the debiased model ********************")
    debiased_model.fit(dataset = transformed_analogy_dataset)

In [11]:
# Handling memory issues
word_vectors.init_sims(replace = True)

#### Qualitative Evaluation (on some evaluation analogies)

In [12]:
# Get sexism traps as word embeddings and words
datapoints, test_analogies = qualitative_evaluation.get_datapoints(word_vectors)

# Predictions of the non debiased model
non_debiased_predictions = qualitative_evaluation.get_non_debiased_predictions(datapoints, word_embedding_dim)
non_debiased_most_similar_list = utility_functions.obtain_most_similar(non_debiased_predictions, word_vectors)

# Predictions of the debiased model
debiased_predictions = debiased_model.predict(datapoints)
debiased_most_similar_list = utility_functions.obtain_most_similar(debiased_predictions, word_vectors)

# Print similarity results for both models
qualitative_evaluation.print_combined_table(non_debiased_most_similar_list, debiased_most_similar_list, test_analogies)

he : doctor :: she : 
+-------------+--------------+---------------+--------------+
| Biased      |       Biased | Debiased      |     Debiased |
| Neighbour   |   Similarity | Neighbour     |   Similarity |
|-------------+--------------+---------------+--------------|
| nurse       |        0.653 | mordrid       |        0.544 |
| doctors     |        0.652 | gynecologist  |        0.485 |
| physician   |        0.627 | midwife       |        0.48  |
| pregnant    |        0.618 | kerish        |        0.467 |
| pregnancy   |        0.589 | obstetrician  |        0.464 |
| she         |        0.584 | nurse         |        0.462 |
| midwife     |        0.568 | pediatrician  |        0.454 |
| her         |        0.566 | naturopath    |        0.451 |
| patient     |        0.565 | dermatologist |        0.45  |
+-------------+--------------+---------------+--------------+
he : director :: she : 
+-------------+--------------+--------------------+--------------+
| Biased      |    