In [1]:
# Importing Libraries
from collections import defaultdict
from pathlib import Path
import numpy as np
import pandas as pd
from collections import namedtuple

import torch
import os

from adversarial_debiasing import AdversarialDebiasing
from load_data import load_data, transform_data, Datapoint

from load_vectors import load_pretrained_vectors, load_vectors
import config
import utility_functions

In [2]:
# For autoreloading changes made in other python scripts
%load_ext autoreload
%autoreload 2

In [3]:
# Loading the word vectors dictionary
word_vectors = load_pretrained_vectors(config.glove_embedding_data_path, config.save_dir, config.glove_save_file, \
                                       config.use_glove)

Loading from saved file.


In [4]:
# Testing the word vectors dictionary
temp = word_vectors['athens', 'greece']
print(temp.shape)

(2, 100)


In [5]:
# Load the google analogies training dataset:
analogy_dataset = load_data()
analogy_dataset

task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Banjul', y='Gambia', task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Beijing', y='China', task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Beirut', y='Lebanon', task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Belgrade', y='Serbia', task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Belmopan', y='Belize', task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Berlin', y='Germany', task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Bern', y='Switzerland', task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Bishkek', y='Kyrgyzstan', task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Bratislava', y='Slovakia', task='capital-world'),
 Raw_Datapoint(x1='Ashgabat', x2='Turkmenistan', x3='Brussels', y='Belgium', task='c

In [6]:
# Transform the data such that it includes the embeddings
transformed_analogy_dataset = transform_data(word_vectors, analogy_dataset)

In [7]:
# Testing the transformed analogy dataset
print(transformed_analogy_dataset[0].analogy_embeddings.shape)
print(transformed_analogy_dataset[0].gt_embedding.shape)
print(transformed_analogy_dataset[0].protected_embedding.shape)

(300,)
(100,)
(100,)


In [8]:
# Now we fit a dataset.

# embedding_dim = 100
# analogy_dataset = [
#     Datapoint(
#     analogy_embeddings=np.random.normal(0, 1, size=(3 * embedding_dim, 1)), 
#     gt_embedding=np.random.normal(0, 1, size=(embedding_dim, 1)),
#     protected_embedding=np.random.uniform(0, 1, size=(1))) for n in range(0, 1000)
# ]



model = AdversarialDebiasing()
model.fit(dataset=transformed_analogy_dataset)


poch 34; iter: 120; batch classifier loss: 0.366347; batch adversarial loss: 0.002427
epoch 34; iter: 130; batch classifier loss: 0.354348; batch adversarial loss: 0.003420
epoch 34; iter: 140; batch classifier loss: 0.315832; batch adversarial loss: 0.003015
[35/50] Running epoch
epoch 35; iter: 0; batch classifier loss: 0.355837; batch adversarial loss: 0.003152
epoch 35; iter: 10; batch classifier loss: 0.333980; batch adversarial loss: 0.002701
epoch 35; iter: 20; batch classifier loss: 0.308434; batch adversarial loss: 0.003043
epoch 35; iter: 30; batch classifier loss: 0.338200; batch adversarial loss: 0.002599
epoch 35; iter: 40; batch classifier loss: 0.346206; batch adversarial loss: 0.003001
epoch 35; iter: 50; batch classifier loss: 0.357855; batch adversarial loss: 0.003464
epoch 35; iter: 60; batch classifier loss: 0.372433; batch adversarial loss: 0.004840
epoch 35; iter: 70; batch classifier loss: 0.352413; batch adversarial loss: 0.002970
epoch 35; iter: 80; batch class

<adversarial_debiasing.AdversarialDebiasing at 0x1f363cfb308>

In [15]:
# Examples to test the model upon
datapoints = []
test_analogies = []
with open(os.path.join('data', 'sexism-traps.txt'), 'r') as f:
    # Reading each line
    for line in f.readlines():
        words = line.split()
        if words[0] == ':':
            continue
        test_analogies.append(words)
        word_embeddings = word_vectors[words]
        word_embeddings = np.reshape(word_embeddings, (1, -1))
        datapoints.append(word_embeddings)
datapoints = np.vstack(datapoints)
print(datapoints.shape)

(20, 300)


In [16]:
# Qualitative evaluation of the debiased model
predictions = model.predict(datapoints)
most_similar_list = utility_functions.obtain_most_similar(predictions, word_vectors)

In [18]:
# Displaying the similarity list
for i in range(len(most_similar_list)):
    print("***********************************")
    print("{} : {} :: {} : ".format(test_analogies[i][0], test_analogies[i][1], test_analogies[i][2]))
    print(most_similar_list[i][1:])

***********************************
he : strong :: she :
[('heart', 0.6577297449111938), ('stay', 0.6502606868743896), ('keep', 0.6502240300178528), ('calm', 0.6462597250938416), ('always', 0.6256045699119568), ('loving', 0.6221731305122375), ('girl', 0.6160261631011963), ('woman', 0.6112252473831177), ('matter', 0.60118567943573)]
***********************************
he : boss :: she :
[('girl', 0.6676667928695679), ('she', 0.6600161194801331), ('tell', 0.6512656807899475), ('momma', 0.6486970782279968), ('wife', 0.6418390274047852), ('sure', 0.6385275721549988), ('mad', 0.6321344375610352), ('big', 0.6319332122802734), ('daddy', 0.6306756734848022)]
***********************************
he : company :: she :
[('business', 0.7567593455314636), ('companies', 0.6973552703857422), ('industry', 0.6901980638504028), ('group', 0.667564868927002), ('brand', 0.6667884588241577), ('customer', 0.6633648872375488), ('works', 0.6611303687095642), ('partners', 0.653174638748169), ('looking', 0.647621