In [82]:
# Importing Libraries
from collections import defaultdict
from operator import itemgetter
from pathlib import Path
import numpy as np
import pandas as pd
from collections import namedtuple
from tabulate import tabulate
import re 

import torch
import os

from adversarial_debiasing import AdversarialDebiasing
from load_data import load_data, transform_data, Datapoint

from load_vectors import load_pretrained_vectors, load_vectors
import config
import utility_functions
import qualitative_evaluation

import gensim
import gzip
import pickle

[autoreload of load_vectors failed: Traceback (most recent call last):
  File "/home/max/anaconda3/envs/fact/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/max/anaconda3/envs/fact/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/home/max/anaconda3/envs/fact/lib/python3.7/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/home/max/anaconda3/envs/fact/lib/python3.7/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 630, in _exec
  File "<frozen importlib._bootstrap_external>", line 728, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/home/max/develop/master_develop/FACT-AI/load_vectors.py", line 8, in <module>
    import wget
ModuleNotFoundError: No module named 'wget'
]


In [77]:
# For autoreloading changes made in other python scripts
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
# Loading the word vectors dictionary
# For Wikipedia2Vec - use config.wiki_embedding_data_path and config.wiki_embedding_type
# For Glove - use config.glove_embedding_data_path and config.glove_embedding_type
# For GoogleNews (Word2Vec) - use config.google_embedding_data_path and config.google_embedding_type
word_vectors = load_pretrained_vectors(config.google_embedding_data_path, config.save_dir, config.google_embedding_type)


FileNotFoundError: [Errno 2] No such file or directory: 'data/GoogleNews-vectors-negative300.bin.gz'

In [46]:
# Load the google analogies training dataset:
analogy_dataset = load_data()
analogy_dataset[0:10]

[Raw_Datapoint(x1='Athens', x2='Greece', x3='Baghdad', y='Iraq', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Bangkok', y='Thailand', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Beijing', y='China', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Berlin', y='Germany', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Bern', y='Switzerland', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Cairo', y='Egypt', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Canberra', y='Australia', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Hanoi', y='Vietnam', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Havana', y='Cuba', task='capital-common-countries'),
 Raw_Datapoint(x1='Athens', x2='Greece', x3='Helsinki', y='Finland', task='capital-common-cou

In [48]:
# Transform the data such that it includes the embeddings of the words in consideration
transformed_analogy_dataset, gender_subspace = transform_data(word_vectors, analogy_dataset, use_boluk = False)


# Obtaining the dimensionality of the word embeddings
word_embedding_dim = transformed_analogy_dataset[0].gt_embedding.shape[0]

# Testing the transformed analogy dataset
assert transformed_analogy_dataset[0].analogy_embeddings.shape[0] == word_embedding_dim * 3
assert transformed_analogy_dataset[0].gt_embedding.shape[0] == word_embedding_dim
assert transformed_analogy_dataset[0].protected.shape[0] == 1

print(transformed_analogy_dataset[0].analogy_embeddings.shape)
print(transformed_analogy_dataset[0].gt_embedding.shape)
print(transformed_analogy_dataset[0].protected.shape)

NameError: name 'word_vectors' is not defined

In [None]:
# To run the grid-search and obtain the np.dot(w.T, g) values
learning_rate_list = [2 ** -12, 2 ** -6, 2 ** -3]
adversary_loss_weight_list = [1.0, 0.5, 0.1]

# For the saved model checkpoints pertaining to the word embedding type
word_embedding_type = 'GNews'

# Performing the grid search
utility_functions.grid_search(learning_rate_list, adversary_loss_weight_list, word_embedding_dim, gender_subspace, transformed_analogy_dataset, word_embedding_type, 'models')


In [84]:
def load_model(model_path: Path, word_embedding_dim, gender_subspace):
    # with open(str(model_path), "rb") as f:
    #     state_dict = pickle.load(f)
    # device = torch.device('cpu')
    # print(model_path)
    state_dict = torch.load(str(model_path), map_location=torch.device('cpu'))
    model = AdversarialDebiasing(
                    seed = 42,
                    word_embedding_dim = word_embedding_dim,
                    num_epochs = 500,
                    debias = False,
                    gender_subspace = gender_subspace,
                    batch_size = 256,
                    adversary_loss_weight = 0.1,
                    classifier_learning_rate = 2 ** -6,
                    adversary_learning_rate = 2 ** -6
                )
    
    model.W1 = state_dict["W1"]
    model.W2 = state_dict["W2"]
    
    return model

debiased_models = defaultdict(list)
non_debiased_models = defaultdict(list)


ModelResult = namedtuple('ModelResult', ['best_model', 'last_model', 'embedding_type', 'learning_rate', 'adversary_weight', 'debiased'])

for model_base_path in [Path('models/debiased'), Path('models/non_debiased')]:
    l, debiased = (non_debiased_models, False) if 'non_debiased' in str(model_base_path) else (debiased_models, True)

    for model_path in model_base_path.iterdir():
        if '_last' in str(model_path):
            continue
            
        m = re.search('^([A-Za-z]+)_([\d.]+)_([\d.]+)(_last){0,1}.pckl$', str(model_path.name))
        embeddings = m.group(1)
        learning_rate = m.group(2)
        adversary_weight = m.group(3)
        
        best_model = load_model(model_path, word_embedding_dim, gender_subspace)
        
        last_model_path = model_path.parent / f"{model_path.stem}_last{model_path.suffix}"
        last_model = load_model(last_model_path, word_embedding_dim, gender_subspace)
        
        
        l[embeddings].append(ModelResult(best_model, last_model, embeddings, learning_rate, adversary_weight, debiased))
        
        

In [85]:
print(debiased_models, len(debiased_models))
print(non_debiased_models, len(non_debiased_models))


defaultdict(<class 'list'>, {'GNews': [ModelResult(best_model=<adversarial_debiasing.AdversarialDebiasing object at 0x7f5e7a5fab50>, last_model=<adversarial_debiasing.AdversarialDebiasing object at 0x7f5e7a68e150>, embedding_type='GNews', learning_rate='0.125', adversary_weight='1.0', debiased=True), ModelResult(best_model=<adversarial_debiasing.AdversarialDebiasing object at 0x7f5e7921b910>, last_model=<adversarial_debiasing.AdversarialDebiasing object at 0x7f5e7921b550>, embedding_type='GNews', learning_rate='0.000244140625', adversary_weight='1.0', debiased=True), ModelResult(best_model=<adversarial_debiasing.AdversarialDebiasing object at 0x7f5e8010a390>, last_model=<adversarial_debiasing.AdversarialDebiasing object at 0x7f5e7a2b1410>, embedding_type='GNews', learning_rate='0.015625', adversary_weight='0.5', debiased=True), ModelResult(best_model=<adversarial_debiasing.AdversarialDebiasing object at 0x7f5e7bb70b90>, last_model=<adversarial_debiasing.AdversarialDebiasing object at 0

In [89]:
# Last model
learning_rates = list(set(model.learning_rate for models in debiased_models.values() for model in models)) 
adversary_weights = list(set(model.learning_rate for models in debiased_models.values() for model in models)) 

adversary_weights = sorted(adversary_weights)
learning_rates = sorted(learning_rates)

box_df_debiased = pd.DataFrame([], columns=learning_rates, index=adversary_weights)

for model_result in debiased_models['GNews']:
    box_df_debiased.loc[model_result.adversary_weight, model_result.learning_rate] = np.dot(model_result.last_model.W1.clone().detach().numpy().T, gender_subspace.T).item()

box_df_debiased

Unnamed: 0,0.000244140625,0.015625,0.125
0.000244140625,,,
0.015625,,,
0.125,,,
1.0,0.0,0.0,0.0
0.5,0.0,0.0,0.0
0.1,0.0,0.0,0.0


In [83]:
# Best model
box_df_debiased = pd.DataFrame([], columns=learning_rates, index=adversary_weights)

for model_result in debiased_models:
    box_df_debiased.loc[model_result.adversary_weight, model_result.learning_rate] = np.dot(model_result.best_model.W1.clone().detach().numpy().T, gender_subspace).item()

box_df_debiased


Unnamed: 0,0.000244140625,0.015625,0.125
0.1,0.0,0.0,0.0
0.5,0.0,0.0,0.0
1.0,0.0,0.0,0.0


In [81]:
# 
box_df_biased = pd.DataFrame([], columns=learning_rates, index=adversary_weights)

for model_result in debiased_models:
    box_df_biased.loc[model_result.adversary_weight, model_result.learning_rate] = np.dot(model_result.last_model.W1.clone().detach().numpy().T, gender_subspace).item()

box_df_biased



# debiased_model_best.W1 = best_state_dict["W1"]
# debiased_model_best.W2 = best_state_dict["W2"]
# debiased_model_last.W1 = last_state_dict["W1"]
# debiased_model_last.W2 = last_state_dict["W2"]
# 
# print("Best : {}".format(np.dot(debiased_model_best.W1.clone().detach().cpu().numpy().T, gender_subspace.T)))
# 
# print("Last : {}".format(np.dot(debiased_model_last.W1.clone().detach().cpu().numpy().T, gender_subspace.T)))


Unnamed: 0,0.000244140625,0.015625,0.125
0.1,0.0,0.0,0.0
0.5,0.0,0.0,0.0
1.0,0.0,0.0,0.0


In [None]:
debiased_model.fit(dataset=transformed_analogy_dataset)

#### Qualitative Evaluation

In [None]:

# Get sexism traps as word embeddings and words
datapoints, test_analogies = qualitative_evaluation.get_datapoints(word_vectors)

# Predictions of the non debiased model
non_debiased_predictions = qualitative_evaluation.get_non_debiased_predictions(datapoints, word_embedding_dim)
non_debiased_most_similar_list = utility_functions.obtain_most_similar(non_debiased_predictions, word_vectors)

# Predictions of the debiased model
debiased_predictions = debiased_model.predict(datapoints)
debiased_most_similar_list = utility_functions.obtain_most_similar(debiased_predictions, word_vectors)

# Print similarity results for both models
qualitative_evaluation.print_combined_table(non_debiased_most_similar_list, debiased_most_similar_list, test_analogies)