### Gender mitigated word embedding using adversarial feature learning

#### In this project, we try to mitigate the gender information encoded in word embeddings, building on [GloVe](https://nlp.stanford.edu/projects/glove/) and [Adversarial feature learning](https://arxiv.org/abs/1705.11122).

#### 1 GloVe Model

In [1]:
import torch as t
from nltk.tokenize import word_tokenize
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.init import xavier_normal
import torch.optim as optim

In [109]:
class GloVe(nn.Module):
    """GloVe model trained on a dense co-occurrence matrix.

    Each word owns an "input" and an "output" embedding vector plus a scalar
    bias of each kind. ``forward`` returns the weighted least-squares GloVe
    loss for a batch of (input word, output word) index pairs.
    """

    def __init__(self, co_oc, embedding_size, x_max=100, alpha=0.75, verbose=True):
        """
        co_oc: square co-occurrence ndarray of shape (vocabulary, vocabulary)
        embedding_size: dimensionality of the word vectors
        x_max: co-occurrence value above which a pair gets the full weight 1
        alpha: exponent of the weighting function for values up to x_max
        verbose: when True, ``forward`` prints the batch loss on every call
        """
        super(GloVe, self).__init__()

        self.embedding_size = embedding_size
        self.x_max = x_max
        self.alpha = alpha
        self.verbose = verbose

        # Add-one smoothing keeps log(co-occurrence) finite for pairs that
        # were never observed together.
        self.co_oc = co_oc + 1.0
        self.vocabulary_size, _ = co_oc.shape

        # Tables are created and initialised one after another so that the
        # random-number consumption order is stable under a fixed seed.
        self.in_embed = self._new_table(self.embedding_size)
        self.in_bias = self._new_table(1)  # weight shape: (vocabulary_size, 1)
        self.out_embed = self._new_table(self.embedding_size)
        self.out_bias = self._new_table(1)

    def _new_table(self, width):
        """Create a (vocabulary_size, width) embedding table with Xavier-normal weights."""
        table = nn.Embedding(self.vocabulary_size, width)
        nn.init.xavier_normal_(table.weight)  # in-place initialiser
        return table

    def convert_to_index(self, in_ind, out_ind):
        """Map an unordered index pair to its position in a flattened upper triangle.

        The pair is ordered as (u, v) with u <= v; rows of the upper triangle
        (diagonal included) are laid out one after another.
        """
        u = min(in_ind, out_ind)
        v = max(in_ind, out_ind)
        # (2V - u + 1) * u is always even, so integer division is exact and
        # avoids float rounding for large vocabularies.
        return int((2 * self.vocabulary_size - u + 1) * u // 2 + v - u)

    def forward(self, batch_input, batch_output):
        """
        Return the GloVe loss averaged over the batch.

        batch_input: 1-D integer array of input-word indices
        batch_output: 1-D integer array of output-word indices (same length)
        """
        assert len(batch_input) == len(batch_output)

        batch_input = np.asarray(batch_input)
        batch_output = np.asarray(batch_output)
        batch_size = len(batch_input)

        # Vectorised lookup of the smoothed counts and their weights.
        co_occurrences = self.co_oc[batch_input, batch_output]
        weights = self._weight(co_occurrences)

        # Build every tensor on the device that holds the parameters.
        device = self.in_embed.weight.device
        co_occurrences = t.from_numpy(np.ascontiguousarray(co_occurrences)).float().to(device)
        weights = t.from_numpy(np.ascontiguousarray(weights)).float().to(device)
        in_ids = t.from_numpy(np.ascontiguousarray(batch_input)).long().to(device)
        out_ids = t.from_numpy(np.ascontiguousarray(batch_output)).long().to(device)

        input_embed = self.in_embed(in_ids)        # (batch, embedding_size)
        output_embed = self.out_embed(out_ids)     # (batch, embedding_size)
        # Flatten the (batch, 1) biases so that every term below is (batch,).
        # Adding a (batch,) dot product to a (batch, 1) bias would silently
        # broadcast to (batch, batch) and mix unrelated pairs in the loss.
        input_bias = self.in_bias(in_ids).view(-1)
        output_bias = self.out_bias(out_ids).view(-1)

        dot_product = (input_embed * output_embed).sum(1).view(-1)
        residual = dot_product + input_bias + output_bias - t.log(co_occurrences)
        loss = (weights * residual.pow(2)).sum() / batch_size

        if self.verbose:
            print(loss.item())
        return loss

    def _weight(self, x):
        """GloVe weighting: 1 above x_max, otherwise (x / x_max) ** alpha.

        Accepts a scalar or an ndarray and returns an ndarray of the same shape.
        """
        x = np.asarray(x, dtype=np.float64)
        return np.where(x > self.x_max, 1.0, (x / self.x_max) ** self.alpha)

    def embeddings(self):
        """Return the final word vectors: input plus output embeddings, as an ndarray."""
        return self.in_embed.weight.data.cpu().numpy() + self.out_embed.weight.data.cpu().numpy()
    
        

In [110]:
def get_batch(co_oc_matrix, batch_size):
    """Sample the word indices for one training batch.

    Two independent draws are made from ``0 .. len(co_oc_matrix) - 1``; each
    draw contains ``batch_size`` distinct indices (sampling without
    replacement), but the same index may appear in both draws.

    Returns the tuple ``(in_index, out_index)`` of index arrays.
    """
    candidate_ids = np.arange(len(co_oc_matrix))

    def _draw():
        # One sample of distinct indices from the global NumPy RNG.
        return np.random.choice(candidate_ids, size=batch_size, replace=False)

    in_index = _draw()
    out_index = _draw()
    return in_index, out_index
        
    

In [127]:
# Smoke-test get_batch(): with a fixed seed, draw one batch of 50 input and
# 50 output indices from the co-occurrence matrix and print both arrays.
# NOTE: relies on `comat` already existing in the notebook session; it is
# assigned by the get_co_oc_matrix('test.txt') cell.
np.random.seed(1)
in_index, out_index = get_batch(comat, 50)
print(in_index, out_index)

[ 306  888  126  486 1022  226  742  872  200  133  309  576  101  705  493
  321  553  108  216  919  181  259  528 1048  283 1056  395  385  943  730
  673   90  795   49  691  336  298  419   94  156   80  650  886  301  981
  148  455   41   65   99] [ 485  201  562  241  193  935  670 1009  510  270   31  933  432 1044  548
   93  117  126  108  960  169  203  153  253  396 1089  690  598  429  414
  980  313 1075  805 1087  205   86  668  725  716  734  834 1001  695  431
 1040  362  111  423   65]


In [119]:
context_size = 3


def get_co_oc_matrix(words_file, window=None):
    """Build a distance-weighted word co-occurrence matrix from a text file.

    The file is read, lower-cased and tokenised with ``word_tokenize``. For
    every token, each neighbour at distance ``j`` (1 <= j <= window) on
    either side adds ``1 / j`` to ``comat[token, neighbour]``.

    words_file: path of the text file to read
    window: number of neighbours considered on each side; when None, the
        module-level ``context_size`` is used

    Returns ``(comat, co_oc)``: ``comat`` is the dense
    (vocabulary, vocabulary) matrix indexed by the sorted unique tokens, and
    ``co_oc`` is an array with one ``[row, col]`` index pair per non-zero
    entry of ``comat``.
    """
    if window is None:
        window = context_size

    with open(words_file, 'r') as f:
        text = f.read().lower()
    word_list = word_tokenize(text)
    text_size = len(word_list)

    vocab = np.unique(word_list)
    vocabulary_size = len(vocab)
    word2ind = {word: ind for ind, word in enumerate(vocab)}

    # Translate the text to vocabulary indices once, rather than repeating
    # the dictionary lookup for every (token, offset) combination.
    token_ids = [word2ind[word] for word in word_list]

    comat = np.zeros((vocabulary_size, vocabulary_size))
    for i, ind in enumerate(token_ids):  # main word
        for j in range(1, window + 1):  # all the context words
            increment = 1.0 / j
            # ">= 0" so the very first token of the text is also counted as
            # a left-hand context word.
            if i - j >= 0:
                comat[ind, token_ids[i - j]] += increment
            if i + j < text_size:
                comat[ind, token_ids[i + j]] += increment

    co_oc = np.transpose(np.nonzero(comat))  # non-zero index pairs
    return comat, co_oc

In [123]:
def train_GloVe(co_oc_matrix, embeding_size, batch_size=50, iterations=1000):
    """Fit a GloVe model on a co-occurrence matrix and return its word vectors.

    co_oc_matrix: co-occurrence ndarray handed to ``GloVe`` and ``get_batch``
    embeding_size: dimensionality of the learned embeddings
    batch_size: number of index pairs sampled per optimisation step
    iterations: number of optimisation steps

    Returns whatever ``GloVe.embeddings()`` yields after training.
    """
    model = GloVe(co_oc_matrix, embeding_size)
    adagrad = optim.Adagrad(model.parameters(), lr=0.05)

    for _ in range(iterations):
        # Fresh random batch of index pairs for every step.
        centre_ids, context_ids = get_batch(co_oc_matrix, batch_size)
        batch_loss = model(centre_ids, context_ids)

        adagrad.zero_grad()
        batch_loss.backward()
        adagrad.step()

    return model.embeddings()

In [128]:
# Build the co-occurrence data from 'test.txt': `comat` is the dense
# vocabulary-by-vocabulary matrix and `co_oc` holds one (row, col) index
# pair per non-zero entry. Print both shapes as a sanity check.
comat, co_oc = get_co_oc_matrix('test.txt')
print(comat.shape)
print(co_oc.shape)

(1115, 1115)
(19772, 2)


In [None]:
# Train 5-dimensional embeddings with batches of 10 index pairs for 3000
# iterations, using the co-occurrence matrix built from 'test.txt'.
word_embeddings = train_GloVe(comat, 5, 10, 3000)