In [1]:
import io
import numpy as np
import torch
from torch import nn
import argparse
import time

from Dictionary import Dictionary
from Discriminator import Discriminator
from TrainModel import TrainModel
from Evaluator import Evaluator

Impossible to import Faiss library!! Switching to standard nearest neighbors search implementation, this will be significantly slower.



In [2]:
#default & settings

class parameters(object):  
    def __init__(self):
    
        self.useGPU = True        # Use GPU or not
        self.max_n_embed = 1000        # Max number of embeddings loaded, negative means load all
        self.n_epoch_adv = 5      # Number of epochs for adversial training
        self.epoch_size_adv = 1000 # Iterations per epoch for adversial training
        self.batch_size_adv = 32   # Batch size for adversial training
        self.dis_steps_adv = 5     # Discriminator steps
        self.feedback_coeff = 1    # Discriminator loss feedback coefficient
        self.map_beta = 0.001      # Beta for orthogonalization
        self.csls_k = 10           # k nearest neighbors in CSLS

param_list = parameters()

# test purpose
if param_list.useGPU:
    print param_list.n_epoch_adv

5


In [3]:
def load_embeddings(source_embedding_path, target_embedding_path, maxCount = 1e10):
    # Load embeddings    
    # read txt embeddings for English(2519370 words with 300 dim) and Chinese(332647 words with 300 dim)
    
    word2id = {}     # e.g. u'\u5e74 = year
    vectors = []
    count = 0
    with io.open(target_embedding_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            #print i,line
            #print i
            
            if param_list.max_n_embed>=0 and count>=param_list.max_n_embed:
                break
            count += 1
            if i == 0:
                split = line.split()
            else:
                word, vect = line.rstrip().split(' ', 1) #stripe space from end
                #print word #real chars

                vect = np.fromstring(vect, sep=' ')

                if np.linalg.norm(vect) == 0:  # avoid to have null embeddings
                    vect[0] = 0.001 #add a little amount...
                
                word2id[word] = count-2
                vectors.append(vect[None])
    
#     print len(vectors[0]),word2id
    print "Finished loading", count, "words..."
    id2word = {v: k for k, v in word2id.items()}  #reverse of word2id
    dic = Dictionary(id2word, word2id, "zh")
    #print "len is",dic.__len__()
    embeddings = np.concatenate(vectors, 0)
    embeddings = torch.from_numpy(embeddings).float()
    return dic, embeddings

In [4]:
# load source embedding
source_embedding_path = "data/wiki.en.vec"
target_embedding_path = "data/wiki.zh.vec"
src_dic, _src_emb = load_embeddings(source_embedding_path,source_embedding_path, 100)
src_emb = nn.Embedding(len(src_dic), 300, sparse=True) #dim is set to 300..

# load target embedding
tgt_dic, _tgt_emb = load_embeddings(target_embedding_path,target_embedding_path, 100)
tgt_emb = nn.Embedding(len(tgt_dic), 300, sparse=True)


Finished loading 1000 words...
Finished loading 1000 words...


In [5]:
# Mapping
mapping = nn.Linear(300, 300, bias=False)

In [6]:
discriminator = Discriminator()

In [7]:
discriminator.layers

Sequential(
  (0): Dropout(p=0.1)
  (1): Linear(in_features=300, out_features=2048, bias=True)
  (2): LeakyReLU(0.2)
  (3): Dropout(p=0)
  (4): Linear(in_features=2048, out_features=2048, bias=True)
  (5): LeakyReLU(0.2)
  (6): Dropout(p=0)
  (7): Linear(in_features=2048, out_features=1, bias=True)
  (8): Sigmoid()
)

In [8]:
# use gpu
if param_list.useGPU:
    src_emb.cuda()
    tgt_emb.cuda()
    mapping.cuda()
    discriminator.cuda()

In [9]:
# do not normalize embeddings
# params.src_mean = normalize_embeddings(src_emb.weight.data, "")
# params.tgt_mean = normalize_embeddings(tgt_emb.weight.data, "")

In [10]:
#Now we have these four core part cuda: src_emb.cuda(), tgt_emb.cuda(), mapping.cuda(), discriminator.cuda()

### train model initialization

In [11]:
trainer = TrainModel(src_emb, tgt_emb, mapping, discriminator, src_dic, tgt_dic, 'sgd', 0.1, param_list)
#trainer = TrainModel(1)

In [12]:
mapping.parameters

<bound method Linear.parameters of Linear(in_features=300, out_features=300, bias=False)>

In [13]:
trainer.map_optimizer.param_groups

[{'dampening': 0,
  'lr': 0.1,
  'momentum': 0,
  'nesterov': False,
  'params': [Parameter containing:
   -5.7562e-02 -5.1616e-02  3.8042e-02  ...   2.2706e-02 -2.0915e-02 -4.0955e-02
   -2.1812e-02 -2.0067e-02  4.5006e-02  ...  -1.6091e-02  1.0634e-02 -3.8151e-02
    3.0340e-02 -5.7707e-02  4.9239e-02  ...  -3.3092e-02 -1.0309e-04  3.8321e-02
                   ...                   ⋱                   ...                
    9.2728e-03  5.2941e-02  5.1151e-02  ...   1.1323e-02 -3.7980e-02 -4.6198e-02
   -1.4374e-02 -2.6682e-02 -4.7653e-02  ...   4.3119e-02 -5.5847e-02  1.4095e-02
   -3.0719e-02  4.6355e-02 -4.5438e-02  ...  -5.0887e-02  4.9836e-02  2.8728e-02
   [torch.cuda.FloatTensor of size 300x300 (GPU 0)]],
  'weight_decay': 0}]

### Evaluator initialization

In [14]:
evaluator = Evaluator(trainer)

### Unsupervised Training

In [15]:
#Adversarial Training
print('--------- ADVERSARIAL TRAINING -------\n')
#epoch_size = 1000000
for epoch in xrange(param_list.n_epoch_adv):
    print('Starting %i th epoch in adversarial training...' % epoch)
    tic = time.time()
    n_words_proc = 0
    stats = {'DIS_COSTS': []}
    for n_iter in range(0, param_list.n_epoch_adv, param_list.batch_size_adv):
        # discriminator training
        for _ in range(param_list.dis_steps_adv):
            trainer.dis_step(stats)
        # discriminator fooling
        n_words_proc += trainer.mapping_step(stats)
        #print(stats)
        
        # log stats
        if n_iter % 2000 == 0:
            stats_str = [('DIS_COSTS', 'Discriminator loss')]
            stats_log = ['%s: %.4f' % (v, np.mean(stats[k]))
                         for k, v in stats_str if len(stats[k]) > 0]
            stats_log.append('%i samples/s' % int(n_words_proc / (time.time() - tic)))
            print(('%06i - ' % n_iter) + ' - '.join(stats_log))

            # reset
            tic = time.time()
            n_words_proc = 0
            for k, _ in stats_str:
                del stats[k][:]
        evaluator.evaluate()

    #print stats

--------- ADVERSARIAL TRAINING -------

Starting 0 th epoch in adversarial training...
000000 - Discriminator loss: 0.6602 - 175 samples/s
Loaded 3 pairs of words in the dictionary (3 unique). 21594 other pairs contained at least one unknown word (21593 in lang1, 21541 in lang2)
Using nn for matching pairs
Loaded 3 pairs of words in the dictionary (3 unique). 21594 other pairs contained at least one unknown word (21593 in lang1, 21541 in lang2)
('Using CSLS with k = ', 10)
Starting 1 th epoch in adversarial training...
000000 - Discriminator loss: 0.6099 - 4458 samples/s
Loaded 3 pairs of words in the dictionary (3 unique). 21594 other pairs contained at least one unknown word (21593 in lang1, 21541 in lang2)
Using nn for matching pairs
Loaded 3 pairs of words in the dictionary (3 unique). 21594 other pairs contained at least one unknown word (21593 in lang1, 21541 in lang2)
('Using CSLS with k = ', 10)
Starting 2 th epoch in adversarial training...
000000 - Discriminator loss: 0.5698 