In [1]:
import numpy as np
import operator
from ours_eval import csls_knn_10_score, evaluation
from icp import ICPTrainer
import multiprocessing
import matplotlib.pyplot as plt
import time
import utils
import params
import sklearn.cluster
from sklearn.decomposition import PCA

In [9]:
def _run_icp_init(task):
    """Run a single randomly seeded ICP initialisation.

    Kept at module level (rather than nested inside ``sub_icp``) because
    ``multiprocessing.Pool`` has to pickle the worker callable, and nested
    functions cannot be pickled.

    Args:
        task: tuple ``(src_W, tgt_W, s0, i)`` - the two input matrices, the
            base seed shared by all runs and the index of this run.

    Returns:
        The tuple ``(indices_x, indices_y, rec, bb)`` returned by
        ``ICPTrainer.train_icp``.
    """
    src_W, tgt_W, s0, i = task
    # Each run gets its own seed so the restarts differ from one another.
    np.random.seed(s0 + i)
    # Work on copies so a run cannot modify the caller's matrices.
    icp = ICPTrainer(src_W.copy(), tgt_W.copy(), True, params.n_pca)
    t0 = time.time()
    indices_x, indices_y, rec, bb = icp.train_icp(params.icp_init_epochs)
    dt = time.time() - t0
    print("%d: Rec %f BB %d Time: %f" % (i, rec, bb, dt))
    return indices_x, indices_y, rec, bb


def sub_icp(src_W, tgt_W, n_icp_runs):
    """Run several ICP initialisations, then train from the best one.

    ``n_icp_runs`` independent initialisations are executed (serially when
    ``params.n_processes == 1``, otherwise in a process pool). The run with the
    lowest ``rec`` value supplies the starting indices for a final
    ``ICPTrainer`` that is trained for ``params.icp_train_epochs``.

    Args:
        src_W: source matrix handed to ``ICPTrainer``.
            # NOTE(review): the final trainer is built with ``src_W.shape[0]``
            # as its last argument, so axis 0 is presumably the embedding
            # dimension - confirm against ICPTrainer.
        tgt_W: target matrix handed to ``ICPTrainer``.
        n_icp_runs: number of initialisation runs; must be at least 1.

    Returns:
        ``(TX, TY)`` read from ``icp_train.icp`` after the final training.

    Raises:
        ValueError: if ``n_icp_runs`` is smaller than 1.
    """
    if n_icp_runs < 1:
        raise ValueError("n_icp_runs must be at least 1, got %r" % (n_icp_runs,))

    s0 = np.random.randint(50000)
    tasks = [(src_W, tgt_W, s0, i) for i in range(n_icp_runs)]

    if params.n_processes == 1:
        results = [_run_icp_init(task) for task in tasks]
    else:
        # Every run already prints its own progress line, so no progress-bar
        # wrapper is needed here. The context manager tears the pool down even
        # if a worker raises.
        # NOTE(review): when this function lives in a notebook, worker lookup
        # relies on the "fork" start method - verify on spawn-based platforms.
        with multiprocessing.Pool(processes=params.n_processes) as pool:
            results = list(pool.imap_unordered(_run_icp_init, tasks))

    # Pick the run with the smallest rec directly, without a sentinel value,
    # so a winner is always selected and matches what is printed below.
    best_idx_x, best_idx_y, min_rec, min_bb = min(results, key=lambda r: r[2])
    print("Init - Achieved: Rec %f BB %d" % (min_rec, min_bb))

    icp_train = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
    _, _, rec, bb = icp_train.train_icp(params.icp_train_epochs, True, best_idx_x, best_idx_y)
    print("Training - Achieved: Rec %f BB %d" % (rec, bb))

    TX = icp_train.icp.TX
    TY = icp_train.icp.TY
    return TX, TY

In [3]:
# Read up to params.n_eval_ex word vectors per language from
# data/wiki.<lang>.vec (the run recorded below loaded 200000 for each).
# NOTE(review): the meaning of the trailing False flag is defined by
# utils.read_txt_embeddings - confirm there.
src_id2word, src_word2id, src_embeddings = utils.read_txt_embeddings(
    'data/wiki.%s.vec' % params.src_lang,
    params.n_eval_ex,
    False,
)
tgt_id2word, tgt_word2id, tgt_embeddings = utils.read_txt_embeddings(
    'data/wiki.%s.vec' % params.tgt_lang,
    params.n_eval_ex,
    False,
)

# Row-wise L2 normalisation: every embedding row is divided by its own norm.
src_normed = src_embeddings / np.linalg.norm(
    src_embeddings, ord=2, axis=1, keepdims=True
)
tgt_normed = tgt_embeddings / np.linalg.norm(
    tgt_embeddings, ord=2, axis=1, keepdims=True
)

# Evaluation dictionaries for both translation directions, read from
# data/<lang1>-<lang2>.5000-6500.txt.
cross_dict_src2tgt = utils.load_dictionary(
    'data/%s-%s.5000-6500.txt' % (params.src_lang, params.tgt_lang),
    src_word2id,
    tgt_word2id,
)
cross_dict_tgt2src = utils.load_dictionary(
    'data/%s-%s.5000-6500.txt' % (params.tgt_lang, params.src_lang),
    tgt_word2id,
    src_word2id,
)

Loaded 200000 pre-trained word embeddings.
Loaded 200000 pre-trained word embeddings.
data/en-es.5000-6500.txt
Found 2975 pairs of words in the dictionary (1500 unique). 0 other pairs contained at least one unknown word (0 in lang1, 0 in lang2)
data/es-en.5000-6500.txt
Found 2416 pairs of words in the dictionary (1500 unique). 0 other pairs contained at least one unknown word (0 in lang1, 0 in lang2)


In [4]:
# Load the pre-saved arrays used to initialise ICP, one per language, from
# data/<lang>_<n_init_ex>.npy. The recorded run printed (5000, 300) for the
# source array.
# NOTE(review): tgt_full is presumably laid out the same way - confirm.
src_full = np.load(
    "data/%s_%d.npy" % (params.src_lang, params.n_init_ex)
)
tgt_full = np.load(
    "data/%s_%d.npy" % (params.tgt_lang, params.n_init_ex)
)
print(src_full.shape)

(5000, 300)


In [10]:
# Baseline: fit ICP on the first 2500 rows of each language (transposed before
# being handed to sub_icp) with 20 initialisation runs. An earlier variant of
# this cell used the first 5000 rows instead.
TX, TY = sub_icp(
    src_full[:2500, :].T,
    tgt_full[:2500, :].T,
    20,
)

0: Rec 6.945307 BB 348 Time: 13.663379
1: Rec 6.707236 BB 337 Time: 13.698650
2: Rec 6.947372 BB 332 Time: 13.657169
3: Rec 6.951586 BB 354 Time: 13.569556
4: Rec 6.565661 BB 365 Time: 13.584978
Init - Achieved: Rec 6.565661 BB 365
Training - Achieved: Rec 10.149690 BB 793


In [11]:
from utils import get_nn_avg_dist
def csls_knn_10_score(emb_trans, emb_tgt, dico):
    """Compute CSLS (knn = 10) scores for the source entries listed in ``dico``.

    Both embedding matrices are L2-normalised row-wise and cast to float32.
    The score of a (source, target) pair is twice their dot product, minus the
    value ``get_nn_avg_dist`` returns for the source row against the target
    embeddings, minus the value it returns for the target row against the
    translated source embeddings.

    Note: this definition shadows the ``csls_knn_10_score`` imported from
    ``ours_eval`` at the top of the notebook.

    Args:
        emb_trans: translated source embeddings, one row per source word.
        emb_tgt: target embeddings, one row per target word.
        dico: 2-D integer array; column 0 is used to index rows of
            ``emb_trans``.
            # NOTE(review): column 1 presumably holds the matching target
            # ids; it is not read here - confirm against the caller.

    Returns:
        Array with one row per ``dico`` row and one column per ``emb_tgt`` row.
    """
    def _unit_rows_f32(mat):
        # Scale every row to unit L2 norm, then cast to float32.
        row_norms = np.linalg.norm(mat, ord=2, axis=1, keepdims=True)
        return (mat / row_norms).astype('float32')

    emb_trans = _unit_rows_f32(emb_trans)
    emb_tgt = _unit_rows_f32(emb_tgt)

    # One neighbourhood term per translated source row, one per target row.
    average_dist1 = get_nn_avg_dist(emb=emb_tgt, query=emb_trans, knn=10)
    average_dist2 = get_nn_avg_dist(emb=emb_trans, query=emb_tgt, knn=10)

    src_ids = dico[:, 0]
    scores = 2 * emb_trans[src_ids].dot(emb_tgt.T)
    # In-place updates keep the dtype of ``scores`` unchanged.
    scores -= average_dist1[src_ids][:, None]  # broadcast down the columns
    scores -= average_dist2[None, :]           # broadcast across the rows

    return scores

In [12]:
# Baseline evaluation: project the source embeddings with the transpose of TX,
# score them against the target embeddings with CSLS, and report the
# precision figures on the source-to-target dictionary.
TranslatedX = src_embeddings.dot(
    np.transpose(TX)
)
scores = csls_knn_10_score(
    emb_trans=TranslatedX,
    emb_tgt=tgt_embeddings,
    dico=cross_dict_src2tgt,
)
evaluation(scores, cross_dict_src2tgt)

1500 source words -   - Precision at k = 1: 0.066667
1500 source words -   - Precision at k = 5: 0.066667
1500 source words -   - Precision at k = 10: 0.333333


[('precision_at_1', 0.06666666666666667),
 ('precision_at_5', 0.06666666666666667),
 ('precision_at_10', 0.33333333333333337)]