In [1]:
import gensim
import os
import codecs
import io
import logging
import pandas as pd 
import numpy as np


# Log output. Also useful to show program is doing things
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [12]:
# models trained using gensim implementation of word2vec
print('Loading models...')
model_source = gensim.models.Word2Vec.load('model_CBOW_jp_wzh_2.w2v')
model_target = gensim.models.Word2Vec.load('model_CBOW_en_wzh_2.w2v')

2018-05-24 09:53:45,782 : INFO : loading Word2Vec object from model_CBOW_jp_wzh_2.w2v


Loading models...


2018-05-24 09:53:46,190 : INFO : loading wv recursively from model_CBOW_jp_wzh_2.w2v.wv.* with mmap=None
2018-05-24 09:53:46,192 : INFO : setting ignored attribute syn0norm to None
2018-05-24 09:53:46,193 : INFO : setting ignored attribute cum_table to None
2018-05-24 09:53:46,194 : INFO : loaded model_CBOW_jp_wzh_2.w2v
2018-05-24 09:53:46,268 : INFO : loading Word2Vec object from model_CBOW_en_wzh_2.w2v
2018-05-24 09:53:46,567 : INFO : loading wv recursively from model_CBOW_en_wzh_2.w2v.wv.* with mmap=None
2018-05-24 09:53:46,568 : INFO : loading syn0 from model_CBOW_en_wzh_2.w2v.wv.syn0.npy with mmap=None
2018-05-24 09:53:46,591 : INFO : setting ignored attribute syn0norm to None
2018-05-24 09:53:46,592 : INFO : loading syn1neg from model_CBOW_en_wzh_2.w2v.syn1neg.npy with mmap=None
2018-05-24 09:53:46,614 : INFO : setting ignored attribute cum_table to None
2018-05-24 09:53:46,615 : INFO : loaded model_CBOW_en_wzh_2.w2v


In [13]:
sources = []
targets = []
with open("./ja-en.txt",'r',encoding='utf-8') as f:
    line = f.readline()
    while line:
        source,target = line.split()
        sources.append(source)
        targets.append(target)
        line = f.readline()

In [14]:
sources

['から',
 'として',
 '日本',
 '日本',
 'この',
 'まで',
 'まで',
 'また',
 '昭和',
 'もの',
 '大学',
 '大学',
 '大学',
 '大学',
 'テレビ',
 'テレビ',
 'テレビ',
 '放送',
 '放送',
 '東京',
 'より',
 '削除',
 '削除',
 '削除',
 '削除',
 '削除',
 '現在',
 '現在',
 '現在',
 '平成',
 '学校',
 '学校',
 'これ',
 '世界',
 'アメリカ',
 'でも',
 'について',
 '作品',
 '作品',
 '時代',
 '映画',
 '映画',
 '映画',
 '映画',
 '監督',
 '監督',
 '鉄道',
 '鉄道',
 '鉄道',
 '鉄道',
 '使用',
 '出演',
 '出演',
 '出演',
 '明治',
 '時間',
 '時間',
 '時間',
 '開始',
 '開始',
 '開始',
 'シリーズ',
 'のみ',
 '記事',
 '記事',
 '記事',
 '記事',
 '活動',
 '選手',
 '選手',
 'しかし',
 '研究',
 '研究',
 '会話',
 '会話',
 '大阪',
 '大会',
 'チーム',
 'チーム',
 '存在',
 '優勝',
 '作詞',
 '所属',
 '所属',
 '国際',
 '一部',
 '一部',
 '結果',
 '結果',
 'アルバム',
 'アルバム',
 'バス',
 'バス',
 'バス',
 'ゲーム',
 'ゲーム',
 'ゲーム',
 '問題',
 '問題',
 '問題',
 '問題',
 'リーグ',
 'リーグ',
 '利用',
 '開発',
 '事業',
 '決定',
 '決定',
 '音楽',
 '転送',
 '転送',
 '転送',
 '転送',
 '情報',
 '情報',
 '公式',
 '公式',
 '参加',
 '参加',
 '地域',
 '地域',
 '変更',
 '変更',
 '変更',
 '編曲',
 '編曲',
 '編曲',
 'サイト',
 'サイト',
 '丁目',
 '記録',
 '記録',
 '関係',
 '関係',
 'クラブ',
 'クラブ',
 '中央',
 '中央',
 '中央',
 

In [15]:
targets

['from',
 'as',
 'japan',
 'nippon',
 'this',
 'until',
 'till',
 'also',
 'showa',
 'things',
 'universities',
 'university',
 'college',
 'colleges',
 'tv',
 'televisions',
 'television',
 'broadcast',
 'broadcasting',
 'tokyo',
 'than',
 'deleted',
 'deleting',
 'deletion',
 'remove',
 'delete',
 'current',
 'currently',
 'present',
 'heisei',
 'schools',
 'school',
 'this',
 'world',
 'america',
 'but',
 'about',
 'works',
 'discography',
 'era',
 'movie',
 'cinema',
 'film',
 'movies',
 'director',
 'supervision',
 'railroads',
 'railroad',
 'railways',
 'railway',
 'use',
 'appearances',
 'appearance',
 'cast',
 'meiji',
 'hours',
 'hour',
 'time',
 'begin',
 'start',
 'initiation',
 'series',
 'only',
 'article',
 'story',
 'stories',
 'articles',
 'activities',
 'players',
 'athletes',
 'but',
 'studies',
 'research',
 'conversation',
 'conversations',
 'osaka',
 'convention',
 'teams',
 'team',
 'existence',
 'winning',
 'lyrics',
 'affiliation',
 'belong',
 'international',
 

In [16]:
pairs = pd.DataFrame()
pairs['source'] = pd.Series(sources)
pairs['target'] = pd.Series(targets)

In [17]:
pairs

Unnamed: 0,source,target
0,から,from
1,として,as
2,日本,japan
3,日本,nippon
4,この,this
5,まで,until
6,まで,till
7,また,also
8,昭和,showa
9,もの,things


In [18]:
print('Removing missing vocabulary...')

missing = 0

for n in range (len(pairs)):
    if pairs['source'][n] not in model_source.wv.vocab or pairs['target'][n] not in model_target.wv.vocab:
        missing = missing + 1
        pairs = pairs.drop(n)

pairs = pairs.reset_index(drop = True)
print ('Amount of missing vocab: ', missing)

Removing missing vocabulary...
Amount of missing vocab:  17016


In [19]:
# make list of pair words, excluding the missing vocabs 
# removed in previous step
pairs['vector_source'] = [model_source[pairs['source'][n]] for n in range (len(pairs))]
pairs['vector_target'] = [model_target[pairs['target'][n]] for n in range (len(pairs))]

# first 5000 from both languages, to train translation matrix
source_training_set = pairs['vector_source'][:10000]
target_training_set = pairs['vector_target'][:10000]

matrix_train_source = pd.DataFrame(source_training_set.tolist()).values
matrix_train_target = pd.DataFrame(target_training_set.tolist()).values


In [20]:
matrix_train_source.shape

(8953, 100)

In [21]:
print ('Generating translation matrix')
# Matrix W is given in  http://stackoverflow.com/questions/27980159/fit-a-linear-transformation-in-python
translation_matrix = np.linalg.pinv(matrix_train_source).dot(matrix_train_target).T
print ('Generated translation matrix')
translation_matrix.shape

Generating translation matrix
Generated translation matrix


(100, 100)

In [12]:
# Returns list of topn closest vectors to vectenter
def most_similar_vector(self, vectenter, topn=5):
    self.init_sims()
    dists = np.dot(self.wv.syn0norm, vectenter)
    if not topn:
        return dists
    best = np.argsort(dists)[::-1][:topn ]
        # ignore (don't return) words from the input
    result = [(self.wv.index2word[sim], float(dists[sim])) for sim in best]
    return result[:topn]

def top_translations(w,numb=5):
    val = most_similar_vector(model_target,translation_matrix.dot(model_source[w]),numb)
    #print 'traducwithscofres ', val
    return val


def top_translations_list(w, numb=5):
    val = [top_translations(w,numb)[k][0] for k in range(numb)]
    return val

temp = 1
#top_matches = [ pairs['target'][n] in top_translations_list(pairs['source'][n]) for n in range(5000,5003)] 

# print out source word and translation
def display_translations():
    for word_num in range(range_start, range_end):
        source_word =  pairs['source'][word_num]
        translations = top_translations_list(pairs['source'][word_num]) 
        print (source_word, translations)


In [14]:
# range to use to check accuracy
range_start = 7000
range_end = 8000

#display_translations()

# now we can check for accuracy on words 5000-6000, 1-5000 used to traning
# translation matrix

# returns matrix of true or false, true if translation is accuracy, false if not
# accurate means the first translation (most similiar vector in target language)
# is identical
accuracy_at_five = [pairs['target'][n] in top_translations_list(pairs['source'][n]) for n in range(range_start, range_end)]
print ('Accuracy @5 is ', sum(accuracy_at_five), '/', len(accuracy_at_five))

accuracy_at_one = [pairs['target'][n] in top_translations_list(pairs['source'][n],1) for n in range(range_start, range_end)]
print ('Accuracy @1 is ', sum(accuracy_at_one), '/', len(accuracy_at_one))


Accuracy @5 is  34 / 1000
Accuracy @1 is  16 / 1000


In [15]:
52/22

2.3636363636363638

In [16]:
accuracy_at_ten = [pairs['target'][n] in top_translations_list(pairs['source'][n],10) for n in range(range_start, range_end)]
print ('Accuracy @10 is ', sum(accuracy_at_ten), '/', len(accuracy_at_ten))

Accuracy @10 is  42 / 1000


In [17]:
accuracy_at_ten = [pairs['target'][n] in top_translations_list(pairs['source'][n],20) for n in range(range_start, range_end)]
print ('Accuracy @20 is ', sum(accuracy_at_ten), '/', len(accuracy_at_ten))

Accuracy @20 is  45 / 1000


In [18]:
np.save("jp_en.npy",translation_matrix)

In [19]:
# models trained using gensim implementation of word2vec
print('Loading models...')
model_source = gensim.models.Word2Vec.load('model_CBOW_zh_wzh_2.w2v')
model_target = gensim.models.Word2Vec.load('model_CBOW_en_wzh_2.w2v')

2018-05-15 17:51:51,891 : INFO : loading Word2Vec object from model_CBOW_zh_wzh_2.w2v


Loading models...


2018-05-15 17:51:52,957 : INFO : loading wv recursively from model_CBOW_zh_wzh_2.w2v.wv.* with mmap=None
2018-05-15 17:51:52,958 : INFO : setting ignored attribute syn0norm to None
2018-05-15 17:51:52,959 : INFO : setting ignored attribute cum_table to None
2018-05-15 17:51:52,959 : INFO : loaded model_CBOW_zh_wzh_2.w2v
2018-05-15 17:51:53,332 : INFO : loading Word2Vec object from model_CBOW_en_wzh_2.w2v
2018-05-15 17:51:53,578 : INFO : loading wv recursively from model_CBOW_en_wzh_2.w2v.wv.* with mmap=None
2018-05-15 17:51:53,579 : INFO : loading syn0 from model_CBOW_en_wzh_2.w2v.wv.syn0.npy with mmap=None
2018-05-15 17:51:53,688 : INFO : setting ignored attribute syn0norm to None
2018-05-15 17:51:53,689 : INFO : loading syn1neg from model_CBOW_en_wzh_2.w2v.syn1neg.npy with mmap=None
2018-05-15 17:51:53,804 : INFO : setting ignored attribute cum_table to None
2018-05-15 17:51:53,805 : INFO : loaded model_CBOW_en_wzh_2.w2v


In [20]:
sources = []
targets = []
with open("./zh-en.txt",'r',encoding='utf-8') as f:
    line = f.readline()
    while line:
        source,target = line.split()
        sources.append(source)
        targets.append(target)
        line = f.readline()

In [21]:
pairs = pd.DataFrame()
pairs['source'] = pd.Series(sources)
pairs['target'] = pd.Series(targets)

In [22]:
pairs

Unnamed: 0,source,target
0,年,year
1,月,moon
2,月,months
3,月,month
4,日,day
5,和,and
6,村,village
7,人,man
8,人,people
9,%,%


In [23]:
print('Removing missing vocabulary...')

missing = 0

for n in range (len(pairs)):
    if pairs['source'][n] not in model_source.wv.vocab or pairs['target'][n] not in model_target.wv.vocab:
        missing = missing + 1
        pairs = pairs.drop(n)

pairs = pairs.reset_index(drop = True)
print ('Amount of missing vocab: ', missing)

Removing missing vocabulary...
Amount of missing vocab:  10349


In [24]:
# make list of pair words, excluding the missing vocabs 
# removed in previous step
pairs['vector_source'] = [model_source[pairs['source'][n]] for n in range (len(pairs))]
pairs['vector_target'] = [model_target[pairs['target'][n]] for n in range (len(pairs))]

# first 5000 from both languages, to train translation matrix
source_training_set = pairs['vector_source'][:10000]
target_training_set = pairs['vector_target'][:10000]

matrix_train_source = pd.DataFrame(source_training_set.tolist()).values
matrix_train_target = pd.DataFrame(target_training_set.tolist()).values


In [25]:
matrix_train_source.shape

(10000, 100)

In [26]:
print ('Generating translation matrix')
# Matrix W is given in  http://stackoverflow.com/questions/27980159/fit-a-linear-transformation-in-python
translation_matrix = np.linalg.pinv(matrix_train_source).dot(matrix_train_target).T
print ('Generated translation matrix')
translation_matrix.shape

Generating translation matrix
Generated translation matrix


(100, 100)

In [27]:
# range to use to check accuracy
range_start = 3000
range_end = 4000

#display_translations()

# now we can check for accuracy on words 5000-6000, 1-5000 used to traning
# translation matrix

# returns matrix of true or false, true if translation is accuracy, false if not
# accurate means the first translation (most similiar vector in target language)
# is identical
accuracy_at_five = [pairs['target'][n] in top_translations_list(pairs['source'][n]) for n in range(range_start, range_end)]
print ('Accuracy @5 is ', sum(accuracy_at_five), '/', len(accuracy_at_five))

accuracy_at_one = [pairs['target'][n] in top_translations_list(pairs['source'][n],1) for n in range(range_start, range_end)]
print ('Accuracy @1 is ', sum(accuracy_at_one), '/', len(accuracy_at_one))

2018-05-15 17:52:07,799 : INFO : precomputing L2-norms of word weight vectors


Accuracy @5 is  81 / 1000
Accuracy @1 is  42 / 1000


In [28]:
np.save("zh_en.npy",translation_matrix)

In [23]:
#models trained using gensim implementation of word2vec
print('Loading models...')
model_source = gensim.models.Word2Vec.load('model_CBOW_jp_200_wzh.w2v')
model_target = gensim.models.Word2Vec.load('model_CBOW_en_200_wzh.w2v')

2018-05-24 10:10:07,751 : INFO : loading Word2Vec object from model_CBOW_jp_200_wzh.w2v


Loading models...


2018-05-24 10:10:08,139 : INFO : loading wv recursively from model_CBOW_jp_200_wzh.w2v.wv.* with mmap=None
2018-05-24 10:10:08,140 : INFO : setting ignored attribute syn0norm to None
2018-05-24 10:10:08,141 : INFO : setting ignored attribute cum_table to None
2018-05-24 10:10:08,142 : INFO : loaded model_CBOW_jp_200_wzh.w2v
2018-05-24 10:10:08,185 : INFO : loading Word2Vec object from model_CBOW_en_200_wzh.w2v
2018-05-24 10:10:08,632 : INFO : loading wv recursively from model_CBOW_en_200_wzh.w2v.wv.* with mmap=None
2018-05-24 10:10:08,634 : INFO : setting ignored attribute syn0norm to None
2018-05-24 10:10:08,635 : INFO : setting ignored attribute cum_table to None
2018-05-24 10:10:08,636 : INFO : loaded model_CBOW_en_200_wzh.w2v


In [24]:
sources = []
targets = []
with open("./ja-en.txt",'r',encoding='utf-8') as f:
    line = f.readline()
    while line:
        source,target = line.split()
        sources.append(source)
        targets.append(target)
        line = f.readline()

In [25]:
pairs = pd.DataFrame()
pairs['source'] = pd.Series(sources)
pairs['target'] = pd.Series(targets)

In [26]:
print('Removing missing vocabulary...')

missing = 0

for n in range (len(pairs)):
    if pairs['source'][n] not in model_source.wv.vocab or pairs['target'][n] not in model_target.wv.vocab:
        missing = missing + 1
        pairs = pairs.drop(n)

pairs = pairs.reset_index(drop = True)
print ('Amount of missing vocab: ', missing)

Removing missing vocabulary...
Amount of missing vocab:  20113


In [27]:
# make list of pair words, excluding the missing vocabs 
# removed in previous step
pairs['vector_source'] = [model_source[pairs['source'][n]] for n in range (len(pairs))]
pairs['vector_target'] = [model_target[pairs['target'][n]] for n in range (len(pairs))]

# first 5000 from both languages, to train translation matrix
source_training_set = pairs['vector_source'][:10000]
target_training_set = pairs['vector_target'][:10000]

matrix_train_source = pd.DataFrame(source_training_set.tolist()).values
matrix_train_target = pd.DataFrame(target_training_set.tolist()).values


In [28]:
matrix_train_source.shape

(5856, 200)

In [29]:
print ('Generating translation matrix')
# Matrix W is given in  http://stackoverflow.com/questions/27980159/fit-a-linear-transformation-in-python
translation_matrix = np.linalg.pinv(matrix_train_source).dot(matrix_train_target).T
print ('Generated translation matrix')
translation_matrix.shape

Generating translation matrix
Generated translation matrix


(200, 200)

In [30]:
# Returns list of topn closest vectors to vectenter
def most_similar_vector(self, vectenter, topn=5):
    self.init_sims()
    dists = np.dot(self.wv.syn0norm, vectenter)
    if not topn:
        return dists
    best = np.argsort(dists)[::-1][:topn ]
        # ignore (don't return) words from the input
    result = [(self.wv.index2word[sim], float(dists[sim])) for sim in best]
    return result[:topn]

def top_translations(w,numb=5):
    val = most_similar_vector(model_target,translation_matrix.dot(model_source[w]),numb)
    #print 'traducwithscofres ', val
    return val


def top_translations_list(w, numb=5):
    val = [top_translations(w,numb)[k][0] for k in range(numb)]
    return val

temp = 1
#top_matches = [ pairs['target'][n] in top_translations_list(pairs['source'][n]) for n in range(5000,5003)] 

# print out source word and translation
def display_translations():
    for word_num in range(range_start, range_end):
        source_word =  pairs['source'][word_num]
        translations = top_translations_list(pairs['source'][word_num]) 
        print (source_word, translations)


In [31]:
# range to use to check accuracy
range_start = 5000
range_end = 5800

#display_translations()

# now we can check for accuracy on words 5000-6000, 1-5000 used to traning
# translation matrix

# returns matrix of true or false, true if translation is accuracy, false if not
# accurate means the first translation (most similiar vector in target language)
# is identical
accuracy_at_five = [pairs['target'][n] in top_translations_list(pairs['source'][n]) for n in range(range_start, range_end)]
print ('Accuracy @5 is ', sum(accuracy_at_five), '/', len(accuracy_at_five))

accuracy_at_one = [pairs['target'][n] in top_translations_list(pairs['source'][n],1) for n in range(range_start, range_end)]
print ('Accuracy @1 is ', sum(accuracy_at_one), '/', len(accuracy_at_one))


2018-05-24 10:10:34,059 : INFO : precomputing L2-norms of word weight vectors


Accuracy @5 is  113 / 800
Accuracy @1 is  68 / 800


In [32]:
np.save("jp_en_200.npy",translation_matrix)

In [33]:
# models trained using gensim implementation of word2vec
print('Loading models...')
model_source = gensim.models.Word2Vec.load('model_CBOW_zh_200_wzh.w2v')
model_target = gensim.models.Word2Vec.load('model_CBOW_en_200_wzh.w2v')

2018-05-24 11:00:28,858 : INFO : loading Word2Vec object from model_CBOW_zh_200_wzh.w2v


Loading models...


2018-05-24 11:00:29,478 : INFO : loading wv recursively from model_CBOW_zh_200_wzh.w2v.wv.* with mmap=None
2018-05-24 11:00:29,479 : INFO : setting ignored attribute syn0norm to None
2018-05-24 11:00:29,481 : INFO : setting ignored attribute cum_table to None
2018-05-24 11:00:29,482 : INFO : loaded model_CBOW_zh_200_wzh.w2v
2018-05-24 11:00:29,552 : INFO : loading Word2Vec object from model_CBOW_en_200_wzh.w2v
2018-05-24 11:00:29,987 : INFO : loading wv recursively from model_CBOW_en_200_wzh.w2v.wv.* with mmap=None
2018-05-24 11:00:29,988 : INFO : setting ignored attribute syn0norm to None
2018-05-24 11:00:29,989 : INFO : setting ignored attribute cum_table to None
2018-05-24 11:00:29,990 : INFO : loaded model_CBOW_en_200_wzh.w2v


In [34]:
pairs = pd.DataFrame()
pairs['source'] = pd.Series(sources)
pairs['target'] = pd.Series(targets)

In [35]:
print('Removing missing vocabulary...')

missing = 0

for n in range (len(pairs)):
    if pairs['source'][n] not in model_source.wv.vocab or pairs['target'][n] not in model_target.wv.vocab:
        missing = missing + 1
        pairs = pairs.drop(n)

pairs = pairs.reset_index(drop = True)
print ('Amount of missing vocab: ', missing)

Removing missing vocabulary...
Amount of missing vocab:  24454


In [36]:
# make list of pair words, excluding the missing vocabs 
# removed in previous step
pairs['vector_source'] = [model_source[pairs['source'][n]] for n in range (len(pairs))]
pairs['vector_target'] = [model_target[pairs['target'][n]] for n in range (len(pairs))]

# first 5000 from both languages, to train translation matrix
source_training_set = pairs['vector_source'][:10000]
target_training_set = pairs['vector_target'][:10000]

matrix_train_source = pd.DataFrame(source_training_set.tolist()).values
matrix_train_target = pd.DataFrame(target_training_set.tolist()).values

In [38]:
matrix_train_source.shape

(1515, 200)

In [39]:
print ('Generating translation matrix')
# Matrix W is given in  http://stackoverflow.com/questions/27980159/fit-a-linear-transformation-in-python
translation_matrix = np.linalg.pinv(matrix_train_source).dot(matrix_train_target).T
print ('Generated translation matrix')
translation_matrix.shape

Generating translation matrix
Generated translation matrix


(200, 200)

In [37]:
# Returns list of topn closest vectors to vectenter
def most_similar_vector(self, vectenter, topn=5):
    self.init_sims()
    dists = np.dot(self.wv.syn0norm, vectenter)
    if not topn:
        return dists
    best = np.argsort(dists)[::-1][:topn ]
        # ignore (don't return) words from the input
    result = [(self.wv.index2word[sim], float(dists[sim])) for sim in best]
    return result[:topn]

def top_translations(w,numb=5):
    val = most_similar_vector(model_target,translation_matrix.dot(model_source[w]),numb)
    #print 'traducwithscofres ', val
    return val


def top_translations_list(w, numb=5):
    val = [top_translations(w,numb)[k][0] for k in range(numb)]
    return val

temp = 1
#top_matches = [ pairs['target'][n] in top_translations_list(pairs['source'][n]) for n in range(5000,5003)] 

# print out source word and translation
def display_translations():
    for word_num in range(range_start, range_end):
        source_word =  pairs['source'][word_num]
        translations = top_translations_list(pairs['source'][word_num]) 
        print (source_word, translations)


In [41]:
# range to use to check accuracy
range_start = 1400
range_end = 1500

#display_translations()

# now we can check for accuracy on words 5000-6000, 1-5000 used to traning
# translation matrix

# returns matrix of true or false, true if translation is accuracy, false if not
# accurate means the first translation (most similiar vector in target language)
# is identical
accuracy_at_five = [pairs['target'][n] in top_translations_list(pairs['source'][n]) for n in range(range_start, range_end)]
print ('Accuracy @5 is ', sum(accuracy_at_five), '/', len(accuracy_at_five))

accuracy_at_one = [pairs['target'][n] in top_translations_list(pairs['source'][n],1) for n in range(range_start, range_end)]
print ('Accuracy @1 is ', sum(accuracy_at_one), '/', len(accuracy_at_one))


Accuracy @5 is  12 / 100
Accuracy @1 is  11 / 100


In [42]:
np.save("zh_en_200.npy",translation_matrix)