In [1]:
import numpy as np
import pandas as pd
from gensim import corpora, models, similarities
from gensim.models import word2vec

from sklearn import datasets
from sklearn import linear_model
from sklearn import svm
from sklearn import preprocessing
import random
import pickle
import time

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


import torch
from torch import nn
from torch.autograd import Variable
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import torch.nn.functional as F

import matplotlib.pyplot as plt
%matplotlib inline

import os

In [2]:
base_path = "/home/R2016hwang/research/"
model_name_en = base_path + "word2vec/model_CBOW_en_200_wzh.w2v"
model_name_jp = base_path + "word2vec/model_CBOW_jp_200_wzh.w2v"
model_name_zh = base_path + "word2vec/model_CBOW_zh_200_wzh.w2v"

model_en = word2vec.Word2Vec.load(model_name_en)
model_jp = word2vec.Word2Vec.load(model_name_jp)
model_zh = word2vec.Word2Vec.load(model_name_zh)

trans_jp_en = np.load("word2vec/jp_en_200.npy")
trans_zh_en = np.load("word2vec/zh_en_200.npy")

datapath = base_path + "data_prepare/cleaned_jp_en_zh.csv"

2018-05-25 05:13:22,598 : INFO : loading Word2Vec object from /home/R2016hwang/research/word2vec/model_CBOW_en_200_wzh.w2v
2018-05-25 05:13:23,113 : INFO : loading wv recursively from /home/R2016hwang/research/word2vec/model_CBOW_en_200_wzh.w2v.wv.* with mmap=None
2018-05-25 05:13:23,114 : INFO : setting ignored attribute syn0norm to None
2018-05-25 05:13:23,115 : INFO : setting ignored attribute cum_table to None
2018-05-25 05:13:23,116 : INFO : loaded /home/R2016hwang/research/word2vec/model_CBOW_en_200_wzh.w2v
2018-05-25 05:13:23,184 : INFO : loading Word2Vec object from /home/R2016hwang/research/word2vec/model_CBOW_jp_200_wzh.w2v
2018-05-25 05:13:23,476 : INFO : loading wv recursively from /home/R2016hwang/research/word2vec/model_CBOW_jp_200_wzh.w2v.wv.* with mmap=None
2018-05-25 05:13:23,477 : INFO : setting ignored attribute syn0norm to None
2018-05-25 05:13:23,478 : INFO : setting ignored attribute cum_table to None
2018-05-25 05:13:23,479 : INFO : loaded /home/R2016hwang/resear

In [3]:
maxlen = 20 # Default: 0 -> infinite
epoch = 10
dim_lstm = 200
dim_1 = 800
# dim_2 = 100
# dim_3 = 50
dropout_rate = 0.0
bias_y = 0
loss_function = "mse"
mode = "reg" # reg, binary
rnn_type = "bi-lstm" # lstm, bi-lstm
bi_lstm_mode = "sum" #concat, sum
print("maxlen", maxlen, "epoch", epoch, "dim_lstm", dim_lstm)
print("dim_Dense", dim_1)
print("dropout_rate", dropout_rate, ", LSTM type:", rnn_type, bi_lstm_mode)
p_activation = ["relu", "relu", "relu"]
print("Activation function:", p_activation)
print("bias of y:", bias_y)
print("loss_function:", loss_function)
start = 0
step = 10
print("start:", start, "end:", )
print("------------------------------")
random.seed(1234)

maxlen 20 epoch 10 dim_lstm 200
dim_Dense 800
dropout_rate 0.0 , LSTM type: bi-lstm sum
Activation function: ['relu', 'relu', 'relu']
bias of y: 0
loss_function: mse
start: 0 end:
------------------------------


In [4]:
torch.manual_seed(1)    # reproducible

# Hyper Parameters
EPOCH = 1          # 训练整批数据多少次
BATCH_SIZE = 200
TIME_STEP = 8      # rnn 时间步数 / 图片高度
INPUT_SIZE = 200     # rnn 每步输入值 / 图片每行像素
HIDDEN_SIZE = 200
BIDIRECTION = False
LR = 0.01           # learning rate
DROP_OUT = 0.0

In [5]:
def prepare_train(dir_en_jp, second_language="jp", start=None, end=None):

    # df_en_mapping = pd.read_csv(dir_en)
    # df_jp_mapping = pd.read_csv(dir_jp)
    second_article = second_language+"_article"
    
    df_en_jp = pd.read_csv(dir_en_jp,
                              names=["HEADLINE_ALERT_TEXT_x","HEADLINE_ALERT_TEXT_y", "HEADLINE_ALERT_TEXT","jp_article","en_article","zh_article"],
                              header=0)
    df_en_mapping = df_en_jp[["en_article"]].iloc[start:end]
    df_jp_mapping = df_en_jp[[second_article]].iloc[start:end]

    print("Reading English Data:", len(df_en_mapping))
    print("Reading "+second_language+" Data:", len(df_jp_mapping))

    sample_size = len(df_en_mapping)

    assert len(df_en_mapping) == len(df_jp_mapping)

    # Convert mapping to list type and then concat to the a list
    print("Merging the English and Japanes news dataframe...")
    df_train_1 = pd.concat([df_en_mapping, df_jp_mapping], axis = 1)
    df_train_1['similarity'] = pd.Series(np.ones(sample_size,)*5)
    df_train_1['dis_similarity'] = pd.Series(np.ones(sample_size,)*1)

    # Remove null line
    print("Drop the null line...")
    # df_train_1 = df_train_1.dropna(subset=['en_article'])
    df_train_1 = df_train_1[df_train_1['en_article'] != '<NULL>']

    # Expand the training data
    en_article_wrong = df_train_1.en_article.iloc[random.sample(range(len(df_train_1)),len(df_train_1))]
    en_article_wrong.index = df_train_1.index
    print((en_article_wrong == df_train_1.en_article).value_counts())
    df_train_1['en_article_wrong'] = en_article_wrong

    # Convert dateframe to list
    train_1 = df_train_1[['en_article',second_article,'similarity']].values.tolist()
    train_2 = df_train_1[['en_article_wrong',second_article,'dis_similarity']].values.tolist()

    return train_1, train_2, df_train_1

In [6]:
def padding(sequence, maxlen=maxlen, padding_value=0.0):
    np_sequance = np.array(sequence)
    #print(np_sequance.shape)
    if np_sequance.shape[0] == 0:
        #return float('nan')
        print("长度为零")
        #return float('nan')
        return np.zeros((maxlen, INPUT_SIZE))
    if np_sequance.shape[0] < maxlen:
        z = np.zeros((maxlen, INPUT_SIZE))
    #    print(z.shape)
        z[:np_sequance.shape[0], :np_sequance.shape[1]] = np_sequance
    else:
        z = np_sequance[:maxlen, :]
    return z


In [7]:
def find_ranking_batch(projection1, projection2, dlmodel, batch=10):
    sim_results = []
    rank_results = []
    sample_length = len(projection2)

    # Iterate each of the ariticle from projection1 (999) as proj1
    # Calculate the simialrity of proj1 with all ariticles in projection2 (999)
    # for i, proj1 in enumerate(projection1):
    for i in range(0, sample_length, batch):
        print("Find answer for doc.", i, i+batch)
        proj1 = projection1[i:i+batch]

        proj1_tile = np.repeat(proj1, sample_length, axis=0)
        proj2_tile = np.tile(projection2, (batch,1,1))
        print(proj1_tile.shape)
        print(proj2_tile.shape)

        # For each batch, we should tile each of the element
        sim = dlmodel.predict([proj1_tile, proj2_tile])[:,0]
        for j in range(0, sample_length, batch):
            rank = pd.Series(sim[j:j+sample_length]).rank(ascending = False)[i]
            sim_results.append(sim)
            rank_results.append(rank)

    # sim_results contains 999*999 similairty matrix
    return sim_results, rank_results

def find_ranking(projection1, projection2, dlmodel):
    sim_results = []
    rank_results = []

    # Iterate each of the ariticle from projection1 (999) as proj1
    # Calculate the simialrity of proj1 with all ariticles in projection2 (999)
    for i, proj1 in enumerate(projection1):
        print("Find answer for doc.", i)
        proj1_tile = np.tile(proj1, (len(projection2), 1, 1))
        sim = dlmodel.predict([proj1_tile, projection2])[:,0]
        rank = pd.Series(sim).rank(ascending = False)[i]
        sim_results.append(sim)
        rank_results.append(rank)

    # sim_results contains 999*999 similairty matrix
    return sim_results, rank_results


In [8]:
def find_top(rank_results, top):
    s = pd.Series(rank_results)
    n_top = (s <= top).sum()
    return n_top


def average_docment(document_embedding):
    return np.average(document_embedding, axis=0)

def sum_docment(document_embedding):
    return np.sum(document_embedding, axis=0)

In [9]:
def doc2embed(doc,model,translation_matrix=None):
    # r = [model_en[token] for token in doc.split()]
    r = []
    r_failed = []

    
    for token in str(doc).split(' '):
        if token in model:
            if type(translation_matrix) != type(None) :
                #print("translation_matrix ",translation_matrix.shape)
                #print(np.array(model[token]).shape)
                r.append(np.array(model[token]).dot(translation_matrix))
            else:
                r.append(model[token])
        else:
            r_failed.append(token)

    # if len(r_failed) != 0:
    #     print " ".join(r_failed)
    #print("Failed Number",len(r_failed))
    return r


In [10]:
pairs_correct_enjp, pairs_wrong_enjp, df_pairs_enjp = prepare_train(datapath,second_language='jp')

Reading English Data: 81283
Reading jp Data: 81283
Merging the English and Japanes news dataframe...
Drop the null line...
False    80968
True       315
Name: en_article, dtype: int64


In [11]:
df_pairs_enjp

Unnamed: 0,en_article,jp_article,similarity,dis_similarity,en_article_wrong
0,china slow mid-term prospect bright-fitch,中国 経済成長 やや 減速 中期 見通し 良好 フィッチ,5.0,1.0,ifr-us corp bonds-ig midday bank get jump eUSu...
1,US stock open low ahead greenspan remark,米国 株式市場 序盤 小幅安 グリーンスパン 議長 証言 控え,5.0,1.0,refile-ecb plan extra bln 28-day dollar refina...
2,US 3-month bill high rate pct,表 米 落札 結果 最高 落札 金利 3か月 6か月,5.0,1.0,instant view-analysts comment ecb news confer...
3,US 3-month bill high rate pct,表 米 落札 結果 最高 落札 金利 3か月 6か月,5.0,1.0,citi merrill mull same ceo candidates-cnbc
4,US 3-month bill high rate pct,表 米 落札 結果 最高 落札 金利 3か月 6か月,5.0,1.0,seoul share hover 4-week low samsung elec fall
5,US stock open slightly high ahead fed,米国 株式市場 序盤 小 反発 FOMC 結果 見守る,5.0,1.0,treasury paulson lead china group december
6,rpt-US stock open slightly high ahead fed,米国 株式市場 序盤 小 反発 FOMC 結果 見守る,5.0,1.0,analysis-europe should focus yuan not dollar
7,blair say worry oil want low price,原油価格 問題 議題 なる 可能性 英 首相,5.0,1.0,inflation expectation have rise -fed lacker
8,s korea fin min see gdp 4-5 pct report,今年 下期 韓国 GDP 成長率 見通し 財政 経済,5.0,1.0,genzyme sanofi odds threshold price wsj
9,s korea fin min see gdp 4-5 pct report,再送 今年 下期 韓国 GDP 成長率 見通し 財政 経済,5.0,1.0,south korean win hit 8-week high v dollar


In [12]:
pairs_correct_enzh, pairs_wrong_enzh, df_pairs_enzh = prepare_train(datapath,second_language='zh')

Reading English Data: 81283
Reading zh Data: 81283
Merging the English and Japanes news dataframe...
Drop the null line...
False    80959
True       324
Name: en_article, dtype: int64


In [13]:
df_pairs_enzh

Unnamed: 0,en_article,zh_article,similarity,dis_similarity,en_article_wrong
0,china slow mid-term prospect bright-fitch,重發 中國 經濟 惠譽 預計 GDP 增速 今明兩年 放緩 至 與 但 中期 前景 看好,5.0,1.0,exclusive-germany push libor probe deutsche ba...
1,US stock open low ahead greenspan remark,美國股市 微幅 開 低 市場 等待 格老 證詞 演說,5.0,1.0,france eye more mere growth deal eUSummit-source
2,US 3-month bill high rate pct,美國 債市 三個 月期 國庫券 標售 得標 利率 為 投標 倍數 為,5.0,1.0,opec considers third supply cut defend oil price
3,US 3-month bill high rate pct,美國 債市 三個 月期 國庫券 標售 得標 利率 為 投標 倍數 為,5.0,1.0,mortgages/agencies-spreads mixed freddie bill ...
4,US 3-month bill high rate pct,美國 債市 三個 月期 國庫券 標售 得標 利率 為 投標 倍數 為,5.0,1.0,australia share spring back up pct bank miner
5,US stock open slightly high ahead fed,美國股市 開盤 微幅 走高 市場 等待 FED 利率 聲明,5.0,1.0,table-nz current account deficit nz bln
6,rpt-US stock open slightly high ahead fed,美國股市 開盤 微幅 走高 市場 等待 FED 利率 聲明,5.0,1.0,moody downgrade credit agricole socgen rating...
7,blair say worry oil want low price,國際 油市 布萊爾 稱 G 領導人 擔憂 油價 希望 各國 為 抑制 油價 作出努力,5.0,1.0,bofa see brent average
8,s korea fin min see gdp 4-5 pct report,韓國 經濟 財長 預期 下半年 GDP 較 上年 同期 成長 報導,5.0,1.0,wrapup 8-china storm hamper rescue quake toll
9,s korea fin min see gdp 4-5 pct report,韓國 經濟 財長 預期 下半年 GDP 較 上年 同期 成長 報導,5.0,1.0,german gdp growth could top pct year-diw


In [14]:
print("Using the new test data to evaluate.......")
df_pairs_evaluate = df_pairs_enjp.iloc[50000:55000:5]

df_pairs_evaluate['word2vec_en'] = df_pairs_evaluate['en_article'].apply(doc2embed,args=(model_en,))
df_pairs_evaluate['word2vec_en_wrong'] = df_pairs_evaluate['en_article_wrong'].apply(doc2embed,args=(model_en,))
df_pairs_evaluate['word2vec_jp'] = df_pairs_evaluate['jp_article'].apply(doc2embed,args=(model_jp,trans_jp_en))


df_pairs_evaluate['padding_en'] = df_pairs_evaluate['word2vec_en'].apply(padding, args=(TIME_STEP,))
df_pairs_evaluate['padding_en_wrong'] = df_pairs_evaluate['word2vec_en_wrong'].apply(padding, args=(TIME_STEP,))
df_pairs_evaluate['padding_jp'] = df_pairs_evaluate['word2vec_jp'].apply(padding, args=(TIME_STEP,))

df_pairs_evaluate.dropna(axis=0, how='all')


features_en_new = np.stack(df_pairs_evaluate["padding_en"].values)
features_jp_new = np.stack(df_pairs_evaluate["padding_jp"].values)
features_en_new_wrong = np.stack(df_pairs_evaluate["padding_en_wrong"].values)

Using the new test data to evaluate.......


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexin

In [15]:
features_en_new.shape

(1000, 8, 200)

In [16]:
# ---- Training data ---- #
df_pairs_sample = df_pairs_enjp.iloc[0:50000]

df_pairs_sample['word2vec_en'] = df_pairs_sample['en_article'].apply(doc2embed,args=(model_en,))
df_pairs_sample['word2vec_en_wrong'] = df_pairs_sample['en_article_wrong'].apply(doc2embed,args=(model_en,))
df_pairs_sample['word2vec_jp'] = df_pairs_sample['jp_article'].apply(doc2embed,args=(model_jp,trans_jp_en))


# ---- Padding the vector ---- #
df_pairs_sample['padding_en'] = df_pairs_sample['word2vec_en'].apply(padding, args=(TIME_STEP,))
df_pairs_sample['padding_en_wrong'] = df_pairs_sample['word2vec_en_wrong'].apply(padding, args=(TIME_STEP,))
df_pairs_sample['padding_jp'] = df_pairs_sample['word2vec_jp'].apply(padding, args=(TIME_STEP,))

df_pairs_sample.dropna(axis=0, how='all')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


长度为零
长度为零
长度为零
长度为零
长度为零
长度为零
长度为零
长度为零
长度为零
长度为零


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


长度为零
长度为零
长度为零
长度为零
长度为零
长度为零
长度为零


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,en_article,jp_article,similarity,dis_similarity,en_article_wrong,word2vec_en,word2vec_en_wrong,word2vec_jp,padding_en,padding_en_wrong,padding_jp
0,china slow mid-term prospect bright-fitch,中国 経済成長 やや 減速 中期 見通し 良好 フィッチ,5.0,1.0,ifr-us corp bonds-ig midday bank get jump eUSu...,"[[-0.692976, 0.993873, 1.56972, -0.849406, -0....","[[-0.109248, -0.814155, -0.125125, -0.441452, ...","[[-0.090137516401, -1.93797823832, -1.07036912...","[[-0.692975759506, 0.993872761726, 1.569722294...","[[-0.109248496592, -0.814155459404, -0.1251250...","[[-0.090137516401, -1.93797823832, -1.07036912..."
1,US stock open low ahead greenspan remark,米国 株式市場 序盤 小幅安 グリーンスパン 議長 証言 控え,5.0,1.0,refile-ecb plan extra bln 28-day dollar refina...,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[-0.0927665, 0.0564256, -0.0189555, 0.0254941...","[[0.604915827842, -0.587371901787, -2.23725223...","[[0.318433910608, -2.26064181328, -0.858099699...","[[-0.0927664786577, 0.0564255565405, -0.018955...","[[0.604915827842, -0.587371901787, -2.23725223..."
2,US 3-month bill high rate pct,表 米 落札 結果 最高 落札 金利 3か月 6か月,5.0,1.0,instant view-analysts comment ecb news confer...,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[0.441488, -1.31358, -0.872881, -0.286089, 0....","[[2.11681226507, 2.15296566811, -0.27546831647...","[[0.318433910608, -2.26064181328, -0.858099699...","[[0.441487878561, -1.31358408928, -0.872880518...","[[2.11681226507, 2.15296566811, -0.27546831647..."
3,US 3-month bill high rate pct,表 米 落札 結果 最高 落札 金利 3か月 6か月,5.0,1.0,citi merrill mull same ceo candidates-cnbc,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[-0.359338, -0.450443, 0.464431, -0.345707, -...","[[2.11681226507, 2.15296566811, -0.27546831647...","[[0.318433910608, -2.26064181328, -0.858099699...","[[-0.359338283539, -0.450443416834, 0.46443137...","[[2.11681226507, 2.15296566811, -0.27546831647..."
4,US 3-month bill high rate pct,表 米 落札 結果 最高 落札 金利 3か月 6か月,5.0,1.0,seoul share hover 4-week low samsung elec fall,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[-0.0994938, -1.5312, -0.652909, 0.683739, 0....","[[2.11681226507, 2.15296566811, -0.27546831647...","[[0.318433910608, -2.26064181328, -0.858099699...","[[-0.0994938, -1.5312, -0.652909, 0.683739, 0....","[[2.11681226507, 2.15296566811, -0.27546831647..."
5,US stock open slightly high ahead fed,米国 株式市場 序盤 小 反発 FOMC 結果 見守る,5.0,1.0,treasury paulson lead china group december,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[1.64935, -1.29647, -5.47781, 1.32763, -3.309...","[[0.604915827842, -0.587371901787, -2.23725223...","[[0.318433910608, -2.26064181328, -0.858099699...","[[1.64934837818, -1.29646813869, -5.4778137207...","[[0.604915827842, -0.587371901787, -2.23725223..."
6,rpt-US stock open slightly high ahead fed,米国 株式市場 序盤 小 反発 FOMC 結果 見守る,5.0,1.0,analysis-europe should focus yuan not dollar,"[[-0.0612247, -0.0130941, 0.0373003, 0.0055707...","[[2.74302, -0.901368, -0.216322, -2.37545, 0.2...","[[0.604915827842, -0.587371901787, -2.23725223...","[[-0.0612246803939, -0.0130941485986, 0.037300...","[[2.74302053452, -0.901368260384, -0.216322258...","[[0.604915827842, -0.587371901787, -2.23725223..."
7,blair say worry oil want low price,原油価格 問題 議題 なる 可能性 英 首相,5.0,1.0,inflation expectation have rise -fed lacker,"[[0.13563, 0.152818, 1.28335, 1.10705, -1.7674...","[[2.33168, 3.44082, -0.902136, -0.994928, -1.9...","[[0.289268975447, 0.65020507603, 0.11294028075...","[[0.135629788041, 0.152817904949, 1.2833530902...","[[2.33167695999, 3.44081711769, -0.90213632583...","[[0.289268975447, 0.65020507603, 0.11294028075..."
8,s korea fin min see gdp 4-5 pct report,今年 下期 韓国 GDP 成長率 見通し 財政 経済,5.0,1.0,genzyme sanofi odds threshold price wsj,"[[0.279975, 3.30506, 1.17713, -1.40503, -0.097...","[[-0.248277, 0.168342, -0.0160929, 0.187689, 0...","[[-1.37303873497, -0.14625378227, 0.0905386579...","[[0.279975, 3.30506, 1.17713, -1.40503, -0.097...","[[-0.248276501894, 0.168341502547, -0.01609287...","[[-1.37303873497, -0.14625378227, 0.0905386579..."
9,s korea fin min see gdp 4-5 pct report,再送 今年 下期 韓国 GDP 成長率 見通し 財政 経済,5.0,1.0,south korean win hit 8-week high v dollar,"[[0.279975, 3.30506, 1.17713, -1.40503, -0.097...","[[1.10703, 2.21735, 1.24801, -0.696346, -2.711...","[[1.21273531925, -0.96096547073, -0.4266170704...","[[0.279975, 3.30506, 1.17713, -1.40503, -0.097...","[[1.10703, 2.21735, 1.24801, -0.696346, -2.711...","[[1.21273531925, -0.96096547073, -0.4266170704..."


In [17]:
df_pairs_sample = df_pairs_sample[df_pairs_sample['word2vec_en']!=float('nan')]

In [18]:
df_pairs_sample

Unnamed: 0,en_article,jp_article,similarity,dis_similarity,en_article_wrong,word2vec_en,word2vec_en_wrong,word2vec_jp,padding_en,padding_en_wrong,padding_jp
0,china slow mid-term prospect bright-fitch,中国 経済成長 やや 減速 中期 見通し 良好 フィッチ,5.0,1.0,ifr-us corp bonds-ig midday bank get jump eUSu...,"[[-0.692976, 0.993873, 1.56972, -0.849406, -0....","[[-0.109248, -0.814155, -0.125125, -0.441452, ...","[[-0.090137516401, -1.93797823832, -1.07036912...","[[-0.692975759506, 0.993872761726, 1.569722294...","[[-0.109248496592, -0.814155459404, -0.1251250...","[[-0.090137516401, -1.93797823832, -1.07036912..."
1,US stock open low ahead greenspan remark,米国 株式市場 序盤 小幅安 グリーンスパン 議長 証言 控え,5.0,1.0,refile-ecb plan extra bln 28-day dollar refina...,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[-0.0927665, 0.0564256, -0.0189555, 0.0254941...","[[0.604915827842, -0.587371901787, -2.23725223...","[[0.318433910608, -2.26064181328, -0.858099699...","[[-0.0927664786577, 0.0564255565405, -0.018955...","[[0.604915827842, -0.587371901787, -2.23725223..."
2,US 3-month bill high rate pct,表 米 落札 結果 最高 落札 金利 3か月 6か月,5.0,1.0,instant view-analysts comment ecb news confer...,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[0.441488, -1.31358, -0.872881, -0.286089, 0....","[[2.11681226507, 2.15296566811, -0.27546831647...","[[0.318433910608, -2.26064181328, -0.858099699...","[[0.441487878561, -1.31358408928, -0.872880518...","[[2.11681226507, 2.15296566811, -0.27546831647..."
3,US 3-month bill high rate pct,表 米 落札 結果 最高 落札 金利 3か月 6か月,5.0,1.0,citi merrill mull same ceo candidates-cnbc,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[-0.359338, -0.450443, 0.464431, -0.345707, -...","[[2.11681226507, 2.15296566811, -0.27546831647...","[[0.318433910608, -2.26064181328, -0.858099699...","[[-0.359338283539, -0.450443416834, 0.46443137...","[[2.11681226507, 2.15296566811, -0.27546831647..."
4,US 3-month bill high rate pct,表 米 落札 結果 最高 落札 金利 3か月 6か月,5.0,1.0,seoul share hover 4-week low samsung elec fall,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[-0.0994938, -1.5312, -0.652909, 0.683739, 0....","[[2.11681226507, 2.15296566811, -0.27546831647...","[[0.318433910608, -2.26064181328, -0.858099699...","[[-0.0994938, -1.5312, -0.652909, 0.683739, 0....","[[2.11681226507, 2.15296566811, -0.27546831647..."
5,US stock open slightly high ahead fed,米国 株式市場 序盤 小 反発 FOMC 結果 見守る,5.0,1.0,treasury paulson lead china group december,"[[0.318434, -2.26064, -0.8581, 0.612717, -0.04...","[[1.64935, -1.29647, -5.47781, 1.32763, -3.309...","[[0.604915827842, -0.587371901787, -2.23725223...","[[0.318433910608, -2.26064181328, -0.858099699...","[[1.64934837818, -1.29646813869, -5.4778137207...","[[0.604915827842, -0.587371901787, -2.23725223..."
6,rpt-US stock open slightly high ahead fed,米国 株式市場 序盤 小 反発 FOMC 結果 見守る,5.0,1.0,analysis-europe should focus yuan not dollar,"[[-0.0612247, -0.0130941, 0.0373003, 0.0055707...","[[2.74302, -0.901368, -0.216322, -2.37545, 0.2...","[[0.604915827842, -0.587371901787, -2.23725223...","[[-0.0612246803939, -0.0130941485986, 0.037300...","[[2.74302053452, -0.901368260384, -0.216322258...","[[0.604915827842, -0.587371901787, -2.23725223..."
7,blair say worry oil want low price,原油価格 問題 議題 なる 可能性 英 首相,5.0,1.0,inflation expectation have rise -fed lacker,"[[0.13563, 0.152818, 1.28335, 1.10705, -1.7674...","[[2.33168, 3.44082, -0.902136, -0.994928, -1.9...","[[0.289268975447, 0.65020507603, 0.11294028075...","[[0.135629788041, 0.152817904949, 1.2833530902...","[[2.33167695999, 3.44081711769, -0.90213632583...","[[0.289268975447, 0.65020507603, 0.11294028075..."
8,s korea fin min see gdp 4-5 pct report,今年 下期 韓国 GDP 成長率 見通し 財政 経済,5.0,1.0,genzyme sanofi odds threshold price wsj,"[[0.279975, 3.30506, 1.17713, -1.40503, -0.097...","[[-0.248277, 0.168342, -0.0160929, 0.187689, 0...","[[-1.37303873497, -0.14625378227, 0.0905386579...","[[0.279975, 3.30506, 1.17713, -1.40503, -0.097...","[[-0.248276501894, 0.168341502547, -0.01609287...","[[-1.37303873497, -0.14625378227, 0.0905386579..."
9,s korea fin min see gdp 4-5 pct report,再送 今年 下期 韓国 GDP 成長率 見通し 財政 経済,5.0,1.0,south korean win hit 8-week high v dollar,"[[0.279975, 3.30506, 1.17713, -1.40503, -0.097...","[[1.10703, 2.21735, 1.24801, -0.696346, -2.711...","[[1.21273531925, -0.96096547073, -0.4266170704...","[[0.279975, 3.30506, 1.17713, -1.40503, -0.097...","[[1.10703, 2.21735, 1.24801, -0.696346, -2.711...","[[1.21273531925, -0.96096547073, -0.4266170704..."


In [19]:
# Generate training data (similarity = 1)
features_en_1 = np.array(df_pairs_sample["padding_en"].values.tolist(),dtype=np.float32)
features_jp_1 = np.array(df_pairs_sample["padding_jp"].values.tolist(),dtype=np.float32)

# Generate training data (similarity = 0)
features_en_0 = np.array(df_pairs_sample["padding_en_wrong"].values.tolist(),dtype=np.float32)
#np.random.shuffle((features_en_0))

In [20]:
features_jp_1.shape

(50000, 8, 200)

In [21]:
#torch.manual_seed(1)    # reproducible

# Hyper Parameters
EPOCH = 1          # 训练整批数据多少次
BATCH_SIZE = 100
TIME_STEP = 20      # rnn 时间步数 / 图片高度
INPUT_SIZE = 200     # rnn 每步输入值 / 图片每行像素
HIDDEN_SIZE = 200
BIDIRECTION = False
LR = 1e-4           # learning rate
DROP_OUT = 0.2

In [22]:
#gpu = torch.cuda.is_available()
#device = torch.device("cuda" if gpu else "cpu")

In [23]:
class LaLSTM(nn.Module):
    def __init__(self, batch_size=BATCH_SIZE, time_step=TIME_STEP, input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE, dropout_p=DROP_OUT, bidirection=False, attention=False, gpu=True):
        super(LaLSTM, self).__init__()
        self.attention = attention
        self.bidirection = bidirection
        self.gpu = gpu
        global device
        print(device)
        
        self.lstm_left = nn.LSTM(     # LSTM 效果要比 nn.RNN() 好多了
            input_size=input_size,      # 图片每行的数据像素点
            hidden_size=hidden_size,     # rnn hidden unit
            num_layers=1,       # 有几层 RNN layers
            batch_first=True,   # input & output 会是以 batch size 为第一维度的特征集 e.g. (batch, time_step, input_size)
            bidirectional=bidirection,
        )
        
        self.lstm_right = nn.LSTM(     # LSTM 效果要比 nn.RNN() 好多了
            input_size=input_size,      # 图片每行的数据像素点
            hidden_size=hidden_size,     # rnn hidden unit
            num_layers=1,       # 有几层 RNN layers
            batch_first=True,   # input & output 会是以 batch size 为第一维度的特征集 e.g. (batch, time_step, input_size)
            bidirectional=bidirection,
        )
        
        if bidirection:
            print("bidirection")
            self.full_connect = torch.nn.Sequential(
                                                #torch.nn.Linear(hidden_size*2, hidden_size),
                                                #torch.nn.ReLU(),
                                                torch.nn.Linear(hidden_size*4, 1),
                                                torch.nn.ReLU()
                                                )
            #self.attn_left = nn.Linear(4*hidden_size, hidden_size, bias=False)
            #self.attn_right = nn.Linear(4*hidden_size, hidden_size, bias=False)
            self.left_hidden =  self.inithidden(2)
            self.right_hidden = self.inithidden(2)
            
        else:
            #self.attn_left = nn.Linear(2*hidden_size, hidden_size, bias=False)
            #self.attn_right = nn.Linear(2*hidden_size, hidden_size, bias=False)

            self.full_connect = torch.nn.Sequential(
                                                    #torch.nn.Linear(hidden_size*2, hidden_size),
                                                    #torch.nn.ReLU(),
                                                    torch.nn.Linear(hidden_size*2, 1),
                                                    torch.nn.ReLU()
                                                    )
            self.left_hidden =  self.inithidden(1)
            self.right_hidden = self.inithidden(1)
            
        self.dropout = nn.Dropout(dropout_p)
        #Initialize
        for seq in range(len(self.full_connect),2) :   
            nn.init.xavier_uniform(self.full_connect[seq].weight)
        #nn.init.xavier_uniform(self.attn_left.weight)
        #nn.init.xavier_uniform(self.attn_right.weight)
        print("Initialized")
        
    def inithidden(self,num,batch_size=BATCH_SIZE,hidden_size=HIDDEN_SIZE):
        hid = (Variable(torch.zeros(num, batch_size, hidden_size)),
               Variable(torch.zeros(num, batch_size, hidden_size)))
        #if self.gpu :
        #    for var in hid:
        #        var = var.to(device)
        #print (hid)
        torch.nn.init.orthogonal(hid[0], gain=1)
        torch.nn.init.orthogonal(hid[1], gain=1)
        #print(hid)
        return hid
    def forward(self, x_left, x_right):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size)   LSTM 有两个 hidden states, h_n 是分线, h_c 是主线
        # h_c shape (n_layers, batch, hidden_size)
        # batch_first = True : (batch, time_step, input_size)
        
        #注意现在左右两边用的是同一个网络
        left_out, self.left_hidden = self.lstm_left(x_left,self.left_hidden)   # None 表示 hidden state 会用全0的 state
        #right_out, self.right_hidden = self.lstm_right(x_right,self.right_hidden)
        right_out, self.right_hidden = self.lstm_left(x_right,self.right_hidden)

        #h_state_left = (h_n_left, h_c_left)
        #h_state_right = (h_n_right, h_c_right)
        h_n_left = self.left_hidden[0]
        h_n_right = self.right_hidden[0]
        
        # attention = h_left dot h_right.T
        #(batch, h_left, h_right)
        
        
        #attn_matrix_left = torch.bmm(left_out,torch.transpose(right_out,1,2))
        
        #(batch, h_right, h_left)
        #attn_matrix_right = torch.bmm(right_out,torch.transpose(left_out,1,2))
        #left_at = F.softmax(attn_matrix_left,2)
        #right_at = F.softmax(attn_matrix_right,2)
        #left_ct = torch.bmm(left_at,right_out)
        #right_ct = torch.bmm(right_at,left_out)
        #take only last attention for results without feed_in
        
        #用out
        #left_hn = left_out[:, -1, :].squeeze(1)
        #right_hn = right_out[:, -1, :].squeeze(1)
        
        #left_cn = left_ct[:, -1, :].squeeze(1)
        #right_cn = right_ct[:, -1, :].squeeze(1)
         
        #left_new_hn = F.tanh(self.attn_left(torch.cat((left_hn,left_cn),1)))
        #right_new_hn = F.tanh(self.attn_right(torch.cat((right_hn,right_cn),1)))
              
        #用hidden_state
        left_hn = h_n_left
        right_hn = h_n_right
        
        if self.bidirection:
            left_hn = torch.cat([left_hn[-1], left_hn[-2]], 1)
            right_hn = torch.cat([right_hn[-1], right_hn[-2]], 1)
        else:
            left_hn = left_hn.squeeze(0)
            right_hn = right_hn.squeeze(0)
            
        #print(left_hn.size())
        #print(right_hn.size())
        #print(left_out.size())
        #print(right_out.size())
       
        if self.attention:
            self.out = self.full_connect(self.dropout(torch.cat((left_new_hn,right_new_hn),1)))
        else:
            self.out = self.full_connect(self.dropout(torch.cat((left_hn,right_hn),1)))
        
        return self.out

#lalstm = LaLSTM().to(device)
#print(lalstm)

In [28]:
#del lalstm
gpu = torch.cuda.is_available()
device = torch.device("cuda:1" if gpu else "cpu")

In [29]:
#os.environ['CUDA_LAUNCH_BLOCKING'] = 1
model_plain = LaLSTM(bidirection=True,attention=False).to(device)
#model_plain = torch.nn.DataParallel(model_plain)
print(model_plain)
train_feature_left = np.concatenate((features_en_1,features_en_0),axis=0)
train_feature_right = np.concatenate((features_jp_1,features_jp_1),axis=0)
train_result = np.concatenate((np.ones(features_en_1.shape[0],dtype=np.float32),np.zeros(features_en_0.shape[0],dtype=np.float32)),axis=0)
train_result = train_result[:,np.newaxis]

p = np.random.permutation(len(train_result))
train_feature_left = train_feature_left[p]
train_feature_right = train_feature_right[p]
train_result = train_result[p]
print(train_feature_left.shape,train_feature_right.shape,train_result.shape)

criterion = torch.nn.MSELoss(size_average=False)
#optimizer = torch.optim.SGD(model_att.parameters(), lr=LR, momentum=0.2)
optimizer = torch.optim.Adam(model_plain.parameters(), lr=LR, betas=(0.9, 0.99))

all_loss = []
first = True
for epoch in range(EPOCH):
    loop = train_result.shape[0]//BATCH_SIZE
    for t in range(loop):
        # Forward pass: Compute predicted y by passing x to the model
        in_left = Variable(torch.from_numpy(train_feature_left[BATCH_SIZE*t:BATCH_SIZE*(t+1)])).float().to(device)
        in_right = Variable(torch.from_numpy(train_feature_right[BATCH_SIZE*t:BATCH_SIZE*(t+1)])).float().to(device)
        output = Variable(torch.from_numpy(train_result[BATCH_SIZE*t:BATCH_SIZE*(t+1)])).float().to(device)
        #in_left = in_left.to(torch.device("cuda"))
        #in_right = in_rightt.to(torch.device("cuda"))
        #output = output.to(torch.device("cuda"))
        
        
        y_pred = model_plain(in_left,in_right)
        # Compute and print loss
        loss = criterion(y_pred, output)
        #print(y_pred,output)
        all_loss.append(loss.item())
        print(t, loss.item())

        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        
        loss.backward(retain_graph=True)
        #if first:
        #    loss.backward(retain_graph=True)
        #    first = False
        #else:
        #    loss.backward()
        optimizer.step()
plt.plot(all_loss)

cuda:1
bidirection
Initialized




RuntimeError: cuda runtime error (77) : an illegal memory access was encountered at /pytorch/aten/src/THC/generic/THCTensorCopy.c:20

In [26]:
model = LaLSTM(bidirection=True,attention=False)
print(model)
train_feature_left = np.concatenate((features_en_1,features_en_0),axis=0)
train_feature_right = np.concatenate((features_jp_1,features_jp_1),axis=0)
train_result = np.concatenate((np.ones(features_en_1.shape[0],dtype=np.float32),np.zeros(features_en_0.shape[0],dtype=np.float32)),axis=0)
train_result = train_result[:,np.newaxis]

p = np.random.permutation(len(train_result))
train_feature_left = train_feature_left[p]
train_feature_right = train_feature_right[p]
train_result = train_result[p]

print(train_feature_left.shape,train_feature_right.shape,train_result.shape)

# different optimizers
opt_SGD         = torch.optim.SGD(model.parameters(), lr=LR)
opt_Momentum    = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.8)
opt_RMSprop     = torch.optim.RMSprop(model.parameters(), lr=LR, alpha=0.9)
opt_Adam        = torch.optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.99))

criterion = torch.nn.MSELoss(size_average=False)
#optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
optimizer = opt_Adam

all_loss = []
for epoch in range(EPOCH):
    loop = train_result.shape[0]//BATCH_SIZE
    for t in range(loop):
        # Forward pass: Compute predicted y by passing x to the model
        in_left = torch.from_numpy(train_feature_left[BATCH_SIZE*t:BATCH_SIZE*(t+1)])
        in_right = torch.from_numpy(train_feature_right[BATCH_SIZE*t:BATCH_SIZE*(t+1)])
        output = torch.from_numpy(train_result[BATCH_SIZE*t:BATCH_SIZE*(t+1)])
        y_pred = model(in_left,in_right)

        # Compute and print loss
        loss = criterion(y_pred, output)
        all_loss.append(loss.item())
        print(t, loss.item())

        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
plt.plot(all_loss)

cuda:1
bidirection
Initialized
LaLSTM(
  (lstm_left): LSTM(200, 200, batch_first=True, bidirectional=True)
  (lstm_right): LSTM(200, 200, batch_first=True, bidirectional=True)
  (full_connect): Sequential(
    (0): Linear(in_features=800, out_features=1, bias=True)
    (1): ReLU()
  )
  (dropout): Dropout(p=0.0)
)




(100000, 8, 200) (100000, 8, 200) (100000, 1)
0 48.1601448059082
1 33.56179428100586
2 29.56279754638672
3 33.758819580078125
4 32.87086486816406
5 33.27394485473633
6 27.87605094909668
7 26.427114486694336
8 28.777263641357422
9 28.674734115600586
10 26.70044708251953
11 28.929950714111328
12 26.939252853393555
13 25.324583053588867
14 27.30428123474121
15 27.257381439208984
16 25.652950286865234
17 30.3001766204834
18 28.69813346862793
19 26.212915420532227
20 24.122150421142578
21 27.25435447692871
22 24.386154174804688
23 26.35154151916504
24 30.642379760742188
25 23.188066482543945
26 28.927400588989258
27 27.815143585205078
28 27.316402435302734
29 23.394733428955078
30 26.66168785095215
31 25.012157440185547
32 24.763946533203125
33 27.314117431640625
34 25.34014320373535
35 25.5624942779541
36 23.96206283569336
37 23.86604118347168
38 23.875778198242188
39 27.206472396850586
40 26.067373275756836
41 25.13617706298828
42 25.623046875
43 25.587644577026367
44 23.472150802612305
4

KeyboardInterrupt: 

In [None]:
model_att = LaLSTM(bidirection=False,attention=True)
print(train_feature_left.shape,train_feature_right.shape,train_result.shape)

criterion = torch.nn.MSELoss(size_average=False)
#optimizer = torch.optim.SGD(model_att.parameters(), lr=LR, momentum=0.2)
optimizer = torch.optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.99))

all_loss = []
for epoch in range(EPOCH):
    loop = train_result.shape[0]//BATCH_SIZE
    for t in range(loop):
        # Forward pass: Compute predicted y by passing x to the model
        in_left = torch.from_numpy(train_feature_left[BATCH_SIZE*t:BATCH_SIZE*(t+1)])
        in_right = torch.from_numpy(train_feature_right[BATCH_SIZE*t:BATCH_SIZE*(t+1)])
        output = torch.from_numpy(train_result[BATCH_SIZE*t:BATCH_SIZE*(t+1)])
        y_pred = model_att(in_left,in_right)

        # Compute and print loss
        loss = criterion(y_pred, output)
        all_loss.append(loss.item())
        print(t, loss.item())

        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
plt.plot(all_loss)

In [None]:
torch.zeros(2,5,5)