In [49]:
import os
import glob
import re
import jieba
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from utils.utils import load_sgm, Preprocess, WordVector
from model.models import simpleNMT
from keras.optimizers import Adam
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# load data

In [2]:
data_dir = '/Users/harryxu/school/data/ai_challenger_translation_validation_20170912/translation_validation_20170912/'
en_path = data_dir + 'valid.en-zh.en.sgm'
cn_path = data_dir + 'valid.en-zh.zh.sgm'


en_data = load_sgm(en_path)
cn_data = load_sgm(cn_path)

In [3]:
print(en_data[:2])
print(cn_data[:2])

['Do you think we look young enough to blend in at a high school?', "Hi, honey. I guess you're really tied up in meetings."]
['你们觉得我们看起来够年轻溜进高中吗？', '嗨，亲爱的。你现在肯定忙着开会呢。']


# preprocess data

In [4]:
# tokenization
preprocessor = Preprocess()
preprocessor.fit([en_data, cn_data])

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/3b/ffc6dtb57bb2_ng7vhzhg5nm0000gp/T/jieba.cache
Loading model cost 1.136 seconds.
Prefix dict has been built succesfully.


In [26]:
a1, a2 = preprocessor.get_ids()
a1 = pad_sequences(a1, preprocessor.cn_maxlen, padding="post", truncating="post")
a2 = pad_sequences(a2, preprocessor.cn_maxlen, padding="post", truncating="post")

In [7]:
# load word vectors
en_wv_path = '/Users/harryxu/NLP/glove/glove.6B.100d.txt'
cn_wv_path = '/Users/harryxu/NLP/glove/kdxf.txt'

WV = WordVector(preprocessor.en_dict, preprocessor.cn_dict)
en_mat, cn_mat = WV.to_matrix(en_wv_path, cn_wv_path)


DEBUG: convert wv to dict take  20.08706307411194
DEBUG: convert wv to dict take  0.12171506881713867
DEBUG: convert dicts to matrix take 0.15119099617004395
DEBUG: convert dicts to matrix take 0.08150291442871094


In [8]:
en_mat.shape

(8778, 100)

In [9]:
cn_mat.shape

(12952, 20)

In [42]:
# fit model
model = simpleNMT(pad_length=preprocessor.cn_maxlen,
                  n_voc_in=en_mat.shape[0],
                  d_voc_in=en_mat.shape[1],
                  n_labels=cn_mat.shape[1],
                  embedding_learnable=True,
                  encoder_units=2,
                  decoder_units=2,
                  trainable=True,
                  return_probabilities=False,
                  weights=[en_mat])

inputs shape: (?, ?, 4)


In [56]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 35)                0         
_________________________________________________________________
OneHot (Embedding)           (None, 35, 100)           877800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 35, 4)             1648      
_________________________________________________________________
attention_decoder_1 (Attenti (None, 35, 20)            726       
Total params: 880,174
Trainable params: 880,174
Non-trainable params: 0
_________________________________________________________________


In [50]:
model.compile(optimizer=Adam(1e-2),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [44]:
a1_new = np.zeros((a1.shape + (100,)))
for n in range(a1.shape[0]):
    for d in range(a1.shape[1]):
        a1_new[n,d] = en_mat[a1[n,d]]
        
a2_new = np.zeros((a2.shape + (20,)))
for n in range(a2.shape[0]):
    for d in range(a2.shape[1]):
        a2_new[n,d] = cn_mat[a1[n,d]]
    

In [52]:
# model.fit(a2,a1_new)
model.fit(a1,a2_new,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x127a78be0>

In [55]:
model.predict(a1[0][np.newaxis,:]).shape

(1, 35, 20)

In [154]:
['a'] + ['a','b']

['a', 'a', 'b']

In [141]:
en_tok[1]

['Hi',
 ',',
 'honey.',
 'I',
 'guess',
 'you',
 "'re",
 'really',
 'tied',
 'up',
 'in',
 'meetings.']

In [80]:
# tokenization
re.findall(r'\w+', en_data[1])
# 

['Hi',
 'honey',
 'I',
 'guess',
 'you',
 're',
 'really',
 'tied',
 'up',
 'in',
 'meetings']

In [97]:
re.findall(r'[a-zA-Z0-9]+|[\'\w\']+|[.,?!]', en_data[1])

['Hi',
 ',',
 'honey',
 '.',
 'I',
 'guess',
 'you',
 "'re",
 'really',
 'tied',
 'up',
 'in',
 'meetings',
 '.']

In [144]:
a1 = re.findall(r'[.,?!:()]+|[0-9.]+|[a-zA-Z0-9\-()]+|[\'\w\".]+', "Hi. are you ('Mr.0') WRITE's sis-in-law...? Totally $3.2")
a1

['Hi',
 '.',
 'are',
 'you',
 '(',
 "'Mr.0'",
 ')',
 'WRITE',
 "'s",
 'sis-in-law',
 '...?',
 'Totally',
 '3.2']

In [145]:
[a1_1.lower() for a1_1 in a1]

['hi',
 '.',
 'are',
 'you',
 '(',
 "'mr.0'",
 ')',
 'write',
 "'s",
 'sis-in-law',
 '...?',
 'totally',
 '3.2']

In [133]:
a1 = jieba.cut('你说，他的名字叫"清华大学"…！一共（3.2）元', cut_all=False)
list(a1)

['你',
 '说',
 '，',
 '他',
 '的',
 '名字',
 '叫',
 '"',
 '清华大学',
 '"',
 '…',
 '！',
 '一共',
 '（',
 '3.2',
 '）',
 '元']

In [114]:
list(jieba.cut("He's sis-in-law is name is called 'Qinghua University'!"))

['He',
 "'",
 's',
 ' ',
 'sis',
 '-',
 'in',
 '-',
 'law',
 ' ',
 'is',
 ' ',
 'name',
 ' ',
 'is',
 ' ',
 'called',
 ' ',
 "'",
 'Qinghua',
 ' ',
 'University',
 "'",
 '!']

In [69]:
cn_data[:10]

['你们觉得我们看起来够年轻溜进高中吗？',
 '嗨，亲爱的。你现在肯定忙着开会呢。',
 '因为你想在进养老院前娶妻生子。',
 '我就一天24小时都得在她眼皮子底下。',
 '找条牢靠的链子或者别的什么固定住这些灯。',
 '为了不让别的父母经历我的遭遇。',
 '我要去赴约会，必须学跳舞。现在就学。',
 '有时候我们信任的人替我们做了这样的选择。',
 '好吧。那么，我想现在能做的有限。',
 '我尊重这点，并且会不惜一切保护隐私不被侵犯。']

# model

In [10]:
cn_data[:10]

['<?xml version="1.0" encoding="UTF-8"?>\n',
 '<mteval>\n',
 '<refset setid="setid" srclang="en" trglang="zh" refid="ref0">\n',
 '<doc sysid="sysid" docid="docid" genre="talk">\n',
 '<seg id="1"> 你们觉得我们看起来够年轻溜进高中吗？ </seg>\n',
 '<seg id="2"> 嗨，亲爱的。你现在肯定忙着开会呢。 </seg>\n',
 '<seg id="3"> 因为你想在进养老院前娶妻生子。 </seg>\n',
 '<seg id="4"> 我就一天24小时都得在她眼皮子底下。 </seg>\n',
 '<seg id="5"> 找条牢靠的链子或者别的什么固定住这些灯。 </seg>\n',
 '<seg id="6"> 为了不让别的父母经历我的遭遇。 </seg>\n']

In [11]:
cn_data[-10:]

['<seg id="7994"> 噢，我们让你看起来像个银行家。脱掉你的裤子。 </seg>\n',
 '<seg id="7995"> 但其实是你。是你把她推了下去。 </seg>\n',
 '<seg id="7996"> 参加美国教育部奖学金项目的女孩原定今天抵达。 </seg>\n',
 '<seg id="7997"> 所以你的意思是，汤米的这一切疯狂举动 </seg>\n',
 '<seg id="7998"> 嘿。加州调查局的你要不要来说说你到底在搞什么？ </seg>\n',
 '<seg id="7999"> 噢看在老天的份上！到灌木丛里解决掉吧。 </seg>\n',
 '<seg id="8000"> 布兰达， 要不要告诉我你究竟在烦什么？ </seg>\n',
 '</doc>\n',
 '</refset>\n',
 '</mteval>\n']

In [71]:
from bs4 import SoupStrainer
from bs4 import BeautifulSoup

with open(en_path, 'r') as f:
    en_data = f.read()

strainer = SoupStrainer('seg')
en_soup = BeautifulSoup(en_data, "lxml", parse_only=strainer)

In [73]:
en_soup.find_all('seg')

[<seg id="1"> Do you think we look young enough to blend in at a high school? </seg>,
 <seg id="2"> Hi, honey. I guess you're really tied up in meetings. </seg>,
 <seg id="3"> Because you want to start a family before you hit the nursing home. </seg>,
 <seg id="4"> She's got to have me in her sight like 24 hours a day. </seg>,
 <seg id="5"> Find a safety chain or something to keep these lights in place. </seg>,
 <seg id="6"> So that no parent has to go through what I've known. </seg>,
 <seg id="7"> I have to go to the date, learn to dance. Definitely. Now. </seg>,
 <seg id="8"> Is when someone we've trusted makes the choice for us. </seg>,
 <seg id="9"> Okay. Well, I guess there's not much to do about it right now then. </seg>,
 <seg id="10"> I respect that, and I will protect it at all cost. </seg>,
 <seg id="11"> Yeah, it's getting weird. - let's get out of here. </seg>,
 <seg id="12"> So after investigators got a blood trace on the doorknob, </seg>,
 <seg id="13"> Which means if we 

In [59]:
str(en_soup.find_all('seg')[0].string)

' Do you think we look young enough to blend in at a high school? '

In [60]:
out = [str(x.string).strip( ) for x in en_soup.find_all('seg')]