In [1]:
import json
import math
import os
import pickle

import numpy as np  # imported explicitly; do not rely on the keras wildcard import to provide `np`
from keras.layers import *
from keras.models import Model
from scipy.stats import pearsonr,spearmanr
from sklearn.metrics import mean_absolute_error,mean_squared_error

Using TensorFlow backend.


In [2]:
# Run configuration
# Expose only the GPU with index 1 to CUDA.
os.environ.update(CUDA_VISIBLE_DEVICES="1")
# Number of training examples; used to derive the number of steps per epoch.
num_train = 23000
# Number of examples per batch produced by the training generator.
batch_size = 128

In [3]:
def padding(x, dim=768):
    """Pad every sequence in a batch with zero vectors up to the longest length.

    Args:
        x: list of sequences; each sequence is a list of per-token feature
            vectors.
        dim: width of the zero vectors appended as padding. Defaults to 768,
            the feature width the model input layer is declared with.

    Returns:
        A new list with one entry per input sequence. Each entry is the
        original token vectors followed by zero rows (numpy arrays of length
        ``dim``) so that all entries have the same length. An empty batch
        returns an empty list.
    """
    # max() raises on an empty batch, so handle that case explicitly.
    if not x:
        return []
    # Pad up to the longest sequence in this batch only.
    ml = max(len(i) for i in x)
    return [list(i) + list(np.zeros((ml - len(i), dim))) for i in x]
def process_line(line_X,line_y):
    """Parse one feature line and its matching score line.

    Args:
        line_X: text line whose tab-separated fields each hold
            whitespace-separated float values.
        line_y: text line holding a single float.

    Returns:
        (feature, score): ``feature`` is a list with one list of floats per
        tab-separated field; ``score`` is the parsed float.
    """
    fields = line_X.strip().split('\t')
    feature = [[float(value) for value in field.split()] for field in fields]
    return feature, float(line_y.strip())

In [4]:
# Load the validation (dev) set
path = './2017/en_de/sentence_level/task1_en-de_training-dev/'
X_dev,y_dev = [],[]
# Context managers close both files even if parsing a line raises.
with open(path+'dev.features','r') as f_X, open(path+'dev.hter','r') as f_y:
    # zip stops at the shorter file, so the two files are read in lockstep.
    for line_X,line_y in zip(f_X,f_y):
        feature, score = process_line(line_X,line_y)
        X_dev.append(feature)
        y_dev.append(score)
# Pad the whole dev set to its longest sequence so it forms one dense array.
X_dev = np.array(padding(X_dev))
y_dev = np.array(y_dev)

In [5]:
# Build the model: a bidirectional LSTM over the per-token feature vectors,
# followed by a single sigmoid unit that outputs the predicted score.
# NOTE(review): zero-padded timesteps are fed to the LSTM unmasked — confirm
# this is acceptable for the backward direction.
input_features = Input(shape=(None,768), dtype='float32')
hidden = Bidirectional(CuDNNLSTM(128))(input_features)
score = Dense(1, activation='sigmoid')(hidden)

model = Model(inputs=[input_features], outputs=score)
model.compile(loss='mean_squared_error',
        optimizer='adam',
        metrics=['mean_absolute_error','mean_squared_error'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, 768)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               919552    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 919,809
Trainable params: 919,809
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
# Prepare the training set
def data_generator():
    """Yield training batches forever by re-reading the training files.

    Each pass reads ``train.features`` and ``train.hter.shuffle`` under the
    module-level ``path`` in lockstep, and yields a tuple ``(X, Y)`` every
    time ``batch_size`` examples have been collected. ``X`` is the batch
    padded to its own longest sequence; ``Y`` is the array of scores.
    Examples left over at the end of a pass are kept and completed with
    lines from the start of the next pass.

    Raises:
        ValueError: if a full pass over the files reads no lines at all
            (otherwise the loop would spin forever without yielding).
    """
    # NOTE(review): features come from 'train.features' but labels from
    # 'train.hter.shuffle' — confirm the two files are in the same order.
    X,Y = [],[]
    while True:
        read_any = False
        # Context managers close the files at the end of each pass and also
        # when the generator is closed or garbage-collected mid-pass.
        with open(path+'train.features','r') as f_X, \
                open(path+'train.hter.shuffle','r') as f_y:
            for line_X,line_y in zip(f_X,f_y):
                read_any = True
                feature, score = process_line(line_X,line_y)
                X.append(feature)
                Y.append(score)
                if len(X) == batch_size:
                    yield (np.array(padding(X)), np.array(Y))
                    X,Y = [],[]
        if not read_any:
            raise ValueError('no training examples found under ' + path)
# Train for 6 epochs, evaluating on the dev set after each epoch.
# steps_per_epoch is expected to be an integer; round up so that one epoch
# draws at least num_train examples from the generator.
hist = model.fit_generator(data_generator(),
    steps_per_epoch=math.ceil(num_train / batch_size),
    validation_data=(X_dev,y_dev),
    epochs=6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [7]:
# Load the test set
test_path = './2017/en_de/sentence_level/task1_en-de_test/'
X_test,y_test = [],[]
# Context managers close both files even if parsing a line raises.
with open(test_path+'test.features','r') as f_X, \
        open(test_path+'en-de_task1_test.2017.hter','r') as f_y:
    # zip stops at the shorter file, so the two files are read in lockstep.
    for line_X,line_y in zip(f_X,f_y):
        feature, score = process_line(line_X,line_y)
        X_test.append(feature)
        y_test.append(score)
# Pad the whole test set to its longest sequence so it forms one dense array.
X_test = np.array(padding(X_test))
y_test = np.array(y_test)

In [8]:
# Predict on the test set and report correlation and error metrics.
y_pred = model.predict(X_test)
# Keep the first element of each prediction row to get a flat list of scores.
y_ = [row[0] for row in y_pred]

pearson = pearsonr(y_test, y_)
print("Pearson ",pearson)

spearman = spearmanr(y_test, y_)
print(spearman)

mae = mean_absolute_error(y_test, y_)
print("MAE ",mae)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_)
print("RMSE ",math.sqrt(mse))

Pearson  (0.59347142046142853, 1.2639081032951138e-190)
SpearmanrResult(correlation=0.62455740638199653, pvalue=8.8920517252414074e-217)
MAE  0.117356501219
RMSE  0.1537366126045113
