# Table of Contents
 <p>

In [1]:
from keras.preprocessing import sequence
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout
from keras.datasets import imdb
from keras import backend as K
import numpy as np

max_features = 10000 #保留前max_features个词
maxlen = 100 #填充/阶段到100词
batch_size = 1000
nb_grams = 25 #训练一个10-gram的语言模型
nb_train = 1000 #训练样本数

#加载内置的IMDB数据集
(x_train, y_train), (x_test, y_test) = imdb.load_data(path='/Users/lizhn7/Downloads/EXPERIMENT/COPA/mini_lm/data/imdb.npz',
                                                      num_words=max_features)
x_lm_ = np.append(x_train, x_test)

#构造用来训练语言模型的数据
#这里只用了已有数据，实际环境中，可以补充其他数据使得训练更加充分
x_lm = []
y_lm = []
for x in x_lm_:
		for i in range(len(x)):
			x_lm.append([0]*(nb_grams - i + max(0,i-nb_grams))+x[max(0,i-nb_grams):i])
			y_lm.append([x[i]])

x_lm = np.array(x_lm)
y_lm = np.array(y_lm)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
x = np.vstack([x_train, x_test])
y = np.hstack([y_train, y_test])

Using TensorFlow backend.


In [3]:
y_lm[0:10]

array([[   1],
       [  14],
       [  22],
       [  16],
       [  43],
       [ 530],
       [ 973],
       [1622],
       [1385],
       [  65]])

In [5]:
x_lm.shape

(11737946, 25)

In [19]:
y_lm.shape

(11737946, 1)

In [14]:
x_lm = []
y_lm = []
for x in x_lm_:
		for i in range(len(x)):
			x_lm.append([0]*(nb_grams - i + max(0,i-nb_grams))+x[max(0,i-nb_grams):i])
			y_lm.append([x[i]])

In [2]:
len(y_lm)

11737946

In [9]:
y_lm.extend(y_lm)

In [10]:
len(y_lm)

93903568

In [11]:
y_lm = np.array(y_lm)

In [12]:
y_lm.shape

(93903568, 1)

In [15]:
x_lm = np.vstack([x_lm, x_lm, x_lm])

In [16]:
x_lm.shape

(35213838, 25)

In [18]:
import h5py
fh = h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/mini_lm/data/train.h5', 'w')
fh['x_lm'] = np.vstack([x_lm, x_lm])
fh.close()

In [19]:
with h5py.File('/Users/lizhn7/Downloads/EXPERIMENT/COPA/mini_lm/data/train.h5', 'r') as fh:
    test = fh['x_lm'][:]

In [20]:
test.shape

(93903568, 25)

In [None]:
#重新划分训练集和测试集
#合并原来的训练集和测试集，随机挑选1000个样本，作为新的训练集，剩下为测试集
idx = range(len(x))
np.random.shuffle(idx)
x_train = x[idx[:nb_train]]
y_train = y[idx[:nb_train]]
x_test = x[idx[nb_train:]]
y_test = y[idx[nb_train:]]

AttributeError: 'list' object has no attribute 'shape'

In [10]:
embedded_size = 100 #词向量维度
hidden_size = 1000 #LSTM的维度，可以理解为编码后的句向量维度。

#encoder部分
inputs = Input(shape=(None,), dtype='int32')
embedded = Embedding(max_features, embedded_size)(inputs)
lstm = LSTM(hidden_size)(embedded)
encoder = Model(inputs=inputs, outputs=lstm)

#完全用ngram模型训练encode部分
input_grams = Input(shape=(nb_grams,), dtype='int32')
encoded_grams = encoder(input_grams)
softmax = Dense(max_features, activation='softmax')(encoded_grams)
lm = Model(inputs=input_grams, outputs=softmax)
lm.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
#用sparse交叉熵，可以不用事先将类别转换为one hot形式。

#情感分析部分
#固定encoder，后面接一个简单的Dense层（相当于逻辑回归）
#这时候训练的只有hidden_size+1=1001个参数
#因此理论上来说，少量标注样本就可以训练充分
for layer in encoder.layers:
    layer.trainable=False

sentence = Input(shape=(maxlen,), dtype='int32')
encoded_sentence = encoder(sentence)
sigmoid = Dense(10, activation='relu')(encoded_sentence)
sigmoid = Dropout(0.5)(sigmoid)
sigmoid = Dense(1, activation='sigmoid')(sigmoid)
model = Model(inputs=sentence, outputs=sigmoid)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
encoder.layers

[<keras.engine.topology.InputLayer at 0x1379e1c50>,
 <keras.layers.embeddings.Embedding at 0x1379e1c88>,
 <keras.layers.recurrent.LSTM at 0x1379e1eb8>]

In [11]:
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 100)         1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              4404000   
Total params: 5,404,000
Trainable params: 0
Non-trainable params: 5,404,000
_________________________________________________________________


In [12]:
lm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 10)                0         
_________________________________________________________________
model_1 (Model)              (None, 1000)              5404000   
_________________________________________________________________
dense_1 (Dense)              (None, 10000)             10010000  
Total params: 15,414,000
Trainable params: 10,010,000
Non-trainable params: 5,404,000
_________________________________________________________________


In [21]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 100)               0         
_________________________________________________________________
model_1 (Model)              (None, 1000)              5404000   
_________________________________________________________________
dense_2 (Dense)              (None, 10)                10010     
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 5,414,021
Trainable params: 10,021
Non-trainable params: 5,404,000
_________________________________________________________________
