# 1. 使用one-hot 初始编码进行训练和预测

In [23]:
from keras.datasets import imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

In [7]:
max([max(sequence) for sequence in train_data])

9999

In [9]:
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
decode_review = [reverse_word_index.get(i-3, '?') for i in train_data[0]]
decode_review

In [13]:
import numpy as np

def vectorize_sequences(sequences, dimension=10000):
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

In [22]:
from keras import models
from keras import layers
from keras import optimizers

model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(10000, )))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=optimizers.RMSprop(lr=0.001),
             loss='binary_crossentropy',
             metrics=['accuracy'])

x_val = x_train[:10000]
partial_x_train = x_train[10000:]

y_val = y_train[:10000]
partial_y_train = y_train[10000:]
history = model.fit(partial_x_train,
                   partial_y_train,
                   epochs=20,
                   batch_size=512,
                   validation_data=[x_val, y_val])

Train on 15000 samples, validate on 10000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## 找到最佳情况为第五个epoch并测试

In [30]:
from sklearn import metrics

history = model.fit(partial_x_train,
                   partial_y_train,
                   epochs=5,
                   batch_size=512,
                   validation_data=[x_val, y_val])

y_prob = model.predict(x_test)
y_pred = y_prob
y_pred[y_pred >= 0.5] = 1
y_pred[y_pred < 0.5] = 0

test_acc = metrics.accuracy_score(y_test, y_pred)
test_precision = metrics.precision_score(y_test, y_pred)
test_recall = metrics.recall_score(y_test, y_pred)
test_auc = metrics.roc_auc_score(y_test, y_prob)

print("测试集的准确率为:", test_acc)
print("测试集的精确率为:", test_precision)
print("测试集的召回率为:", test_recall)
print("测试集的auc值为:", test_auc)

Train on 15000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
测试集的准确率为: 0.84252
测试集的精确率为: 0.8521839269556634
测试集的召回率为: 0.8288
测试集的auc值为: 0.8425199999999999


# 2. 使用embedding层

In [44]:
from keras.datasets import imdb
from keras import preprocessing

max_features = 10000
maxlen = 100

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

## 2.1 只使用embedding层进行训练和预测

In [45]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

model = Sequential()
model.add(Embedding(10000, 64, input_length=maxlen))

model.add(Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
history = model.fit(x_train, y_train,
epochs=10,
batch_size=32,
validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 64)           640000    
_________________________________________________________________
flatten_6 (Flatten)          (None, 6400)              0         
_________________________________________________________________
dense_22 (Dense)             (None, 512)               3277312   
_________________________________________________________________
dense_23 (Dense)             (None, 128)               65664     
_________________________________________________________________
dense_24 (Dense)             (None, 32)                4128      
_________________________________________________________________
dense_25 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_26 (Dense)             (None, 1)                 17        
Total para

In [48]:
model = Sequential()
model.add(Embedding(10000, 64, input_length=maxlen))

model.add(Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
history = model.fit(x_train, y_train,
epochs=2,
batch_size=32,
validation_split=0.2)

y_prob = model.predict(x_test)
y_pred = y_prob
y_pred[y_pred >= 0.5] = 1
y_pred[y_pred < 0.5] = 0

test_acc = metrics.accuracy_score(y_test, y_pred)
test_precision = metrics.precision_score(y_test, y_pred)
test_recall = metrics.recall_score(y_test, y_pred)
test_auc = metrics.roc_auc_score(y_test, y_prob)

print("测试集的准确率为:", test_acc)
print("测试集的精确率为:", test_precision)
print("测试集的召回率为:", test_recall)
print("测试集的auc值为:", test_auc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 100, 64)           640000    
_________________________________________________________________
flatten_8 (Flatten)          (None, 6400)              0         
_________________________________________________________________
dense_32 (Dense)             (None, 512)               3277312   
_________________________________________________________________
dense_33 (Dense)             (None, 128)               65664     
_________________________________________________________________
dense_34 (Dense)             (None, 32)                4128      
_________________________________________________________________
dense_35 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_36 (Dense)             (None, 1)                 17        
Total para

## 2.2 使用glove预训练词向量

### 2.2.1. 处理imdb原始数据和标签

In [33]:
import os
imdb_dir = './data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

In [34]:
texts[0]

'Frank Sinatra was far from the ideal actor for westerns. He was a great actor, From Here to Eternity and The Man with The Golden arm are a proof of that, but he did not have the physique of a western hero, you identified him as an urban guy. But he tried to do his job well in Johnny Concho, the fact that the film was a failure at the box office was not his fault. I blame it on two factors: a) the story was too unusual, specially in the fact that Sinatra behaves more like a villain than as a hero throughout the movie. In a genre where people kind of expected a certain pattern, to break away from it the film has to be very good. b) the story is not convincing, it is hard to believe that a whole town will allow Sinatra to do anything he wants just because they are afraid of his brother. Also when a man shows him a special holster that will open sideways so he has not to draw the gun you wonder that if that will make him invincible, why all the gunfighters have not adopted it? I think tha

In [35]:
labels[0]

0

In [36]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 400
train_samples = 20000
validation_samples = 5000
max_words = 5000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:train_samples]
y_train = labels[:train_samples]

x_val = data[train_samples: train_samples+validation_samples]
y_val = labels[train_samples: train_samples+validation_samples]

Found 88582 unique tokens.
Shape of data tensor: (25000, 400)
Shape of label tensor: (25000,)


In [37]:
glove_dir = './embedding/'

embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [38]:
sorted(word_index.items(), key=lambda d: d[1], reverse=True)

[('belivable', 88582),
 ('macshane', 88581),
 ('cartmans', 88580),
 ('bachar', 88579),
 ('dvda', 88578),
 ('pacy', 88577),
 ('octagonal', 88576),
 ('hexagonal', 88575),
 ("cote's", 88574),
 ('threequels', 88573),
 ('shying', 88572),
 ('fetishwear', 88571),
 ('exhuberance', 88570),
 ("moviegoer's", 88569),
 ('picnicking', 88568),
 ('darwinian', 88567),
 ('blueish', 88566),
 ("warhols'", 88565),
 ('schreck', 88564),
 ('potee', 88563),
 ('manouvres', 88562),
 ('grandes', 88561),
 ('doozys', 88560),
 ('ascots', 88559),
 ("d'angelo's", 88558),
 ('megastar', 88557),
 ('waheeda', 88556),
 ("fischer's", 88555),
 ('choronzhon', 88554),
 ('psychomania', 88553),
 ('hypnotising', 88552),
 ('kitties', 88551),
 ('540i', 88550),
 ("ackroyd's", 88549),
 ("spot'", 88548),
 ('brimful', 88547),
 ('exporters', 88546),
 ('spasmodic', 88545),
 ("''their", 88544),
 ("pair''", 88543),
 ("''nice", 88542),
 ("kieslowski's", 88541),
 ('psychoanalyzes', 88540),
 ('outbreaking', 88539),
 ('chillness', 88538),
 ("'

In [39]:
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embeddings_vector = embeddings_index.get(word)
        if embeddings_vector is not None:
            embedding_matrix[i] = embeddings_vector

In [40]:
from keras.models import Sequential
from keras import layers
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 400, 100)          500000    
_________________________________________________________________
flatten_4 (Flatten)          (None, 40000)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 32)                1280032   
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 33        
Total params: 1,780,065
Trainable params: 1,780,065
Non-trainable params: 0
_________________________________________________________________


In [41]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [42]:
model.compile(optimizer='rmsprop',
             loss='binary_crossentropy',
             metrics=['acc'])

history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

model.save_weights('pre_trained_glove_model.h5')

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
test_dir = os.path.join(imdb_dir, 'test')

labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
                
sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels)

In [44]:
model.load_weights('pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)



[1.09888627307415, 0.71692]

## 2.3. TextCNN + glove预训练向量

In [45]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

class TextCNN(object):
    def __init__(self, maxlen, max_features, embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation

    def get_model(self):
        input = Input((self.maxlen,))

        # Embedding part can try multichannel as same as origin paper
        embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)(input)
        convs = []
        for kernel_size in [3, 4, 5]:
            c = Conv1D(128, kernel_size, activation='relu')(embedding)
            c = GlobalMaxPooling1D()(c)
            convs.append(c)
        x = Concatenate()(convs)

        output = Dense(self.class_num, activation=self.last_activation)(x)
        model = Model(inputs=input, outputs=output)
        return model

In [50]:
model = TextCNN(maxlen, max_words, embedding_dim).get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 400, 100)     500000      input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 398, 128)     38528       embedding_8[0][0]                
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 397, 128)     51328       embedding_8[0][0]                
__________________________________________________________________________________________________
conv1d_12 

In [51]:
model.layers[1].set_weights([embedding_matrix])
#model.layers[1].trainable = False

In [52]:
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=32,
          epochs=10,
          callbacks=[early_stopping],
          validation_data=(x_val, y_val))

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7feec9749780>

In [53]:
model.evaluate(x_test, y_test)



[0.37440894957132637, 0.90008]

In [54]:
model2 = TextCNN(maxlen, max_words, embedding_dim).get_model()
model2.layers[1].set_weights([embedding_matrix])
model2.layers[1].trainable = False
model2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 400, 100)     500000      input_5[0][0]                    
__________________________________________________________________________________________________
conv1d_13 (Conv1D)              (None, 398, 128)     38528       embedding_9[0][0]                
__________________________________________________________________________________________________
conv1d_14 (Conv1D)              (None, 397, 128)     51328       embedding_9[0][0]                
__________________________________________________________________________________________________
conv1d_15 

In [57]:
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
model2.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model2.fit(x_train, y_train,
          batch_size=32,
          epochs=10,
          callbacks=[early_stopping],
          validation_data=(x_val, y_val))

model2.evaluate(x_test, y_test)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.35344335737906396, 0.88592]

In [58]:
model3 = TextCNN(maxlen, max_words, embedding_dim).get_model()
model3.layers[1].set_weights([embedding_matrix])
#model2.layers[1].trainable = False
model3.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 400, 100)     500000      input_6[0][0]                    
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 398, 128)     38528       embedding_10[0][0]               
__________________________________________________________________________________________________
conv1d_17 (Conv1D)              (None, 397, 128)     51328       embedding_10[0][0]               
__________________________________________________________________________________________________
conv1d_18 

In [63]:
x_total = np.concatenate((x_train, x_val), axis=0)
y_total = np.concatenate((y_train, y_val), axis=0)

In [64]:
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
model3.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model3.fit(x_total, y_total,
          batch_size=32,
          epochs=10,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

model3.evaluate(x_test, y_test)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


[0.3154137371329218, 0.90404]

In [67]:
model4 = TextCNN(maxlen, max_words, embedding_dim).get_model()
#model4.layers[1].set_weights([embedding_matrix])
#model2.layers[1].trainable = False
model4.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 400)          0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 400, 100)     500000      input_7[0][0]                    
__________________________________________________________________________________________________
conv1d_19 (Conv1D)              (None, 398, 128)     38528       embedding_11[0][0]               
__________________________________________________________________________________________________
conv1d_20 (Conv1D)              (None, 397, 128)     51328       embedding_11[0][0]               
__________________________________________________________________________________________________
conv1d_21 

In [68]:
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
model4.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
model4.fit(x_total, y_total,
          batch_size=32,
          epochs=10,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))

model4.evaluate(x_test, y_test)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


[0.3875766246571578, 0.88988]