# [Keras_ELMo_Tutorial Preprocessing](https://github.com/UKPLab/elmo-bilstm-cnn-crf/blob/master/Keras_ELMo_Tutorial.ipynb)

In [1]:
import pandas as pd
import numpy as np
import os
import sys

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from keras.layers import Embedding
from keras.initializers import Constant

from keras.layers import Dense, Dropout, Activation, Input
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

from keras.models import Sequential

Using TensorFlow backend.


In [2]:
import spacy
# Load spaCy's 'en' pipeline with the parser, tagger, NER and text categoriser
# disabled; the notebook only iterates over the tokens of each document.
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])

In [3]:
# Notebook-wide settings.
# NOTE(review): MAX_NUM_WORDS, EMBEDDING_DIM and num_classes are not referenced
# anywhere else in the visible notebook — confirm whether they are still needed.
MAX_NUM_WORDS = 100000
# Number of tokens every document is padded/truncated to; also the time
# dimension of the model's input_shape.
MAX_SEQUENCE_LENGTH = 1000
EMBEDDING_DIM = 100
num_classes = 2

In [4]:
# data: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# The CSV files read here are presumably pre-built from that archive by a
# separate step that is not part of this notebook — TODO confirm.
train_data_df = pd.read_csv('data/imdb_train_pandas_datafram.csv')
test_data_df = pd.read_csv('data/imdb_test_pandas_datafram.csv')

In [5]:
train_data_df.head()

Unnamed: 0,text,label
0,This film is notable for three reasons. First...,0
1,Escaping the life of being pimped by her fathe...,1
2,Wenders was great with Million $ Hotel.I don't...,0
3,Saw this in the theater in '86 and fell out of...,1
4,"A terrible amateur movie director (no, not Tod...",0


In [6]:
# English word segmentation
def use_spacy_segmented_words(a_text_sentence):
    """Tokenise a text with the module-level spaCy pipeline ``nlp``.

    Args:
        a_text_sentence: The raw text to tokenise.

    Returns:
        A list holding every token yielded by iterating the parsed document
        (the token objects themselves, not their string forms).
    """
    return list(nlp(a_text_sentence))

In [7]:
# Bring a tokenised text to a fixed length
def padding_cut_text(datafram_cut_text, MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH):
    """Truncate or right-pad a token sequence to exactly MAX_SEQUENCE_LENGTH items.

    Args:
        datafram_cut_text: Sequence of tokens for one document.
        MAX_SEQUENCE_LENGTH: Target length. The default is the value of the
            module-level constant at the time this function is defined.

    Returns:
        The first MAX_SEQUENCE_LENGTH items when the input is long enough;
        otherwise a new list with the input items followed by empty-string
        padding up to MAX_SEQUENCE_LENGTH.
    """
    raw_cut_text_len = len(datafram_cut_text)
    if raw_cut_text_len >= MAX_SEQUENCE_LENGTH:
        return datafram_cut_text[:MAX_SEQUENCE_LENGTH]
    # Build a fresh list instead of extending the argument with `+=`, so the
    # caller's sequence is never modified as a side effect of padding.
    padding = [""] * (MAX_SEQUENCE_LENGTH - raw_cut_text_len)
    return list(datafram_cut_text) + padding

In [8]:
# Separate the feature text from the labels
def slipe_text_label(data_pandas_datafram):
    """Turn a DataFrame into padded token-string lists and a label list.

    The value at position 0 of each row is treated as the text and the value
    at position 1 as the label.

    Args:
        data_pandas_datafram: DataFrame whose first two columns are, in order,
            the raw text and its label.

    Returns:
        A tuple ``(x_list, label_list)``: ``x_list`` holds one list of token
        strings per row (tokenised, then padded/truncated by
        ``padding_cut_text``), ``label_list`` the matching labels.
    """
    x_list, label_list = [], []
    for _, a_row in data_pandas_datafram.iterrows():
        # Use .iloc for positional access: integer keys passed to [] on a
        # label-indexed Series rely on a deprecated positional fallback.
        raw_cut_text = use_spacy_segmented_words(a_row.iloc[0])
        raw_cut_padding_text = padding_cut_text(raw_cut_text)
        # Tokens and "" padding are both normalised to plain strings.
        x_list.append([str(token) for token in raw_cut_padding_text])
        label_list.append(a_row.iloc[1])
    return x_list, label_list

In [9]:
raw_x_train, raw_y_train = slipe_text_label(train_data_df)

raw_x_test, raw_y_test = slipe_text_label(test_data_df)

In [33]:
from allennlp.commands.elmo import ElmoEmbedder
# https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5
# https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json
# NOTE(review): these are absolute, user-specific paths; the notebook only runs
# where the two files linked above have been saved to exactly these locations.
options_file = "/home/b418/jupyter_workspace/yuanxiao/elmo_data/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
weight_file = "/home/b418/jupyter_workspace/yuanxiao/elmo_data/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"

# Build the embedder from the options/weights pair above.
elmo = ElmoEmbedder(options_file, weight_file) 

In [51]:
# Lookup the ELMo embeddings for all documents (all sentences) in our dataset. Store those
# in a numpy matrix so that we must compute the ELMo embeddings only once.
def create_elmo_embeddings(elmo, documents, max_sentences = 1000):
    """Embed up to ``max_sentences`` documents and keep entry 2 of each result.

    Args:
        elmo: Object exposing ``embed_sentences(documents)``, which yields one
            indexable embedding per document.
        documents: Sequence of tokenised documents (supports ``len`` and
            slicing).
        max_sentences: Maximum number of documents to embed; a value <= 0
            means "embed all documents".

    Returns:
        A list with ``elmo_embedding[2]`` for each processed document, in
        input order. Progress is written to stdout while running.
    """
    num_sentences = min(max_sentences, len(documents)) if max_sentences > 0 else len(documents)
    print("\n\n:: Lookup of "+str(num_sentences)+" ELMo representations. This takes a while ::")
    embeddings = []
    # Hand the embedder only the documents that will be kept, so it never
    # starts work on documents beyond the limit.
    selected_documents = documents[:num_sentences]
    status = '\r{0:3.0f}%{1} {2:3d}/{3:3d} sentences'
    for done, elmo_embedding in enumerate(elmo.embed_sentences(selected_documents), start=1):
        # Keep the entry at index 2 (the third one) of the returned embedding
        # — presumably the top ELMo layer; confirm against the embedder docs.
        embeddings.append(elmo_embedding[2])
        # Some progress info
        percent = 100.0 * done / num_sentences
        filled = int(percent / 2)
        line = '[{0}{1}]'.format('=' * filled, ' ' * (50 - filled))
        sys.stdout.write(status.format(percent, line, done, num_sentences))
    return embeddings


# Embed only the first 100 train and test documents.
# NOTE(review): "x_text_elmo" holds the *test* embeddings (it is built from
# raw_x_test); the name looks like a typo for "x_test_elmo".
x_train_elmo = create_elmo_embeddings(elmo, raw_x_train[:100], 100)
x_text_elmo = create_elmo_embeddings(elmo, raw_x_test[:100], 100)



:: Lookup of 100 ELMo representations. This takes a while ::

:: Lookup of 100 ELMo representations. This takes a while ::

In [54]:
len(x_text_elmo)

100

In [62]:
# Stack the per-document train embeddings into a single array for Keras.
# Presumably (n_docs, MAX_SEQUENCE_LENGTH, 1024), matching the model's
# input_shape — TODO confirm.
x_train_elmo_1 = np.array(x_train_elmo)

In [63]:
# Same stacking for the test-set embeddings (built from raw_x_test).
x_text_elmo_1 = np.array(x_text_elmo)

## 对齐

In [78]:
# Keep only the labels of the first 100 documents so they line up with the
# 100 documents that were embedded above.
y_train = np.array(raw_y_train[:100])
y_test = np.array(raw_y_test[:100])

# 设计模型

In [84]:
def creat_elmo_v1_model():
    """Build and compile the Conv1D classifier that consumes ELMo embeddings.

    The network is Conv1D -> global max pooling -> Dense(1) -> sigmoid and
    expects inputs of shape (MAX_SEQUENCE_LENGTH, 1024) per document. A
    diagram of the model is written to "IMDB_ELMo_Preprocessing.png" as a
    side effect.

    Returns:
        The compiled Sequential model (binary cross-entropy loss, Adam
        optimiser, accuracy metric).
    """
    layer_stack = [
        Conv1D(filters=250,
               kernel_size=3,
               padding='same',
               input_shape=(MAX_SEQUENCE_LENGTH, 1024)),
        GlobalMaxPooling1D(),
        Dense(1),
        Activation('sigmoid'),
    ]
    network = Sequential(layer_stack)

    # Save the architecture diagram before compiling, as the original did.
    plot_model(network, to_file="IMDB_ELMo_Preprocessing.png", show_shapes=True)

    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return network

In [85]:
# NOTE(review): this rebinds the name of the factory function to the model it
# returns, so the function is no longer reachable and re-running this cell
# will not build a new model. Consider a distinct variable name for the model.
creat_elmo_v1_model = creat_elmo_v1_model()

# 训练模型

In [86]:
# Train on the embedded training subset; validation_split=0.2 holds out 20% of
# those samples for validation (80 train / 20 validation in the output below).
creat_elmo_v1_model.fit(x_train_elmo_1, y_train,
          batch_size=2,
          epochs=10,
          validation_split=0.2)

Train on 80 samples, validate on 20 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa61ead44e0>

# 测试模型

In [89]:
# Evaluate on the embedded test subset; the result is [loss, accuracy], per
# the loss and metrics passed to compile().
creat_elmo_v1_model.evaluate(x_text_elmo_1, y_test)



[1.1400712180137633, 0.66]