# [把 ELMo 作为 Keras 的一个嵌入层使用](https://github.com/strongio/keras-elmo)

In [1]:
# Import our dependencies
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
from keras import backend as K
import keras.layers as layers
from keras.models import Model
import numpy as np

# Initialize session
# Create a single TF1 session and register it with the Keras backend; later
# cells reuse `sess` to run the variable and table initializers.
sess = tf.Session()
K.set_session(sess)

Using TensorFlow backend.


In [2]:
def load_directory_data(directory):
    """Read every review file found in `directory` into a DataFrame.

    File names are expected to look like ``<id>_<rating>.txt``. The rating
    captured from the name is stored, as a string, in the ``sentiment``
    column and the file contents in the ``sentence`` column. Directory
    entries whose name does not follow that pattern are skipped.

    Args:
        directory: Path of the folder to scan (listed with ``os.listdir``).

    Returns:
        pandas.DataFrame with the columns ``sentence`` and ``sentiment``,
        one row per matching file.
    """
    # Raw string literal: "\d" inside a normal string is an invalid escape.
    name_pattern = re.compile(r"\d+_(\d+)\.txt")
    data = {"sentence": [], "sentiment": []}
    for file_name in os.listdir(directory):
        match = name_pattern.match(file_name)
        if match is None:
            # Stray entries (hidden files, editor backups, ...) would otherwise
            # crash with AttributeError on `None.group`.
            continue
        with tf.gfile.GFile(os.path.join(directory, file_name), "r") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(match.group(1))
    return pd.DataFrame.from_dict(data)

def load_dataset(directory, random_state=None):
    """Load the ``pos`` and ``neg`` sub-folders of `directory` as one shuffled frame.

    Each sub-folder is read with ``load_directory_data``; a ``polarity``
    column is added (1 for ``pos``, 0 for ``neg``), the two frames are
    concatenated, the rows are shuffled and the index is rebuilt as 0..n-1.

    Args:
        directory: Folder that contains the ``pos`` and ``neg`` sub-folders.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. The default ``None`` keeps the
            original unseeded behaviour.

    Returns:
        pandas.DataFrame holding the columns produced by
        ``load_directory_data`` plus ``polarity``.
    """
    pos_df = load_directory_data(os.path.join(directory, "pos"))
    neg_df = load_directory_data(os.path.join(directory, "neg"))
    pos_df["polarity"] = 1
    neg_df["polarity"] = 0
    combined = pd.concat([pos_df, neg_df])
    # frac=1 draws every row exactly once, i.e. a full permutation.
    shuffled = combined.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

def download_and_load_datasets(force_download=False):
    """Fetch the IMDB review archive and load its train and test splits.

    The archive is obtained through ``tf.keras.utils.get_file`` with
    ``extract=True``. The ``aclImdb/train`` and ``aclImdb/test`` folders
    located next to the returned archive path are each handed to
    ``load_dataset``.

    NOTE(review): ``force_download`` is accepted but never read by this
    function.

    Returns:
        Tuple ``(train_df, test_df)`` as produced by ``load_dataset``.
    """
    archive_path = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True,
    )
    # The extracted corpus lives beside the downloaded archive.
    corpus_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_df, test_df = [
        load_dataset(os.path.join(corpus_root, split_name))
        for split_name in ("train", "test")
    ]
    return train_df, test_df

# Reduce logging output.
# Sets TensorFlow's log verbosity threshold to ERROR.
tf.logging.set_verbosity(tf.logging.ERROR)

# Load both splits with the helper defined above, then preview the first rows
# of the training split as this cell's output.
train_df, test_df = download_and_load_datasets()
train_df.head()

Unnamed: 0,sentence,sentiment,polarity
0,**SPOILERS** Redicules slasher film that makes...,4,0
1,1st watched 2/2/2003 - 4 out of 10(Dir-Jim Kam...,4,0
2,Of the three titles from Jess Franco to find t...,3,0
3,This could well be the worst film I've ever se...,1,0
4,I feel like I'm the only kid in town who was a...,3,0


In [3]:
# Preview a larger slice (up to 30 rows) of the shuffled training frame.
train_df.head(30)

Unnamed: 0,sentence,sentiment,polarity
0,**SPOILERS** Redicules slasher film that makes...,4,0
1,1st watched 2/2/2003 - 4 out of 10(Dir-Jim Kam...,4,0
2,Of the three titles from Jess Franco to find t...,3,0
3,This could well be the worst film I've ever se...,1,0
4,I feel like I'm the only kid in town who was a...,3,0
5,Although time has revealed how some of the eff...,8,1
6,The plot doesn't begin to describe the film: a...,8,1
7,"""Hollywood North"" is an euphemism from the mov...",7,1
8,"What a mess!! Why was this movie made? This, a...",1,0
9,"In some ways, the concept behind the storyline...",2,0


In [4]:
# Now instantiate the elmo model
# The module location is resolved in this order so the notebook also runs on
# machines that lack the original author's directory layout:
#   1. the ELMO_MODULE_PATH environment variable, when set and non-empty;
#   2. the original local copy, when that directory exists;
#   3. the public TF Hub handle.
ELMO_LOCAL_DIR = "/home/b418/jupyter_workspace/B418_common/袁宵/tfhub_modules/elmo/"
ELMO_HUB_URL = "https://tfhub.dev/google/elmo/2"
elmo_handle = os.environ.get("ELMO_MODULE_PATH") or (
    ELMO_LOCAL_DIR if os.path.isdir(ELMO_LOCAL_DIR) else ELMO_HUB_URL
)
elmo_model = hub.Module(elmo_handle, trainable=True)
# Run the global-variable and table initializers in the shared session now
# that the module has been instantiated.
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [5]:
# Build our model

# We create a function to integrate the tensorflow model with a Keras model
# This requires explicitly casting the tensor to a string, because of a Keras quirk
def ElmoEmbedding(x):
    """Embed a batch of raw sentences with the module-level `elmo_model`.

    Args:
        x: Keras input tensor holding one string per row; the model in this
            notebook feeds it through an Input of shape (1,), i.e.
            (batch, 1).

    Returns:
        The "default" entry of the module's "default" signature output
        (declared to Keras as 1024 values per sentence via `output_shape`).
    """
    # Squeeze axis 1 only: an unqualified squeeze would also drop the batch
    # axis for a batch holding a single sample, handing the module a scalar
    # instead of a 1-D batch of strings.
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    return elmo_model(sentences, signature="default", as_dict=True)["default"]
 
# One raw string per example: Input of shape (1,) with dtype tf.string.
input_text = layers.Input(shape=(1,), dtype=tf.string)
# Wrap the ELMo call in a Lambda layer; output_shape declares a 1024-value
# vector per example to Keras.
embedding = layers.Lambda(ElmoEmbedding, output_shape=(1024,))(input_text)
# Classification head: a 256-unit ReLU layer followed by one sigmoid unit.
dense = layers.Dense(256, activation='relu')(embedding)
pred = layers.Dense(1, activation='sigmoid')(dense)

model = Model(inputs=[input_text], outputs=pred)

# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the recorded summary lists 0 params for the Lambda layer and
# 262,657 trainable params in total (the two Dense layers), so the ELMo
# weights do not appear to be part of what Keras trains - confirm.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
lambda_2 (Lambda)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               262400    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 262,657
Trainable params: 262,657
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Create datasets (Only take up to 150 words for memory)
def _clip_sentences(frame, max_words=150):
    """Return the frame's ``sentence`` column as an (n, 1) object array.

    Each sentence is cut to its first `max_words` whitespace-separated
    tokens and re-joined with single spaces.
    """
    clipped = [
        ' '.join(text.split()[:max_words])
        for text in frame['sentence'].tolist()
    ]
    # Add a trailing axis so every row is a one-element array of text.
    return np.array(clipped, dtype=object)[:, np.newaxis]

train_text = _clip_sentences(train_df)
train_label = train_df['polarity'].tolist()

test_text = _clip_sentences(test_df)
test_label = test_df['polarity'].tolist()

In [9]:
# Fit!
# Train for 5 epochs with mini-batches of 256 samples; 20% of the supplied
# data is held out for validation (20000 train / 5000 validation samples in
# the recorded run below).
model.fit(train_text, 
          train_label,
          epochs=5,
          batch_size=256,
          validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9d073ec390>

# 评估模型

In [10]:
# Score the test split; given the compile settings (loss plus the accuracy
# metric) the recorded result is a two-element list [loss, accuracy].
model.evaluate(test_text, test_label)



[0.4151706755256653, 0.80912]