In [1]:
# - Outlook for multiple classes: for multi-class prediction 
#     use softmax but for multi-label prediction use sigmoid
# - Use aws deep learning instance: 
#     https://docs.aws.amazon.com/dlami/latest/devguide/keras-mxnet.html
# - Use t-SNE to visualize the word embeddings
# - Similarity search by using learned embeddings, i.e. 
#     https://blog.insightdatascience.com/the-unreasonable-effectiveness-of-deep-learning-representations-4ce83fc663cf
#     (especially Spotify Annoy Index)
# - Test out 1d-conv layers vs. stacked LSTM
# - in order to handle arbitrary length input: 
#     (1) remove input_length from embedding layer,
#     (2) batch wise apply pad_sequences to training input

In [2]:
from keras.datasets import imdb
from keras.preprocessing import sequence 
from keras.layers import LSTM, Embedding, Dense, Flatten, Bidirectional
from keras.models import Sequential
from keras.utils import get_file
from keras.initializers import Constant
from keras.preprocessing.text import Tokenizer
import numpy as np
from nltk.corpus import reuters
from sklearn.preprocessing import MultiLabelBinarizer

MAX_NUM_WORDS = 5000
MAX_SEQUENCE_LEN = 500

Using TensorFlow backend.


In [3]:
categs = sorted([(cat, len(reuters.fileids(categories=cat))) for cat in reuters.categories()], key=lambda x: -x[1])
categs[:10]

[('earn', 3964),
 ('acq', 2369),
 ('money-fx', 717),
 ('grain', 582),
 ('crude', 578),
 ('trade', 485),
 ('interest', 478),
 ('ship', 286),
 ('wheat', 283),
 ('corn', 237)]

In [4]:
top_categories = [cat for cat, _ in categs[2:7]] 
fileids = reuters.fileids(categories=top_categories)
fileids_test = [fid for fid in fileids if fid.startswith("test")]
fileids_train = [fid for fid in fileids if fid.startswith("train")]

In [5]:
mlb = MultiLabelBinarizer(top_categories)
mlb.fit(reuters.categories(fid) for fig in fileids)
y_train = mlb.transform(reuters.categories(fid) for fid in fileids_train)
y_test = mlb.transform(reuters.categories(fid) for fid in fileids_test)

  .format(sorted(unknown, key=str)))
  .format(sorted(unknown, key=str)))


In [6]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(reuters.raw(fid) for fid in fileids)
X_train = tokenizer.texts_to_sequences(reuters.raw(fid) for fid in fileids_train)
X_test = tokenizer.texts_to_sequences(reuters.raw(fid) for fid in fileids_test)

In [7]:
X_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LEN)
X_test = sequence.pad_sequences(X_test)

In [8]:
# get glove coeff matrix
embeddings_index = {}
fname = get_file("glove.6B.100d.txt", "http://nlp.stanford.edu/data/glove.6B.zip", extract=True)
with open(fname, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# prepare pre-learned embedding matrix
embdedding_dim = 100
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, embdedding_dim))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
glove_embedding = Embedding(num_words, embdedding_dim,
                            embeddings_initializer=Constant(embedding_matrix), trainable=True)

Found 400000 word vectors.


In [11]:
# Build the model 
model = Sequential()
model.add(glove_embedding)
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(100, activation='relu', name='fc')) 
model.add(Dense(len(top_categories), activation='sigmoid')) 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         500100    
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 32)          17024     
_________________________________________________________________
lstm_5 (LSTM)                (None, None, 32)          8320      
_________________________________________________________________
lstm_6 (LSTM)                (None, 32)                8320      
_________________________________________________________________
fc (Dense)                   (None, 100)               3300      
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 505       
Total params: 537,569
Trainable params: 537,569
Non-trainable params: 0
_________________________________________________________________
None

In [15]:
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1)

Train on 1695 samples, validate on 189 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1331b15c0>

In [41]:
model.evaluate(X_test, y_test)



[0.3781221391117505, 0.8248588035335649]

In [16]:
mlb.inverse_transform(model.predict(X_test[0:10])>0.5)

[(), (), (), (), (), ('money-fx',), (), ('money-fx',), (), ()]

In [17]:
mlb.inverse_transform(y_test[0:10])

[('trade',),
 ('grain',),
 ('crude',),
 ('grain', 'trade'),
 ('grain',),
 ('money-fx', 'interest'),
 ('grain', 'trade'),
 ('money-fx', 'interest'),
 ('trade',),
 ('trade',)]

In [44]:
tokenizer.sequences_to_texts(X_train[0:1])

["u k growing with japan thatcher prime minister margaret thatcher said the u k was growing more with japanese trade barriers and warned that it would soon have new powers against countries not offering access to their markets she told parliament that the bid by the u k 's cable and wireless plc lt l to enter the japanese telecommunications market was being regarded by her government as a test case i to the prime minister of japan mr nakasone on the fourth of march to our interest on the cable and wireless bid i have not yet had a reply we see this as a test on how open the japanese market really is thatcher said thatcher told parliament that shortly we shall have more powers than we have now when for example the powers under the financial services act and the banking act become available then we shall be able to take action in cases where other countries do not offer the same full access to financial services as we do cable and wireless is seeking a stake in the proposed japanese tele

In [None]:
from keras.models import Model
model_vec = Model(model.input, model.get_layer(name="fc").output)
vecs = model_vec.predict(X_test)

from sklearn.preprocessing import normalize
vecs_norm = normalize(vecs)

In [27]:
search_text = "trade issues ec's with japan member states of the european community are starting to run out"

search_text_vec = model_vec.predict(sequence.pad_sequences(tokenizer.texts_to_sequences([search_text])))
search_text_vec = normalize(search_text_vec)

sorted_indices = np.argsort(-search_text_vec[0].dot(vecs_norm.T))
print(mlb.inverse_transform(y_test[sorted_indices[:5]]))
print(tokenizer.sequences_to_texts(X_test[sorted_indices[:5]]))

[('trade',), ('money-fx',), ('interest',), ('money-fx',), ('money-fx',)]
['canada february trade surplus 1 25 billion dlrs after january 623 mln dlrs surplus canada february trade surplus 1 25 billion dlrs after january 623 mln dlrs surplus', "france's balladur says target zone nearer french finance minister edouard balladur said that the financial community is closer to at a system of target zones for currencies despite the fact that little is being said about them speaking with reporters at the meetings of the international monetary fund balladur said we are not very far from the of target zones even if we don't say so he told reporters that our ideas are progressing adding that the finance ministers have been talking about more cooperation on economic policies and on levels around which currencies should stabilize", "u k reserves lift hopes of further base rate cut the record 4 9 billion dlrs rise in u k reserves in may to a total 34 7 billion has lifted hopes for a further cut in b

In [20]:
sorted_indices = np.argsort(-vecs_norm[0].dot(vecs_norm.T))
print(mlb.inverse_transform(y_test[0:1]))
print(tokenizer.sequences_to_texts(X_test[0:1]))

print(mlb.inverse_transform(y_test[sorted_indices[:5]]))
print(tokenizer.sequences_to_texts(X_test[sorted_indices[:5]]))

[('trade',)]
[('trade',), ('trade',), ('money-fx', 'interest'), ('crude',), ('grain',)]
