In [1]:
# - Outlook for multiple classes: for multi-class prediction 
#     use softmax but for multi-label prediction use sigmoid
# - Use aws deep learning instance: 
#     https://docs.aws.amazon.com/dlami/latest/devguide/keras-mxnet.html
# - Use t-SNE to visualize the word embeddings
# - Similarity search by using learned embeddings, i.e. 
#     https://blog.insightdatascience.com/the-unreasonable-effectiveness-of-deep-learning-representations-4ce83fc663cf
#     (especially Spotify Annoy Index)
# - Test out 1d-conv layers vs. stacked LSTM
# - in order to handle arbitrary length input: 
#     (1) remove input_length from embedding layer,
#     (2) batch wise apply pad_sequences to training input

# TODO:
# - multi-label, unbalanced dataset: use class_wheight, etc. (https://blog.mimacom.com/text-classification/, https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras)
# - use CNN before or after LSTM layers
# - use other metrics to assess model quality (recall, confusion matrix, etc.)
# - implement http://www.aclweb.org/anthology/W18-0913

In [82]:
from keras.datasets import imdb
from keras.preprocessing import sequence 
from keras.layers import LSTM, Embedding, Dense, Flatten, Bidirectional, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.utils import get_file
from keras.initializers import Constant
from keras.preprocessing.text import Tokenizer
import numpy as np
from nltk.corpus import reuters
from sklearn.preprocessing import MultiLabelBinarizer

MAX_NUM_WORDS = 5000
MAX_SEQUENCE_LEN = 500

In [3]:
#categs = sorted([(cat, len(reuters.fileids(categories=cat))) for cat in reuters.categories()], key=lambda x: -x[1])
#categs[:10]

In [4]:
#top_categories = [cat for cat, _ in categs[2:7]]
#fileids = reuters.fileids(categories=top_categories)
fileids = reuters.fileids()
fileids_test = [fid for fid in fileids if fid.startswith("test")]
fileids_train = [fid for fid in fileids if fid.startswith("train")]

In [5]:
mlb = MultiLabelBinarizer()
mlb.fit(reuters.categories(fid) for fid in fileids_train)
y_train = mlb.transform(reuters.categories(fid) for fid in fileids_train)
y_test = mlb.transform(reuters.categories(fid) for fid in fileids_test)

In [6]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(reuters.raw(fid) for fid in fileids)
X_train = tokenizer.texts_to_sequences(reuters.raw(fid) for fid in fileids_train)
X_test = tokenizer.texts_to_sequences(reuters.raw(fid) for fid in fileids_test)

In [7]:
X_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LEN)
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LEN)

In [22]:
# get glove coeff matrix
embeddings_index = {}
fname = get_file("glove.6B.100d.txt", "http://nlp.stanford.edu/data/glove.6B.zip", extract=True)
with open(fname, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# prepare pre-learned embedding matrix
embdedding_dim = 100
word_index = tokenizer.word_index
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, embdedding_dim))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [110]:
glove_embedding = Embedding(num_words, embdedding_dim, input_length=MAX_SEQUENCE_LEN,
                            embeddings_initializer=Constant(embedding_matrix), trainable=True)

# Build the model 
model = Sequential()
model.add(glove_embedding)
model.add(Dropout(0.1))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.1))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.1))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.1))
#model.add(Dropout(0.1))
#model.add(LSTM(100, return_sequences=True))
#model.add(LSTM(100))
#model.add(Dropout(0.1))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.1))
#model.add(Dense(512, activation='relu', name='fc'))
#model.add(Dropout(0.1))
model.add(Dense(mlb.classes_.shape[0], activation='sigmoid')) 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 500, 100)          500100    
_________________________________________________________________
dropout_66 (Dropout)         (None, 500, 100)          0         
_________________________________________________________________
conv1d_61 (Conv1D)           (None, 496, 128)          64128     
_________________________________________________________________
max_pooling1d_53 (MaxPooling (None, 99, 128)           0         
_________________________________________________________________
dropout_67 (Dropout)         (None, 99, 128)           0         
_________________________________________________________________
conv1d_62 (Conv1D)           (None, 95, 128)           82048     
_________________________________________________________________
max_pooling1d_54 (MaxPooling (None, 19, 128)           0         
__________

In [111]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, shuffle=True)

Train on 6992 samples, validate on 777 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15a1c3f60>

In [112]:
y_test_pred = model.predict(X_test, verbose=1)



In [113]:
from sklearn.metrics import classification_report
PRED_THRESHOLD = 0.5
print(classification_report(y_test, y_test_pred>PRED_THRESHOLD, target_names=mlb.classes_))

                 precision    recall  f1-score   support

            acq       0.95      0.97      0.96       719
           alum       0.00      0.00      0.00        23
         barley       0.00      0.00      0.00        14
            bop       0.45      0.43      0.44        30
        carcass       0.58      0.39      0.47        18
     castor-oil       0.00      0.00      0.00         1
          cocoa       0.81      0.72      0.76        18
        coconut       0.00      0.00      0.00         2
    coconut-oil       0.00      0.00      0.00         3
         coffee       0.96      0.96      0.96        28
         copper       1.00      0.22      0.36        18
     copra-cake       0.00      0.00      0.00         1
           corn       0.87      0.73      0.80        56
         cotton       0.00      0.00      0.00        20
     cotton-oil       0.00      0.00      0.00         2
            cpi       0.50      0.32      0.39        28
            cpu       0.00    

In [13]:
#####################################################################################################################
############################################# TESTING ###############################################################
#####################################################################################################################