In [43]:
import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, RepeatVector, multiply
from keras.layers import Bidirectional, LSTM, MaxPooling1D, Embedding, Flatten
from keras.models import Model
from keras.initializers import Constant
from sklearn.metrics import classification_report

In [14]:
BASE_DIR = '/Users/Masters/Downloads/'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
EMBEDDING_DIM_DRUGS = 30
VALIDATION_SPLIT = 0.2

In [3]:
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [4]:
df = pd.read_csv('/Users/Masters/Downloads/train_F3WbcTw.csv')

In [5]:
df.columns

Index(['unique_hash', 'text', 'drug', 'sentiment'], dtype='object')

In [6]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['text'].values)
sequences = tokenizer.texts_to_sequences(df['text'].values)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(df['sentiment']))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Found 45846 unique tokens.
Shape of data tensor: (5279, 1000)
Shape of label tensor: (5279, 3)


In [8]:
tokenizer_query = {drug: i for i, drug in enumerate(list(set(df['drug'].values)))}
drug_sequences = np.array([tokenizer_query[drug] for drug in df['drug'].values])
drug_sequences = drug_sequences.reshape((len(drug_sequences), 1))

In [12]:
print(drug_sequences.shape, len(tokenizer_query))

(5279, 1) 102


In [10]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
drug_sequences = drug_sequences[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
q_train = drug_sequences[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
q_val = drug_sequences[-num_validation_samples:]


In [11]:
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [20]:
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

embedding_layer_query = Embedding(len(tokenizer_query),
                            EMBEDDING_DIM_DRUGS,
                            trainable=True,
                            input_length=1)

In [40]:
# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
query_input = Input(shape=(1,), dtype='int32')

embedded_sequences = embedding_layer(sequence_input)
embedded_query = embedding_layer_query(query_input)
embedded_query = Flatten()(embedded_query)

x = Bidirectional(LSTM(15, activation='relu', return_sequences=True))\
                            (embedded_sequences)
embedded_query = RepeatVector(MAX_SEQUENCE_LENGTH)(embedded_query)
multiplied = multiply([embedded_query, x])
output = Dense(1, activation='softmax')(multiplied)
output = Flatten()(output)

output = Dense(3, activation='softmax')(output)

model = Model(inputs=[sequence_input, query_input], outputs=output)


(None, 1000, 30) (None, 1000, 30)


In [41]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_29 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 1, 30)        3060        input_29[0][0]                   
__________________________________________________________________________________________________
input_28 (InputLayer)           (None, 1000)         0                                            
__________________________________________________________________________________________________
flatten_11 (Flatten)            (None, 30)           0           embedding_5[11][0]               
__________________________________________________________________________________________________
embedding_

In [42]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.fit([x_train, q_train], y_train,
          batch_size=128,
          epochs=10,
          validation_data=([x_val, q_val], y_val))

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 4224 samples, validate on 1055 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0xb370a59b0>

In [44]:
y_pred = model.predict([x_val, q_val])

In [53]:
preds = y_pred.argmax(axis=1)
trues = y_val.argmax(axis=1)

In [54]:
print(classification_report(trues, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       119
           1       0.00      0.00      0.00       193
           2       0.70      1.00      0.83       743

    accuracy                           0.70      1055
   macro avg       0.23      0.33      0.28      1055
weighted avg       0.50      0.70      0.58      1055



  'precision', 'predicted', average, warn_for)
