In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, RepeatVector, multiply
from keras.layers import Bidirectional, LSTM, MaxPooling1D, Embedding, GRU
from keras.layers import Flatten, Softmax, Activation, Lambda
from keras.models import Model
from keras.initializers import Constant
from sklearn.metrics import classification_report
from keras.optimizers import Adam
import fasttext
import keras.backend as K
import tensorflow as tf
from sklearn.utils import class_weight


Using TensorFlow backend.


In [None]:
model = fasttext.train_unsupervised(
    '/Users/Masters/Documents/repos/sentiment_analysis/corpus.txt',
    model='skipgram',
    dim = 20)


In [None]:
model.save_model("fast_text_sentiment_20d.bin")


In [2]:
model = fasttext.load_model("fast_text_sentiment_20d.bin")





In [3]:
BASE_DIR = '/Users/Masters/Downloads/'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')
MAX_SEQUENCE_LENGTH = 200
MAX_NUM_WORDS = 5000
EMBEDDING_DIM = 20
VALIDATION_SPLIT = 0.2

In [None]:
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        
print('Found %s word vectors.' % len(embeddings_index))

In [4]:
df = pd.read_csv('/Users/Masters/Downloads/train_F3WbcTw.csv')

In [None]:
df.columns

In [None]:
df.head()

In [None]:
lens = [len(tx.split(' ')) for tx in df['text'].values]

In [None]:
lens

In [None]:
df['sentiment'].value_counts()

In [5]:
#df = df[(df['sentiment'] == 0) | (df['sentiment'] == 1)]
df = df[df['drug'].isin(['gilenya', 'ocrevus', 'ocrelizumab', 'fingolimod',
                        'opdivo', 'remicade', 'humira', 'entyvio',
                        'tarceva', 'cladribine', 'keytruda', 'stelara', 'tagrisso',
                        'alimta'])]

In [None]:
sample_weights = class_weight.compute_sample_weight('balanced',
                                                    df['sentiment'].values)


In [6]:
weights_dict = {2: 0.5, 1: 5.0, 0:10}
sample_weights = np.array([weights_dict[w] for w in df['sentiment'].values])

In [7]:
print(set(sample_weights))
print(sample_weights, df['sentiment'].values)
print(df['sentiment'].value_counts())

{0.5, 10.0, 5.0}
[ 0.5  0.5  0.5 ...  0.5  0.5 10. ] [2 2 2 ... 2 2 0]
2    2915
1     734
0     483
Name: sentiment, dtype: int64


In [8]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['text'].values+df['drug'].values)
sequences = tokenizer.texts_to_sequences(df['text'].values)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(df['sentiment']))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

drug_sequences = tokenizer.texts_to_sequences(df['drug'].values)
drug_sequences = np.array(drug_sequences)
drug_sequences.shape

Found 34603 unique tokens.
Shape of data tensor: (4132, 200)
Shape of label tensor: (4132, 3)


(4132, 1)

In [None]:
df['drug'].value_counts()

In [9]:
# split the data into a training set and a validation set
random.seed(9001)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
drug_sequences = drug_sequences[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
q_train = drug_sequences[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]
q_val = drug_sequences[-num_validation_samples:]
sample_weights_train = sample_weights[:-num_validation_samples]
sample_weights_val = sample_weights[-num_validation_samples:]


In [None]:
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [10]:
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = model[word]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [11]:
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            trainable=False)


In [None]:
# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
query_input = Input(shape=(1,), dtype='int32')

embedded_sequences = embedding_layer(sequence_input)
embedded_query = embedding_layer(query_input)
embedded_query = Flatten()(embedded_query)

x = Bidirectional(GRU(10, activation='relu', return_sequences=True))\
                            (embedded_sequences)
embedded_query = RepeatVector(MAX_SEQUENCE_LENGTH)(embedded_query)
multiplied = multiply([embedded_query, x])
output = Dense(1, activation='relu', kernel_regularizer='l2')(multiplied)
output = Flatten()(output)
output = Activation('softmax')(output)

output = Dense(2, activation='softmax')(output)

model = Model(inputs=[sequence_input, query_input], outputs=output)


In [12]:
def dot_prod(var):
    return K.sum(var[0] * var[1],axis=-1,keepdims=True)

def weighted_sum(var):
    context_vector = var[0] * var[1]
    return tf.reduce_sum(context_vector, axis=1)

In [13]:
# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
query_input = Input(shape=(1,), dtype='int32')

embedded_sequences = embedding_layer(sequence_input)
embedded_query = embedding_layer(query_input)
embedded_query = Flatten()(embedded_query)

values = Bidirectional(GRU(10, activation='relu', return_sequences=True))\
                    (embedded_sequences)
embedded_query = RepeatVector(MAX_SEQUENCE_LENGTH)(embedded_query)
attention_weights = Lambda(dot_prod)([embedded_query, values])
attention_weights = Activation('softmax')(attention_weights)
context_vector = Lambda(weighted_sum)([attention_weights, values])

output = Dense(3, activation='softmax')(context_vector)
model = Model(inputs=[sequence_input, query_input], outputs=output)


Instructions for updating:
Colocations handled automatically by placer.


In [None]:
model.summary()

In [None]:
y_train[0].shape

In [14]:
optim = Adam()

model.compile(loss='categorical_crossentropy',
              optimizer=optim,
              metrics=['acc'])


In [None]:
model.fit([x_train[0].reshape(1,1000),
           q_train[0]],
          y_train[0].reshape(1,2),
          batch_size=128,
          epochs=30,
          validation_data=([x_train[0].reshape(1,1000),
                            q_train[0]],
                           y_train[0].reshape(1,2)))

In [15]:
model.fit([x_train, q_train], y_train,
          sample_weight = sample_weights_train,
          batch_size=128,
          epochs=30,
          validation_data=([x_val, q_val], y_val))

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 3306 samples, validate on 826 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1a355b8908>

In [16]:
y_pred = model.predict([x_val, q_val])

In [17]:
preds = y_pred.argmax(axis=1)
trues = y_val.argmax(axis=1)

In [18]:
print(classification_report(trues, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        88
           1       0.00      0.00      0.00       132
           2       0.73      1.00      0.85       606

    accuracy                           0.73       826
   macro avg       0.24      0.33      0.28       826
weighted avg       0.54      0.73      0.62       826



  'precision', 'predicted', average, warn_for)
