In [164]:
import tensorflow as tf
import pandas as pd 
import numpy as np
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [12]:
texts = pd.read_csv('data/drugsCom_raw.csv')

In [134]:
sentences = np.array(texts['review'])

In [79]:
# basic preprocessing of sentences and collect final preprocessed sentences in final_setences list 
final_sentences = []
for sent in sentences:
    words = [word for word in sent.split() if ('&' not in  word) or ('#' not in word)]  
    final_words = []
    for word in words:
        if word.startswith('"'):
            word = word[1:]
        if word.endswith('"'):
            word = word[::-1][1::][::-1]
        if word.startswith('('):
            word = word[1::]
        if word.endswith(')'):
            word = word[::-1][1::][::-1]
        if word.endswith('.'):
            word = word[::-1][1::][::-1]
        if word.endswith(','):
            word = word[::-1][1::][::-1]
        if word.endswith(')'):
            word = word[::-1][1::][::-1]
        if word.endswith('.'):
            word = word[::-1][1::][::-1]
        if word.endswith('.'):
            word = word[::-1][1::][::-1]
        final_words.append(word)
    final_text = ' '.join(final_words)
    final_sentences.append(final_text)

In [173]:
test_sentence = final_sentences[4]
test_sentence

'I have been on this birth control for one cycle After reading some of the reviews on this type and similar birth controls I was a bit apprehensive to start Im giving this birth control a 9 out of 10 as I have not been on it long enough for a 10 So far I love this birth control! My side effects have been so minimal its like Im not even on birth control! I have experienced mild headaches here and there and some nausea but other than that ive been feeling great! I got my period on cue on the third day of the inactive pills and I had no idea it was coming because I had zero pms! My period was very light and I barely had any cramping! I had unprotected sex the first month and obviously get pregnant so very pleased! Highly recommend'

In [182]:
test_sent_sequence = tokenizer.texts_to_sequences([test_sentence])[0]

In [320]:
test_sent_padded = pad_sequences([test_sent_sequence],padding = 'post',truncating='post',maxlen = 1199)[0]

In [88]:
tokenizer = Tokenizer(num_words = 1000,oov_token = "<OOV>")
tokenizer.fit_on_texts(final_sentences) 
word_index = tokenizer.word_index

In [89]:
# creating sequences from the sentence tokens
sequences = tokenizer.texts_to_sequences(final_sentences)

In [90]:
print(sequences[0])

[127, 8, 121, 1, 95, 4, 37, 1, 1, 1, 16, 696, 12, 361, 183, 20, 6, 134, 386, 160, 91, 6, 77, 706, 3, 320, 17, 1, 1, 1, 3, 14, 104, 36, 606, 6, 87, 1, 2, 11, 18, 27, 33, 38, 626, 4, 213, 1, 74, 184, 303, 179, 1, 12, 74, 2, 92, 11, 730, 449, 16, 1, 36, 606, 17]


In [94]:
# oov token usage 
TEST = ['industriliztionsdf fmy my']
tokenizer.texts_to_sequences(TEST)

[[1, 1, 6]]

In [103]:
padded = pad_sequences(sequences,padding = 'post',truncating='post')
# each sequences shape will be same after padding 
padded[0].shape

(1199,)

In [142]:
padded[0]

array([127,   8, 121, ...,   0,   0,   0])

In [107]:
padded.shape # 1199 word tokens for each sentences, 
# we have to create embeddings for each 1199 word tokens later

(53766, 1199)

In [108]:
sentences.shape # 53766 data instances 

(53766,)

All the sentences are converted into tokens. But how do we get meaning from these word tokens: Embedding 

In [138]:
labels = np.array(texts['rating'])

In [136]:
padded.shape # train x 

(53766, 1199)

In [139]:
labels.shape # train target labels 

(53766,)

In [165]:
onehot_labels = to_categorical(labels)
onehot_labels.shape

(53766, 11)

In [167]:
onehot_labels[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.], dtype=float32)

In [168]:
no_unique_tokens= len(word_index)
embedding_dimension = 128
input_length = padded.shape[1]

In [335]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(no_unique_tokens,embedding_dimension,input_length=input_length),
    tf.keras.layers.LSTM(64,activation='relu'),
    tf.keras.layers.Dense(11,activation = 'softmax')
])
optimizer = optimizer = keras.optimizers.Adam(learning_rate=0.01)
model.compile(
    optimizer = optimizer,
    loss ='categorical_crossentropy'
)



In [336]:
model.fit(x = padded[:10000],y = onehot_labels[:10000],batch_size = 128,epochs = 5,verbose=1)

Epoch 1/5
 5/79 [>.............................] - ETA: 9:07 - loss: nan 

KeyboardInterrupt: 

In [331]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 1199, 128)         4293888   
                                                                 
 lstm_9 (LSTM)               (None, 64)                49408     
                                                                 
 dense_17 (Dense)            (None, 11)                715       
                                                                 
Total params: 4,344,011
Trainable params: 4,344,011
Non-trainable params: 0
_________________________________________________________________


In [332]:
test_case1 = np.array([padded[9],])
model.predict(test_case1)



array([[0.00036795, 0.17296182, 0.04106217, 0.04856793, 0.03616836,
        0.04449897, 0.03559168, 0.05467949, 0.11750646, 0.16739662,
        0.28119853]], dtype=float32)

In [333]:
test_case2 = np.array([padded[48],])
model.predict(test_case2)



array([[0.00036795, 0.17296182, 0.04106217, 0.04856793, 0.03616836,
        0.04449897, 0.03559168, 0.05467949, 0.11750646, 0.16739662,
        0.28119853]], dtype=float32)

In [334]:
test_case2 = np.array([padded[1],])
model.predict(test_case2)



array([[0.00036795, 0.17296182, 0.04106217, 0.04856793, 0.03616836,
        0.04449897, 0.03559168, 0.05467949, 0.11750646, 0.16739662,
        0.28119853]], dtype=float32)