In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

Collecting pandas
  Downloading pandas-1.1.0-cp36-cp36m-manylinux1_x86_64.whl (10.5 MB)
[K     |████████████████████████████████| 10.5 MB 5.7 MB/s eta 0:00:01
[?25hCollecting pytz>=2017.2
  Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)
[K     |████████████████████████████████| 510 kB 2.2 MB/s eta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-1.1.0 pytz-2020.1
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [8]:
import pickle
# Print the list of GPU devices that TensorFlow can see on this machine.
gpu_devices = tf.config.list_physical_devices('GPU')
print(gpu_devices)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [9]:
# Load the IMDB review dataset and encode the sentiment labels as integers
# (positive -> 1, negative -> 0).
df = pd.read_csv('IMDB Dataset.csv')
mapping_dict = {"positive": 1, "negative": 0}

# Series.map produces the numeric column directly. DataFrame.replace relied on
# implicit object -> int downcasting, which newer pandas releases deprecate.
encoded_sentiment = df['sentiment'].map(mapping_dict)

# Fail loudly on labels outside the mapping instead of carrying them into training.
if encoded_sentiment.isna().any():
    unknown = df.loc[encoded_sentiment.isna(), 'sentiment'].unique().tolist()
    raise ValueError("Unexpected sentiment labels: %r" % (unknown,))

df = df.assign(sentiment=encoded_sentiment.astype('int64'))
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [4]:
# Parse the pre-trained GloVe vectors into a {word: float32 vector} lookup.
embeddings_index = {}

# The GloVe file is UTF-8 text; say so explicitly so that parsing does not
# depend on the platform's default encoding.
with open('glove.6B.50d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..."; split the word off once and
        # parse the remainder as space-separated floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))


Found 400000 word vectors.


In [5]:
# Pull the review text and the encoded sentiment out as standalone NumPy arrays.
# copy=True guarantees writable arrays that do not alias df's storage, so the
# in-place shuffle applied to them afterwards cannot silently reorder df (or
# fail on a read-only view when pandas copy-on-write is active).
features = df['review'].to_numpy(copy=True)
labels = df['sentiment'].to_numpy(copy=True)

In [6]:
# Shuffle reviews and labels in place. Each array gets its own generator seeded
# with the same value, so both receive the same permutation.
for column in (features, labels):
    random_state = np.random.RandomState(345)
    random_state.shuffle(column)

In [7]:
# 70% train / 15% validation / remainder test, taken as contiguous slices of
# the already-shuffled arrays.
train_split = int(0.7*len(features))
test_val_split = int(0.15*len(features))
val_end = train_split + test_val_split

train_features, val_features, test_features = (
    features[:train_split],
    features[train_split:val_end],
    features[val_end:],
)
train_labels, val_labels, test_labels = (
    labels[:train_split],
    labels[train_split:val_end],
    labels[val_end:],
)

In [9]:
# Sanity check: train_split is the end index of the training slice, i.e. the
# number of training samples.
print(train_split)

35000


In [23]:
# Fit the tokenizer on the training reviews only, then encode the splits with it.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_features)

# Note: X_test holds the encoded *validation* reviews (val_features), not
# test_features.
X_train, X_test = (
    tokenizer.texts_to_sequences(split) for split in (train_features, val_features)
)

In [24]:
# One embedding row per word the tokenizer has indexed, plus one extra row
# (the padding below fills with 0, which is not a word index).
vocab_size = len(tokenizer.word_index) + 1

# Fixed sequence length fed to the model.
maxlen = 50

# Bring every encoded review to exactly maxlen entries, padding at the end.
pad_options = dict(padding='post', maxlen=maxlen)
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, **pad_options)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, **pad_options)

In [25]:
# Build the embedding weight matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i; words without a GloVe entry keep an all-zero row.
hits = 0
misses = 0
embedding_matrix = np.zeros((vocab_size, 50))

for token, row in tokenizer.word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

In [26]:
# Non-trainable embedding layer initialised from the pre-computed embedding_matrix.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=50,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

In [27]:
# Sentiment classifier: embedding_layer -> bidirectional LSTM -> two dense
# blocks with dropout -> single sigmoid output.
# The input tensor is named `inputs` so the Python builtin `input` is not
# shadowed for the rest of the kernel session.
inputs = tf.keras.Input(shape=(None,), dtype="int64")
x = embedding_layer(inputs)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128))(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
x = tf.keras.layers.Dropout(0.5)(x)
preds = tf.keras.layers.Dense(1,activation="sigmoid")(x)
model = tf.keras.Model(inputs, preds)
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_3 (Embedding)      (None, None, 50)          5283400   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 256)               183296    
_________________________________________________________________
dense_6 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0   

In [28]:
# Binary classification setup: Adam with default settings, binary cross-entropy
# loss, and binary accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.BinaryCrossentropy()
accuracy_metrics = [tf.keras.metrics.BinaryAccuracy()]
model.compile(optimizer=optimizer, loss=loss_fn, metrics=accuracy_metrics)

In [29]:
# Train on the training split. The training targets are train_labels; `y_train`
# is not defined anywhere in this notebook, so the original call raised
# NameError on a fresh kernel. The padded validation reviews in X_test are
# paired with val_labels so every epoch also reports held-out metrics.
history = model.fit(
    X_train,
    train_labels,
    validation_data=(X_test, val_labels),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Persist the trained model to the path 'sentiment_rnn'.
model.save('sentiment_rnn')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: sentiment_rnn/assets


In [31]:
# Serialize the fitted tokenizer to tokenizer.pickle with the highest
# available pickle protocol.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
