**Step 1: Load the IMDB Dataset of 50K Movie Reviews**

In [150]:
import pandas as pd

# Kaggle input CSV; later cells read its `review` and `sentiment` columns.
csv_path = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(csv_path)
df.head()

Drop any duplicate entries from the dataset, keeping only the first entry

In [151]:
# Reviews with identical text are collapsed to their first occurrence.
# Index labels are not reset; later cells address rows by position only.
df = df.drop_duplicates(subset='review', keep='first')
df.head()

Explore the Distribution of the classes in the dataset

In [152]:
# Bar chart of how many reviews carry each sentiment label.
class_counts = df['sentiment'].value_counts()
class_counts.plot.bar()

In [153]:
# Number of reviews left after de-duplication.
df.shape[0]

Explore the distribution of the length of reviews

In [154]:
# Whitespace-separated word count of every review (not the tokenizer's token count).
seqlen = df['review'].map(lambda review: len(review.split()))
seqlen

In [155]:
import seaborn as sns
import matplotlib.pyplot as plt

In [156]:
# Plot the distribution of per-review word counts on a large dark-grid figure.
sns.set_style('darkgrid')
plt.figure(figsize = (16, 10))
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases in favour of
# `histplot` / `displot` — confirm the installed seaborn version still provides it.
sns.distplot(seqlen)

In [157]:
# Use a fixed sequence length of 512 tokens to keep as much of each review as possible.
# The tokenizer calls below pad shorter reviews and truncate longer ones to this length.
SEQ_LEN = 512

Now let's get to tokenizing our inputs

In [158]:
from transformers import AutoTokenizer
# Tokenizer for the `bert-base-cased` checkpoint; the model in Step 3 is loaded
# from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [159]:
# Tokenizer settings for the demo: pad or truncate to SEQ_LEN, add special tokens,
# skip token-type ids, and return the ids plus attention mask as TensorFlow tensors.
encode_kwargs = dict(
    max_length=SEQ_LEN,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_token_type_ids=False,
    return_attention_mask=True,
    return_tensors='tf',
)
tokens = tokenizer.encode_plus("hello world", **encode_kwargs)
tokens

In [160]:
# NumPy is first needed here; importing it in this cell keeps the notebook
# runnable top-to-bottom on a fresh kernel (the next import is further down).
import numpy as np

# Pre-allocate one row per review and one column per token position.
# np.zeros defaults to float64; the tokenization loop fills these in place.
Xids = np.zeros((len(df), SEQ_LEN))
Xmask = np.zeros((len(df), SEQ_LEN))

Xids.shape, Xmask.shape

In [162]:
# Encode each review to a fixed-length row and store ids and mask by position.
for row, review in enumerate(df['review']):
    encoded = tokenizer.encode_plus(
        review,
        max_length=SEQ_LEN,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='tf',
    )
    Xids[row, :] = encoded['input_ids']
    Xmask[row, :] = encoded['attention_mask']

In [163]:
# Inspect the filled token-id matrix (one row per review).
Xids

In [164]:
# Inspect the filled attention-mask matrix (one row per review).
Xmask

In [165]:
# Integer class ids used for one-hot encoding: positive -> 0, negative -> 1.
class_map = dict(positive=0, negative=1)

# Store the numeric label in a separate, capitalised `Sentiment` column.
df['Sentiment'] = df['sentiment'].map(class_map)
arr = df['Sentiment'].to_numpy()
arr, arr.shape

In [166]:
# One column per class id: the largest id in `arr` plus one.
n_classes = arr.max() + 1
labels = np.zeros((len(df), n_classes))
labels.shape

In [167]:
# One-hot encode: for row i, set the column given by its class id arr[i] to 1.
labels[np.arange(len(df)), arr] = 1

In [168]:
# Inspect the one-hot label matrix.
labels

Save the input_ids, mask and labels for reusability

In [169]:
import numpy as np

# Write each array to its own .npy file so Step 2 can reload it without re-tokenizing.
for npy_path, array in (
    ('../working/movie-xids.npy', Xids),
    ('../working/movie-xmask.npy', Xmask),
    ('../working/movie-labels.npy', labels),
):
    with open(npy_path, 'wb') as f:
        np.save(f, array)

# Drop the loop variables so they do not keep an extra reference to the arrays.
del npy_path, array

In [170]:
# Remove the in-memory arrays; they are reloaded from the .npy files in Step 2.
del Xids, Xmask, labels

**Step 2: Create Data Pipeline for Model Input**

In [171]:
import numpy as np

# Reload the arrays written at the end of Step 1.
# They are plain numeric arrays, so pickle support is not needed; keeping
# allow_pickle off means a tampered .npy file cannot execute code on load.
with open('../working/movie-xids.npy', 'rb') as f:
    xids = np.load(f, allow_pickle = False)
with open('../working/movie-xmask.npy', 'rb') as f:
    xmask = np.load(f, allow_pickle = False)
with open('../working/movie-labels.npy', 'rb') as f:
    labels = np.load(f, allow_pickle = False)

In [173]:
# Check that the reloaded labels look like the one-hot matrix saved in Step 1.
labels

In [174]:
import tensorflow as tf
# List the GPU devices TensorFlow can see; an empty list means none is visible.
tf.config.experimental.list_physical_devices('GPU')

In [175]:
# Slice the three arrays row by row: each element is an (ids, mask, label) triple.
dataset = tf.data.Dataset.from_tensor_slices((xids, xmask, labels))
# Displays the dataset object (its element spec), not the data itself.
dataset.take(1)

In [176]:
def map_fn(input_ids, masks, labels):
    """Repack one (ids, mask, label) triple as (features, label).

    The feature dict keys `input_ids` and `attention_mask` match the names of
    the model's two input layers defined in Step 3.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

In [177]:
# Repack every element as ({'input_ids', 'attention_mask'}, label).
dataset = dataset.map(map_fn)
dataset.take(1)

In [178]:
batch_size = 16

# Shuffle once, then batch. The train/val/test subsets are carved out below with
# take()/skip(), which only stay disjoint if the element order is the same on
# every pass. With the default reshuffle_each_iteration=True the data would be
# reshuffled before take()/skip() on each epoch and each evaluate() call, so
# test and validation batches would leak into training.
# Trade-off: the batch order seen during training is now fixed across epochs.
dataset = (
    dataset
    .shuffle(100000, reshuffle_each_iteration = False)
    .batch(batch_size, drop_remainder = True)
)
dataset.take(1)

In [179]:
# Number of batches in the pipeline. Ask tf.data for the cardinality instead of
# materialising every batch into a Python list just to count them.
DS_LEN = int(tf.data.experimental.cardinality(dataset).numpy())
# Negative values are tf.data's markers for infinite / unknown cardinality.
assert DS_LEN >= 0, "dataset cardinality is unknown or infinite"
DS_LEN

In [180]:
# Fraction of batches kept on the "train" side at each of the two splits below.
split = 0.9

In [212]:
# Here we'll be dividing the train and test in 9:1
# The first `n_train_batches` batches go to training, the rest to testing.
n_train_batches = round(DS_LEN * split)
train = dataset.take(n_train_batches)
test_ds = dataset.skip(n_train_batches)

In [213]:
# We'll further be dividing the train set in train and validation set in 9:1
# Count the training batches via tf.data's cardinality rather than iterating
# the whole training subset into a list.
train_size = int(tf.data.experimental.cardinality(train).numpy())
assert train_size >= 0, "training subset cardinality is unknown or infinite"

n_fit_batches = round(train_size * split)
train_ds = train.take(n_fit_batches)
val_ds = train.skip(n_fit_batches)

In [214]:
# Display the element spec of the test subset.
test_ds.take(1)

In [215]:
# Display the element spec of the training subset.
train_ds.take(1)

In [216]:
# Display the element spec of the validation subset.
val_ds.take(1)

**Step 3: Build and Train our Model**

In [217]:
from transformers import TFAutoModel
# Pretrained TensorFlow BERT from the same checkpoint name as the tokenizer.
bert = TFAutoModel.from_pretrained('bert-base-cased')
bert.summary()

In [218]:
keras_layers = tf.keras.layers

# Model inputs: token ids and attention mask, SEQ_LEN int32 values each.
# The layer names match the feature-dict keys produced by the data pipeline.
input_ids = keras_layers.Input(shape=(SEQ_LEN,), dtype='int32', name='input_ids')
mask = keras_layers.Input(shape=(SEQ_LEN,), dtype='int32', name='attention_mask')

# Transformer body; element 0 of its output is fed to the LSTM below.
embeddings = bert.bert(input_ids, attention_mask=mask)[0]

# Classifier head: LSTM -> batch norm -> dense(relu) -> dropout -> 2-way softmax.
head = [
    keras_layers.LSTM(64),
    keras_layers.BatchNormalization(),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dropout(0.1),
]
X = embeddings
for layer in head:
    X = layer(X)
y = keras_layers.Dense(2, activation='softmax', name='outputs')(X)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)
model.summary()

In [219]:
# Freeze the third layer of the model so its weights are not updated during training.
# NOTE(review): index 2 is presumably the BERT main layer (after the two Input
# layers) — confirm against the summary, since a positional index is fragile.
model.layers[2].trainable = False
# Re-print the summary to check the trainable / non-trainable parameter counts.
model.summary()

In [224]:
# Draw the layer graph with tensor shapes annotated on each edge.
tf.keras.utils.plot_model(model, show_shapes=True)

In [220]:
# Categorical loss and accuracy, matching the one-hot labels and 2-way softmax output.
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
loss = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[acc],
)

In [221]:
# Train for three epochs, evaluating on the validation subset after each one.
history = model.fit(train_ds, validation_data=val_ds, epochs=3)

In [222]:
# Persist the trained model under the working directory for later reloading.
model.save('../working/sentiment_model_imdb')

In [223]:
# Per-epoch metric values recorded by `fit`.
history.history

In [226]:
# Score the model on the held-out test batches; `evaluate` returns the loss
# followed by the compiled metric (accuracy).
test_loss, test_accuracy = model.evaluate(test_ds)
test_loss, test_accuracy

Test Model on Examples

In [227]:
import tensorflow as tf
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# Load from the exact path the model was saved to. The previous bare
# 'sentiment_model_imdb' only resolved when the current working directory
# happened to be `../working`.
model = tf.keras.models.load_model('../working/sentiment_model_imdb')

# Inverse of the training-time `class_map`: class id -> sentiment name.
code_to_category = {
    0: "positive",
    1: "negative"
}
def pred_text(model, text, max_length = 512):
    """Classify one piece of text and report the winning class probability.

    Args:
        model: Object whose `predict` accepts a dict with `input_ids` and
            `attention_mask` and returns per-class probabilities.
        text: Raw review text to classify.
        max_length: Length the text is padded or truncated to by the tokenizer.

    Returns:
        A `(prediction, confidence)` tuple: the label looked up in
        `code_to_category` for the highest-probability class, and that
        class's probability.

    Relies on the module-level `tokenizer` and `code_to_category`.
    """
    encoded = tokenizer.encode_plus(
        text,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='tf',
    )

    # NOTE(review): inputs are cast to float64 although the model's Input layers
    # are declared int32 — presumably what the reloaded model expects; confirm.
    model_inputs = {
        name: tf.cast(encoded[name], tf.float64)
        for name in ('input_ids', 'attention_mask')
    }

    # A single text was encoded, so only the first row of the output is used.
    class_probs = model.predict(model_inputs)[0]
    best = np.argmax(class_probs)

    return code_to_category[best], class_probs[best]

In [229]:
# Probe: strongly positive wording.
text = "This movie was amazingly brilliant."
pred_text(model, text)

In [230]:
# Probe: positive intensifier ("amazingly") attached to a negative adjective.
text = "This movie was amazingly awful."
pred_text(model, text)

In [231]:
# Probe: indirect criticism with no explicitly negative word.
text = "Maybe they should try to get a better cast next time."
pred_text(model, text)

In [232]:
# Probe: very short, slangy negative input.
text = "Movie sucks!!"
pred_text(model, text)

In [235]:
# Probe: very short, slangy positive input.
text = "Movie rocks!!"
pred_text(model, text)

In [236]:
# Probe: mixed sentiment — praise limited to part of the movie.
text = "Only the first half of the movie was enjoyable"
pred_text(model, text)

In [240]:
# Probe: implied criticism that never mentions the movie itself.
text = "They could have spent the money better helping people"
pred_text(model, text)

In [242]:
import shutil

# Zip the saved-model directory; the archive is written next to it as
# `<base_name>.zip`.
shutil.make_archive(
    base_name='../working/sentiment_model_imdb',
    format='zip',
    root_dir='../working/sentiment_model_imdb',
)

In [244]:
from IPython.display import FileLink
# Render a clickable download link for the zipped model.
# NOTE(review): this relative path only points at the archive created in the
# previous cell when the working directory is `../working` — confirm.
FileLink(r'sentiment_model_imdb.zip')