In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#!pip install tensorflow-gpu==2.4.0

In [None]:
import zipfile

# Unpack both competition archives into the current working directory; a later
# cell reads 'train.tsv' from there.
# Each archive is opened in a `with` block so its file handle is closed as soon
# as extraction finishes instead of staying open for the rest of the session.
for archive_path in (
    '/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
    '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
):
    with zipfile.ZipFile(archive_path) as z:
        z.extractall()

In [None]:
# Load the tab-separated training file and preview its first five rows.
df = pd.read_csv('train.tsv', delimiter='\t')
df.head(5)

# Preprocessing

In [None]:
# Bar chart of how many rows carry each value of the 'Sentiment' column.
sentiment_counts = df['Sentiment'].value_counts()
sentiment_counts.plot(kind='bar')

In [None]:
# Token length that every phrase is padded/truncated to by the tokenizer call,
# and the number of rows in the training frame.
seq_len = 512
num_samples = df.shape[0]

(num_samples, seq_len)

In [None]:
from transformers import BertTokenizer

In [None]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint
# (the same checkpoint name is used when the model itself is loaded).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Tokenize every phrase in a single call. Special tokens are added, each sequence
# is truncated to at most `seq_len` tokens and padded up to exactly `seq_len`,
# and the result is returned as NumPy arrays.
tokens = tokenizer(
    df['Phrase'].tolist(),
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=seq_len,
    return_tensors='np',
)

In [None]:
# List the fields the tokenizer returned.
tokens.keys()

In [None]:
# Inspect the token-id array (one row per phrase).
tokens['input_ids']

In [None]:
# Inspect the attention-mask array that accompanies the token ids.
tokens['attention_mask']

In [None]:
import numpy as np

# Persist the token ids and the attention mask as .npy files in the working directory.
for npy_path, field in (
    ('movie_xids.npy', 'input_ids'),
    ('movie_xmask.npy', 'attention_mask'),
):
    with open(npy_path, 'wb') as f:
        np.save(f, tokens[field])

In [None]:
# Values of the 'Sentiment' column as a NumPy array; used below as column
# indices when building the one-hot label matrix.
arr = df['Sentiment'].values

In [None]:
# Sanity check: shape of the sentiment array.
arr.shape

In [None]:
# Inspect the raw sentiment values.
arr

In [None]:
# Largest label value plus one; this is used as the width of the one-hot matrix.
arr.max()+1

In [None]:
# Allocate an all-zero matrix with one row per sample and one column per label
# value (column count = largest label value + 1), then show its shape.
labels = np.zeros(shape=(num_samples, arr.max() + 1))
labels.shape

In [None]:
# Inspect the label matrix before it is filled (all zeros at this point).
labels

In [None]:
# One-hot encode: in row i, set the column given by arr[i] to 1.
row_idx = np.arange(num_samples)
labels[row_idx, arr] = 1

In [None]:
# Inspect the label matrix after the one-hot assignment (a single 1 in every row).
labels

In [None]:
# Persist the one-hot label matrix next to the token arrays.
with open('movie_labels.npy', 'wb') as f:
    np.save(file=f, arr=labels)

# Building Dataset

In [None]:
# Short aliases for the two tokenizer outputs that feed the model.
Xids, Xmask = tokens['input_ids'], tokens['attention_mask']

In [None]:
# Sanity check: shape of the token-id array.
Xids.shape

In [None]:
import tensorflow as tf

In [None]:
# Build a dataset whose elements are (input_ids, attention_mask, one-hot label)
# triples, one per sample.
sample_arrays = (Xids, Xmask, labels)
dataset = tf.data.Dataset.from_tensor_slices(sample_arrays)

In [None]:
# Peek at the dataset structure. take(1) evaluates to a dataset object, so the
# cell shows its description rather than the element's values.
dataset.take(1)

In [None]:
# Each dataset element is regrouped into an (inputs, outputs) pair, with the
# inputs keyed by the names of the model's input layers:
# ({'input_ids': ..., 'attention_mask': ...}, labels)

def map_func(input_ids, masks, labels):
    """Regroup one (input_ids, masks, labels) triple into an (inputs, labels) pair.

    Args:
        input_ids: token ids of one element.
        masks: attention mask belonging to ``input_ids``.
        labels: target for the element; passed through unchanged.

    Returns:
        A 2-tuple ``(inputs, labels)`` where ``inputs`` is a dict with the keys
        ``'input_ids'`` and ``'attention_mask'``.
    """
    inputs = {'input_ids': input_ids, 'attention_mask': masks}
    return inputs, labels

In [None]:
# Apply map_func to every element so the dataset yields (inputs dict, labels) pairs.
# NOTE: this rebinds `dataset`, so the cell is not meant to be run twice.
dataset = dataset.map(map_func)

In [None]:
# Peek at the dataset structure again, now after map_func has been applied.
dataset.take(1)

# Dataset batch, split and shuffle

In [None]:
# Number of samples per batch; also used to work out how many batches go to training.
batch_size = 16

In [None]:
# Shuffle individual samples (buffer of 10000), then group them into fixed-size
# batches; the final partial batch is dropped.
# reshuffle_each_iteration=False pins the shuffled order. The train/validation
# split further down is made with take()/skip() on this dataset, and with the
# default (a fresh shuffle on every pass) the batches landing in each subset
# would change every epoch, leaking validation samples into training.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(batch_size, drop_remainder=True)

In [None]:
# Peek at the dataset structure after shuffling and batching.
dataset.take(1)

In [None]:
# Fraction of the batches that goes to the training set; the rest is used for validation.
split = 0.9

In [None]:
# Number of batches assigned to training: (samples / batch size) * split, truncated.
size = int(len(Xids) / batch_size * split)

In [None]:
# The first `size` batches form the training set, everything after them the validation set.
# NOTE: take() and skip() each iterate `dataset` independently, so the two subsets
# are only disjoint if `dataset` yields its batches in the same order on every pass.
train_ds, val_ds = dataset.take(size), dataset.skip(size)

In [None]:
# Peek at the structure of the training subset.
train_ds.take(1)

In [None]:
# Write both subsets to disk, into the 'train' and 'val' directories respectively.
for subset, out_dir in ((train_ds, 'train'), (val_ds, 'val')):
    tf.data.experimental.save(subset, out_dir)

In [None]:
# Show the structure, shapes and dtypes of one training element.
train_ds.element_spec

# Build and Train

In [None]:
from transformers import TFAutoModel

In [None]:
# Load the pretrained TensorFlow model for 'bert-base-cased'
# (the same checkpoint name the tokenizer was loaded with).
bert = TFAutoModel.from_pretrained('bert-base-cased')

In [None]:
# Print the layer/parameter overview of the pretrained model.
bert.summary()

In [None]:
# two inputs
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')

# transformer
embeddings = bert.bert(input_ids, attention_mask=mask)[1]

# classifier head
x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(5, activation = 'softmax', name='outputs')(x)

In [None]:
# Assemble the functional model: token ids and attention mask in, class probabilities out.
model = tf.keras.Model(
    inputs=[input_ids, mask],
    outputs=y,
)

In [None]:
# Freeze the pretrained BERT layer so that only the new classifier head is trained.
# The layer is referenced directly instead of by its position in model.layers,
# which would silently point at a different layer if the architecture changed.
bert.bert.trainable = False          # use already trained layers

In [None]:
# Print the layer/parameter overview of the full classifier, including which
# parameters are trainable after freezing.
model.summary()

In [None]:
# Adam with a small learning rate for fine-tuning; these values may need tuning.
# `learning_rate` is the supported argument name (`lr` is a deprecated alias).
# NOTE(review): `decay` is only accepted by the older/legacy Keras optimizers --
# confirm it is supported by the installed TensorFlow version.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, decay=1e-6)
# Categorical (not sparse) loss and accuracy, because the targets are one-hot rows.
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [None]:
# Shell escape: show the GPU status report before starting training.
!nvidia-smi

In [None]:
# Train for three epochs with the work placed on the first GPU, evaluating on
# val_ds alongside training; the object returned by fit() is kept in `history`.
with tf.device('/device:GPU:0'):
    history = model.fit(train_ds, epochs=3, validation_data=val_ds)