In this notebook, we train and evaluate the model directly in the notebook and compare the results with those from Pipeline Local.

In [None]:
import collections
import pathlib
import re
import string

import tensorflow as tf
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import tensorflow_datasets as tfds

In [None]:
# Notebook-wide settings used when loading the IMDB CSV.
batch_size = 32  # rows per batch requested from the CSV reader
num_epochs = 5   # repetition count applied to the data
DATA_FILE = '/Users/ukannika/work/imdb/imdb.csv'  # CSV file to read

In [None]:
# Load Data
# Read the CSV in a single pass (num_epochs=1) and cache the result.
# make_csv_dataset shuffles by default and would otherwise produce a new
# order on every iteration, so `take` and `skip` below would not describe
# complementary parts of the data. Caching freezes the order of the first
# complete pass.
imdb_csv_ds = tf.data.experimental.make_csv_dataset(
    DATA_FILE,
    batch_size=batch_size,
    label_name='label',
    num_epochs=1,
    ignore_errors=True).cache()

# One full pass fills the cache (a partial pass would be discarded) and
# tells us how many batches the file contains.
total_batches = sum(1 for _ in imdb_csv_ds)

# Split the dataset into train and val.
# val_size is a number of *batches*. The previous expression,
# int(0. * 200), evaluated to 0 and left the validation set empty.
# NOTE(review): 200 is chosen to match validation_steps=200 in the fit
# call -- confirm the intended validation size.
val_size = 200
if val_size >= total_batches:
    raise ValueError(
        f"val_size={val_size} batches leaves no training data; "
        f"the CSV only yields {total_batches} batches")

val_data = imdb_csv_ds.take(val_size)
# Only the training part is repeated, so repeated examples can never end
# up on both sides of the split.
train_data = imdb_csv_ds.skip(val_size).repeat(num_epochs)

In [None]:
def _unbatch_examples(batched_ds):
    """Flatten a batched (features, labels) dataset into per-example lists.

    Args:
        batched_ds: iterable of ``(features, labels)`` pairs where
            ``features['text']`` and ``labels`` are eager tensors holding
            one entry per example of the batch.

    Returns:
        ``(sentences, labels)``: a list of decoded text strings and a list
        of the matching labels, one entry per example.
    """
    sentences, labels = [], []
    for features, batch_labels in batched_ds:
        # Each element is a whole batch. Calling str() on the batch array
        # would yield a single "[b'...' b'...']" string for the entire
        # batch, so decode every review individually instead.
        sentences.extend(
            raw.decode('utf-8', errors='replace')
            for raw in features['text'].numpy())
        labels.extend(batch_labels.numpy().tolist())
    return sentences, labels


# One text and one label per example, for both splits.
training_sentences, training_labels = _unbatch_examples(train_data)
val_sentences, val_labels = _unbatch_examples(val_data)

# 1-D label arrays aligned with the sentence lists.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(val_labels)

In [None]:
# Display the collected training entry at index 1 as a quick sanity check.
training_sentences[1]

In [None]:
# Text-encoding hyperparameters shared by the cells below.
oov_tok = "<OOV>"    # placeholder token for out-of-vocabulary words
trunc_type = 'post'  # truncation side setting
max_length = 400     # sequence length after padding
embedding_dim = 64   # embedding size setting
vocab_size = 8000    # vocabulary size limit

In [None]:
# Tokenize the data
# The vocabulary is fitted on the training text only; words outside it
# are mapped to the out-of-vocabulary token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode sequences
# Both corpora are converted to integer id sequences with the same tokenizer.
training_sequences, val_sequences = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (training_sentences, val_sentences)
)


In [None]:
# Padding
# Bring every sequence to exactly max_length ids: short ones are
# zero-padded at the end. Long ones are cut on the side given by
# trunc_type; without the explicit `truncating` argument pad_sequences
# falls back to its default ('pre') and ignores that setting.
training_padded = tf.keras.preprocessing.sequence.pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding='post',
    truncating=trunc_type)
val_padded = tf.keras.preprocessing.sequence.pad_sequences(
    val_sequences,
    maxlen=max_length,
    padding='post',
    truncating=trunc_type)

In [None]:
# Word embeddings feed two stacked bidirectional LSTMs, followed by a
# small dense head that emits a single raw score (logit) per input.
model = tf.keras.Sequential([
    # Sized from the shared hyperparameters rather than repeated literals,
    # so the embedding table cannot drift from the tokenizer's num_words.
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # return_sequences=True hands the full sequence to the next LSTM.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # No activation here: the loss below is configured with from_logits=True.
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train on the padded training sequences and evaluate on the padded
# validation sequences, with fixed step counts for both phases.
model.fit(
    x=training_padded,
    y=training_labels_final,
    validation_data=(val_padded, testing_labels_final),
    steps_per_epoch=500,
    validation_steps=200,
)