In [0]:
from google.colab import files

# Bind the result to a name instead of leaving `files.upload()` as the cell's
# last expression: the call returns a {filename: bytes} dict, and displaying it
# would write the raw contents of the uploaded kaggle.json (the API key) into
# the notebook output.
uploaded = files.upload()
print('Uploaded:', list(uploaded))

In [0]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [0]:
# List the working directory (sanity check after the upload / setup cells).
!ls

In [0]:
# Download the dataset archive into the working directory via the Kaggle CLI.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [0]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

In [0]:
# Show the first lines of the extracted CSV to check its layout.
!head "IMDB Dataset.csv"

In [0]:
import tensorflow as tf
import pandas as pd
import tensorflow_datasets as tfds
import re
from pathlib import Path
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Bidirectional, LSTM
from tensorflow.keras import Sequential

In [0]:
# Load the reviews into a DataFrame and preview the first rows.
# NOTE(review): 'latin' (latin-1) decodes any byte sequence without error, so
# if the file is actually UTF-8 the non-ASCII characters load garbled —
# confirm the file's real encoding.
df = pd.read_csv('IMDB Dataset.csv', encoding='latin')
df.head()

In [0]:
# Replace the sentiment labels with their integer category codes.
# Categories are inferred from the column's unique values in sorted order, so
# the codes are 0..n-1 (presumably 0/1 for a two-label column — verify with
# the preview below).
df['sentiment'] = df['sentiment'].astype('category').cat.codes
df.head()

In [0]:
import unicodedata

def remove_br(w):
  """Clean one raw review string before tokenisation.

  Steps, in order:
    1. replace HTML line breaks (<br>, <br/>, <br />, any case) with a space;
    2. fold accented characters to plain ASCII letters and drop whatever
       cannot be represented in ASCII;
    3. put spaces around ? . ! , so they stand apart from the words;
    4. replace every remaining run of characters other than letters and
       ? . ! , with a single space (this also collapses repeated whitespace);
    5. strip leading/trailing whitespace and lower-case.

  Args:
    w: review text as a str.

  Returns:
    The cleaned, lower-cased ASCII string.
  """
  # The tags must go first: step 4 turns '<', '/' and '>' into spaces, after
  # which a tag can no longer be recognised and a stray "br" word is left
  # behind. A space (not "") is substituted so the words on either side of
  # the tag do not fuse together.
  w = re.sub(r"<\s*/?\s*br\s*/?\s*>", " ", w, flags=re.IGNORECASE)

  # e.g. u"Klüft skräms inför på fédéral électoral große"
  #   -> 'Kluft skrams infor pa federal electoral groe'
  # The result has to be assigned back, and this has to happen before step 4,
  # otherwise accented letters are replaced by spaces instead of being folded.
  w = unicodedata.normalize('NFKD', w).encode('ascii', 'ignore').decode('ascii')

  w = re.sub(r"([?.!,])", r" \1 ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,]+", " ", w)

  return w.strip().lower()
# Clean every review in place with remove_br (element-wise over the column).
df['review'] = df['review'].map(remove_br)

In [0]:
# Preview the frame after the review column has been cleaned.
df.head()

In [0]:
# One (review, sentiment) element per DataFrame row.
review_array = df['review'].values
sentiment_array = df['sentiment'].values
dataset = tf.data.Dataset.from_tensor_slices((review_array, sentiment_array))

In [0]:
# Print the first five (review, sentiment) pairs as a sanity check.
for feat, targ in dataset.take(5):
  print(feat, " ", targ)

In [0]:
# Pipeline settings.
BUFFER_SIZE = 50_000  # shuffle buffer size passed to Dataset.shuffle
BATCH_SIZE = 64       # examples per padded batch
TAKE_SIZE = 5_000     # only referenced by the commented-out test split

In [0]:
# Shuffle once; reshuffle_each_iteration=False keeps that order fixed across
# later iterations of the dataset.
all_labeled_dataset = dataset.shuffle(
    buffer_size=BUFFER_SIZE,
    reshuffle_each_iteration=False,
)

In [0]:
# Print the first five elements of the shuffled dataset.
for ex in all_labeled_dataset.take(5):
  print(ex)

In [0]:
# The text encoders were moved from `tfds.features.text` to
# `tfds.deprecated.text`, and newer TFDS releases no longer expose the old
# location. Use the new namespace when present, else fall back to the old one.
_tfds_text = getattr(tfds, 'deprecated', tfds.features).text

# Build a subword vocabulary (target size 2**13) from every cleaned review.
# This iterates the whole dataset once.
tokenizer_en = _tfds_text.SubwordTextEncoder.build_from_corpus(
    (text.numpy() for text, label in all_labeled_dataset), target_vocab_size=2**13)

In [0]:
# Size of the learned subword vocabulary (used as the Embedding input dim).
tokenizer_en.vocab_size

In [0]:
# Round-trip check: encoding a string and decoding the ids must give back the
# exact original string.
sample_string = 'Transformer is awesome.'

tokenized_string = tokenizer_en.encode(sample_string)
print ('Tokenized string is {}'.format(tokenized_string))

original_string = tokenizer_en.decode(tokenized_string)
print ('The original string: {}'.format(original_string))

# Fails loudly if the tokenizer is lossy for this input.
assert original_string == sample_string

In [0]:
def encode(text_tensor, label):
  """Turn one review tensor into subword ids; the label passes through.

  Runs eagerly (it calls `.numpy()`), so inside a tf.data pipeline it has to
  be wrapped in `tf.py_function`.

  Args:
    text_tensor: eager tensor holding a single review.
    label: the review's label, returned unchanged.

  Returns:
    (token ids produced by `tokenizer_en.encode`, label).
  """
  raw_text = text_tensor.numpy()
  return tokenizer_en.encode(raw_text), label

In [0]:
def encode_map_fn(text, label):
  """Graph-compatible wrapper around `encode` for `Dataset.map`.

  `tf.py_function` returns tensors without static shape information, so the
  shapes are set by hand: a 1-D id vector of unknown length and a scalar
  label.

  Args:
    text: review tensor.
    label: label tensor.

  Returns:
    (int64 token-id vector, int8 scalar label).
  """
  outputs = tf.py_function(
      encode,
      inp=[text, label],
      Tout=(tf.int64, tf.int8)
  )
  token_ids, label_out = outputs
  token_ids.set_shape([None])
  label_out.set_shape([])
  return token_ids, label_out

# Tokenise every (review, label) element of the shuffled dataset.
all_encoded_data = all_labeled_dataset.map(encode_map_fn)

In [0]:
# Shuffle the encoded examples, then batch them; padded_batch pads each
# sequence to the longest one in its batch.
# NOTE: no held-out split is created here (the take(TAKE_SIZE) test split was
# left disabled), so every example ends up in train_data.
train_data = (
    all_encoded_data
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

In [0]:
# Pull one batch, keep its element at index 10 for the inspection cells below,
# and print the batch shapes.
for ex in train_data.take(1):
  batch_tokens, batch_labels = ex[0], ex[1]
  text, label = batch_tokens[10], batch_labels[10]
  print(batch_tokens.shape, batch_labels.shape)

In [0]:
# NumPy copy of the sampled example's token ids (including batch padding).
yr = text.numpy()

In [0]:
# Length of the sampled (padded) id vector.
yr.shape

In [0]:
# `text` holds integer token ids, not a string, so running `re.findall` on
# `text.numpy()` raises TypeError. Take the ids from the array directly.
list_ = list(text.numpy())

In [0]:
# Convert every entry to a plain Python int.
list_ = list(map(int, list_))

In [0]:
# Decode the sampled ids back to text to eyeball what the model is fed.
tokenizer_en.decode(yr)

In [0]:
# Label of the sampled example.
label

In [0]:
embedding_dim = 64

# Embedding -> bidirectional LSTM -> two hidden dense layers -> one logit.
# The final Dense has no activation, so the model outputs a raw logit.
model = Sequential([
                  # mask_zero=True: batches are padded with id 0 (padded_batch
                  # default, and pad_to_size at prediction time). Masking lets
                  # the LSTM skip those positions instead of treating padding
                  # as real tokens.
                  Embedding(tokenizer_en.vocab_size, embedding_dim, mask_zero=True),
                  Bidirectional(LSTM(64)),
                  Dense(64, activation='relu'),
                  Dense(64, activation='relu'),
                  Dense(1)
])
model.summary()

In [0]:
# The model emits logits, so the loss is configured with from_logits=True.
# The metric has to match: the plain 'accuracy' string thresholds predictions
# at 0.5, which is wrong for logits (the decision boundary is 0). An explicit
# BinaryAccuracy with threshold=0.0 fixes that; name='accuracy' keeps the same
# key in `history.history`.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')],
)
history = model.fit(
    train_data,
    epochs=10
)

In [0]:
# Persist the trained model under saved_model/my_model.
model.save('saved_model/my_model')

In [0]:
# Confirm the export directory was created.
!ls saved_model/

In [0]:
def pad_to_size(vec, size):
  """Right-pad `vec` with zeros, in place, up to `size` entries.

  The list passed in is itself extended and returned (no copy is made). When
  `vec` already has `size` or more entries it is returned unchanged.

  Args:
    vec: list to pad; mutated by this call.
    size: minimum length wanted.

  Returns:
    The same `vec` object.
  """
  shortfall = size - len(vec)
  # range() of a non-positive number is empty, so nothing is appended then.
  vec.extend(0 for _ in range(shortfall))
  return vec

In [0]:
def sample_predict(sample_pred_text, pad, pad_size=64):
  """Run the trained model on a single piece of text.

  Args:
    sample_pred_text: text to classify (str). It is tokenised as given; no
      cleaning is applied here.
    pad: when truthy, the id sequence is right-padded with zeros to at least
      `pad_size` entries.
    pad_size: target length used when `pad` is truthy (default 64).

  Returns:
    The output of `model.predict` for a batch of one. The model's last layer
    has no activation, so this is a raw logit, not a probability.
  """
  token_ids = tokenizer_en.encode(sample_pred_text)

  if pad:
    # Use the returned value explicitly rather than relying on pad_to_size
    # mutating its argument.
    token_ids = pad_to_size(token_ids, pad_size)

  # Token ids are integer indices into the embedding table; keep them integral.
  batch = tf.expand_dims(tf.cast(token_ids, tf.int64), 0)
  return model.predict(batch)

In [0]:
# Score a clearly positive review, with zero-padding enabled.
sample_pred_text = ("I just loved the movie. I really liked it. It was a brilliant movie.")
prediction = sample_predict(sample_pred_text, pad=True)

In [0]:
# Model output for the sample above (a logit: the final layer has no activation).
prediction

In [0]:
# Score a clearly negative review, this time without padding.
sample_pred_text = ("I hated this movie. There was nothing in this movie of interest. This movie is the worst. I hope this movie never existed.")
prediction = sample_predict(sample_pred_text, pad=False)

In [0]:
# Model output for the sample above (a logit: the final layer has no activation).
prediction

In [0]:
# Long listing of the exported model's files.
!ls saved_model/my_model/ -l 