In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import time
import io

In [2]:
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

2.5.0


In [3]:
### In this notebook we use the IMDB reviews dataset from tensorflow_datasets (tfds).

In [4]:
##loading imdb_reviews dataset

In [None]:
# Load the IMDB reviews dataset.
#   with_info=True     -> also return the dataset builder's info object
#   as_supervised=True -> every example is an (input, label) 2-tuple
data_set, info = tfds.load(
    "imdb_reviews",
    with_info=True,
    as_supervised=True,
)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [None]:
### The imdb_reviews dataset has two fields: the review text and a label marking the review as positive or negative.

In [None]:
# Show the Python type of the object returned by tfds.load.
print(type(data_set))

In [None]:
# Show the loaded dataset container; the next cell reads its 'train' and 'test' entries.
print(data_set)

In [None]:
# Pull the two splits used in the rest of the notebook out of the loaded dataset.
train_data = data_set['train']
test_data = data_set['test']

In [None]:
# Materialise both splits into plain Python lists of review texts and labels.
#
# Each element yielded by the dataset is a (review, label) pair of tensors.
# `s.numpy()` is a `bytes` object, so it is decoded as UTF-8 to get the real
# text. Wrapping it in `str()` would instead store the bytes *repr*
# ("b'...'" with escaped quotes and \x.. sequences), which pollutes the
# vocabulary the tokenizer learns.
training_reviews = []
training_labels = []

testing_reviews = []
testing_labels = []

# NOTE: the loop variables `s` and `l` are deliberately left in scope;
# the following cell prints their types.
for s, l in train_data:
  training_reviews.append(s.numpy().decode("utf-8"))
  training_labels.append(l.numpy())

for s, l in test_data:
  testing_reviews.append(s.numpy().decode("utf-8"))
  testing_labels.append(l.numpy())


In [None]:
# `s` and `l` are the loop variables left over from the last iteration of the
# extraction loops above; the two lists are the accumulated results.
print(type(s))
print(type(l))
print(type(training_reviews))
print(type(training_labels))

In [None]:
### `s` and `l` are tensors, which is why each one was converted with .numpy() above.
### Here the collected label lists are packed into NumPy arrays.
training_labels, testing_labels = (
    np.array(labels) for labels in (training_labels, testing_labels)
)

In [None]:
### Now create padded integer sequences for the training and testing reviews.

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and both models.
vocab = 10_000            # at most this many words are tokenised
max_len = 120             # maximum length of every sequence
truncate_type = "post"    # passed as `truncating=` to pad_sequences
embedding_dim = 16        # dimension of the embedding vectors
oov = "OOV"               # token passed as `oov_token=` to the Tokenizer

In [None]:
# Fit the tokenizer on the training reviews only, then encode those reviews
# as integer sequences and pad/truncate them to `max_len`.
tokenizer = Tokenizer(num_words=vocab, oov_token=oov)
tokenizer.fit_on_texts(training_reviews)
sequences = tokenizer.texts_to_sequences(training_reviews)
training_sequence_final = pad_sequences(
    sequences,
    maxlen=max_len,
    truncating=truncate_type,
)

In [None]:
# Encode the test reviews with the tokenizer that was fitted on the training
# reviews, using the same padding/truncation settings.
testing_sequence = tokenizer.texts_to_sequences(testing_reviews)
testing_sequence_final = pad_sequences(
    testing_sequence,
    maxlen=max_len,
    truncating=truncate_type,
)

In [None]:
# Sanity check: shapes of the padded test and training arrays
# (second dimension should be max_len, since both were padded with maxlen=max_len).
print(testing_sequence_final.shape)
print(training_sequence_final.shape)

In [None]:
# Peek at the first ten training labels.
print(training_labels[:10])

In [None]:
# Baseline classifier: Embedding -> Flatten -> small Dense head.
model1 = tf.keras.Sequential([
    # vocab is input_dim (the maximum integer index), embedding_dim is the
    # output dimension, and input_length is the size of each input sequence.
    tf.keras.layers.Embedding(vocab, embedding_dim, input_length=max_len),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

In [None]:
# Configure model1 for binary classification and print its layer summary.
model1.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=["accuracy"],
)
model1.summary()

In [None]:
#### Instead of the Flatten layer we can use GlobalAveragePooling1D: it averages the embedding vectors over the sequence axis,
#### giving one embedding_dim-sized vector per review, so the next Dense layer needs far fewer weights and is computationally cheaper.

In [None]:
# Same architecture as model1, but with GlobalAveragePooling1D in place of Flatten.
model2 = tf.keras.Sequential([
    # vocab is input_dim (the maximum integer index), embedding_dim is the
    # output dimension, and input_length is the size of each input sequence.
    tf.keras.layers.Embedding(vocab, embedding_dim, input_length=max_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model2.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=["accuracy"],
)
model2.summary()

In [None]:
### look at the difference in output shapes

In [None]:
# Train model1 for 5 epochs and print the elapsed wall-clock time in seconds.
start = time.time()
model1.fit(
    training_sequence_final,
    training_labels,
    validation_data=(testing_sequence_final, testing_labels),
    epochs=5,
)
end = time.time()
print(end - start)

In [None]:
# Train model2 with the same data, validation set and epoch count as model1.
model2.fit(
    training_sequence_final,
    training_labels,
    validation_data=(testing_sequence_final, testing_labels),
    epochs=5,
)

In [None]:
# Extract the learned embedding matrix from a trained model.
# The notebook defines only `model1` and `model2` (there is no `model`), so
# the layer is read from model1. Layer 0 of either model is the Embedding
# layer, so `model2.layers[0]` can be substituted to inspect the other model.
embedding_layer = model1.layers[0]
# get_weights() returns a list of arrays; its first entry is used as the embedding matrix.
weights = embedding_layer.get_weights()[0]
print(weights.shape)  # expected to be (vocab, embedding_dim)

In [None]:
###  This means each word index in the vocabulary is mapped to a 16-dimensional vector (embedding_dim); words used in similar contexts tend to get similar vectors.

In [None]:
# Invert the tokenizer's word -> index mapping so that an integer index can be
# turned back into its word.
word_index = tokenizer.word_index
reversed_word_index = {index: token for token, index in word_index.items()}

In [None]:
## Mere demonstration: format the first embedding row as one tab-separated line,
## the same layout used for each vector in the export cell below.
word = '\t'.join(map(str, weights[0]))
print(word)

In [None]:
#### Export the embeddings as two parallel TSV files:
####   vecs.tsv - one tab-separated embedding vector per line
####   meta.tsv - the word for the vector on the same line number
####
#### The loop starts at 1 because the tokenizer assigns no word to index 0;
#### index 1 is the "OOV" token and is exported like any other word.
#### The upper bound is capped at the number of known words so that a
#### vocabulary smaller than `vocab` cannot raise a KeyError.
last_index = min(vocab, len(reversed_word_index) + 1)
# `with` guarantees both files are closed even if a write fails part-way.
with io.open("meta.tsv" , "w" , encoding = 'utf-8') as file_m, \
     io.open("vecs.tsv" , "w" , encoding = "utf-8") as file_v:
  for i in range(1 , last_index):
    vector = weights[i]
    word = reversed_word_index[i]
    file_m.write(word + "\n")
    # Each vector must end with a newline; without it every vector lands on
    # one single line and no longer lines up with the rows of meta.tsv.
    file_v.write("\t".join([str(x) for x in vector]) + "\n")

In [None]:
###  Download the generated files when google.colab is importable;
###  if the import fails, this cell silently does nothing.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ("meta.tsv", "vecs.tsv"):
    files.download(tsv_name)

In [None]:
# Encode one hand-written sentence with the fitted tokenizer to see the integer
# indices it produces (texts_to_sequences expects a list of texts, hence the wrapping list).
sentence = "I really think this is amazing. honest."
sequence = tokenizer.texts_to_sequences([sentence])
print(sequence)
