#### Load Movie reviews Dataset

We will be using data available on Kaggle platform for this exercise. The data is available at https://www.kaggle.com/c/word2vec-nlp-tutorial/data.

In [None]:
#Connect Google drive to colab: mounts the drive at /gdrive so that
#files stored there can be read with ordinary file paths
from google.colab import drive
drive.mount('/gdrive')

Load dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
#Read the tab-separated labelled reviews directly from the zip archive.
#Change the file path to point to where you have stored the zip file.
df = pd.read_csv(
    '/gdrive/My Drive/AI-ML/labeledTrainData.tsv.zip',
    sep="\t",    #fields are separated by tabs
    header=0,    #first row holds the column names
    quoting=3,   #3 == csv.QUOTE_NONE: quote characters are kept as ordinary text
)

In [None]:
#Print the (rows, columns) shape of the DataFrame, then preview its first rows
print('Number of examples in Dataset: ', df.shape)
df.head()

In [None]:
#Full text of the review stored under row label 0
df.at[0, 'review']

Split Data into Training and Test Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#Hold out 20% of the reviews for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], random_state=42, test_size=0.2
)

In [None]:
#Sizes of the training and test review sets
X_train.shape, X_test.shape

#### Build the Tokenizer

In [None]:
import tensorflow as tf

In [None]:
#Create a Keras tokenizer limited to the desired vocabulary size
desired_vocab_size = 10000 #Vocabulary size
t = tf.keras.preprocessing.text.Tokenizer(num_words=desired_vocab_size) # num_words -> Vocabulary size

In [None]:
#Build the vocabulary from the training reviews only (the test reviews are not used here)
t.fit_on_texts(list(X_train))

In [None]:
#Vocabulary: mapping of word -> integer index built by the tokenizer.
#Display a 20-entry sample only; the full mapping has one entry per distinct
#word and would flood the cell output.
dict(list(t.word_index.items())[:20])

In [None]:
#Number of entries in the tokenizer's word index
len(t.word_index)

#### Prepare Training and Test Data

Get the word index for each word in the review

In [None]:
#Raw text of the first training review.
#X_train is still a pandas Series carrying the original (shuffled) row labels, so
#X_train[0] would look up *label* 0 and raise KeyError if that row went to the
#test split. Positional access with .iloc always works.
X_train.iloc[0]

In [None]:
#Replace each word in the text with word's index
#NOTE: this rebinds X_train from the raw-text Series to the encoded sequences, so
#re-running this cell requires re-running the train/test split first.
X_train = t.texts_to_sequences(X_train.tolist())

In [None]:
#Word-index sequence of the first training review
print(X_train[0])

In [None]:
#Encode the test reviews with the tokenizer that was fitted on the training reviews.
#A plain list is passed, exactly as is done for the training reviews.
X_test = t.texts_to_sequences(X_test.tolist())

How many words in each review?

In [None]:
#Length of the word-index sequence for the training review at position 2000
len(X_train[2000])

#### Pad Sequences - Important

In [None]:
#Define maximum number of words to consider in each review;
#it is used as the `maxlen` when the sequences are padded/truncated
max_review_length = 300

In [None]:
#Pad training and test reviews: both padding and truncation happen at the end
#('post') so every sequence ends up with exactly max_review_length entries
X_train, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=max_review_length,
        padding='post',
        truncating='post',
    )
    for sequences in (X_train, X_test)
)

In [None]:
#Shape of the padded training data
X_train.shape

In [None]:
#Shape of the padded test data
X_test.shape

In [None]:
#Padded word-index sequence of the training review at position 2000
X_train[2000]

#### Load Google Word2Vec model

We can use the gensim library to load pre-trained Word2Vec or GloVe models. The list of available models can be found at [this url](https://github.com/RaRe-Technologies/gensim-data).

In [None]:
import gensim.downloader as api

In [None]:
#Load Google word2vec model through the gensim downloader
#(identifier 'word2vec-google-news-300' from the gensim-data catalogue)
model = api.load('word2vec-google-news-300')

In [None]:
#Size of the model: shape of the array holding all pre-trained word vectors
model.vectors.shape

In [None]:
#Model's vocab: preview the first 20 words only, since the full list
#(about 3M words) is far too long to display.
#gensim >= 4.0 exposes the list as `index_to_key`; older releases call it `index2word`.
(getattr(model, 'index_to_key', None) or model.index2word)[:20]

In [None]:
#Embedding for the word 'with'
model['with']

#### Get Pre-trained Embeddings

In [None]:
#Length of each pre-trained word vector; reused as the embedding size of our models
embedding_size = model.vector_size
embedding_size

Google Word2Vec model has vocabulary size of 3M words. In this example, we have only 10000 words as vocabulary. This means we do not require entire Google Word2Vec model. Rather, we will only take the embeddings of the words that are in our dataset vocabulary.

In [None]:
#Zero-filled embedding matrix for our dataset vocabulary:
#  rows    -> desired_vocab_size + 1 (the extra row is for the padding word)
#  columns -> embedding_size (300 for this Word2Vec model)
embedding_matrix = np.zeros(shape=(desired_vocab_size + 1, embedding_size))

In [None]:
#Load word vectors for each word from Google Word2Vec model.
#Words are visited in ascending index order; rows of words that have no
#pre-trained vector are left as zeros.
for word, i in sorted(t.word_index.items(), key=lambda x: x[1]):
    #embedding_matrix has rows 0..desired_vocab_size, so stop as soon as the
    #index no longer fits (a larger bound would index past the last row).
    if i > desired_vocab_size:
        break
    try:
        embedding_matrix[i] = model[word] #Reading word's embedding from Google Word2Vec
    except KeyError:
        #Word is not in the Word2Vec vocabulary: keep its all-zero row.
        #Only KeyError is caught so that any other problem still surfaces.
        continue

We now have word embeddings for our vocabulary words from Google Word2Vec model. We can now use it in our Model training.

In [None]:
#Shape of the embedding matrix: (desired_vocab_size + 1, embedding_size)
embedding_matrix.shape

#### Build Model - Dense Layers

In [None]:
#Initialize model
#NOTE: this rebinds the name `model`, which until now referred to the gensim
#Word2Vec vectors; earlier cells that read word vectors from `model` cannot be
#re-run after this cell without loading Word2Vec again.
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

To handle pre-trained embeddings, we will use the Keras Embedding layer

In [None]:
#Embedding layer backed by the pre-trained Word2Vec vectors
model.add(
    tf.keras.layers.Embedding(
        input_dim=desired_vocab_size + 1,  #Vocabulary size
        output_dim=embedding_size,         #Embedding size
        weights=[embedding_matrix],        #Embeddings taken from pre-trained model
        trainable=False,                   #Embeddings are already available, so the layer is frozen and acts as a lookup layer
        input_length=max_review_length,    #Number of words in each review
    )
)

Embedding Layer gives us 3D output ->
[Batch_Size , Review Length , Embedding_Size]

In [None]:
#Output tensor of the model so far (after the Embedding layer)
model.output

In [None]:
#Flatten the data as we will use Dense layers:
#the 3D Embedding output becomes one vector per review
model.add(tf.keras.layers.Flatten())

In [None]:
#Output tensor of the model so far (after the Flatten layer)
model.output

Add Hidden layers

In [None]:
#Add Hidden layers (Dense layers)
#Layers are stacked in the order they are added: 100 -> 50 -> 25 units.
model.add(tf.keras.layers.Dense(100, activation='relu'))
#Normalize the activations of the first hidden layer
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(50, activation='relu'))
#Dropout with rate 0.5 after the second hidden layer
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(25, activation='relu'))
#Dropout with rate 0.5 after the third hidden layer
model.add(tf.keras.layers.Dropout(0.5))

Add Output layer

In [None]:
#Single sigmoid unit producing one score per review for the binary sentiment label
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
#Compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
#Print the layer-by-layer summary of the Dense model
model.summary()

##### Train Model

In [None]:
#Train for 5 epochs with mini-batches of 32 reviews;
#the held-out test split is passed as validation data
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test),
)

#### Building a CNN Model

Start a model

In [None]:
#Empty Sequential container for the CNN model (kept separate from the Dense model `model`)
model2 = tf.keras.Sequential()

Add Embedding layer to handle Word2Vec

In [None]:
#Embedding layer backed by the pre-trained Word2Vec vectors
model2.add(
    tf.keras.layers.Embedding(
        input_dim=desired_vocab_size + 1,  #Vocabulary size
        output_dim=embedding_size,         #Embedding size
        weights=[embedding_matrix],        #Embeddings taken from pre-trained model
        trainable=False,                   #Embeddings are already available, so the layer is frozen and acts as a lookup layer
        input_length=max_review_length,    #Number of words in each review
    )
)

In [None]:
#Output tensor of the CNN model so far (after the Embedding layer)
model2.output

Add Conv1D hidden layers : As our text data is 2D (number of words, Embedding size), we will use Conv1D in this case (compared to Conv2D with images which are 3D)

In [None]:
#First convolutional layer: 32 filters of size 3, stride 1, ReLU activation
model2.add(
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, strides=1, activation='relu')
)

#normalize data
model2.add(tf.keras.layers.BatchNormalization())

#Second convolutional layer: 64 filters of size 3, stride 2;
#its ReLU activation is added as a separate layer
model2.add(tf.keras.layers.Conv1D(filters=64, kernel_size=3, strides=2))
model2.add(tf.keras.layers.ReLU())

#normalize data
model2.add(tf.keras.layers.BatchNormalization())

In [None]:
#Use Global Average Pooling to collapse the sequence dimension before the output layer
model2.add(tf.keras.layers.GlobalAveragePooling1D())

#Output layer: single sigmoid unit producing one score per review
model2.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
#Compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
#Print the layer-by-layer summary of the CNN model
model2.summary()

In [None]:
#Train the CNN for 5 epochs with mini-batches of 32 reviews;
#the held-out test split is passed as validation data
model2.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test),
)