## Natural Language Processing 

#### How to use Tensorflow to process text data. 


In [1]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer

#### Vocabulary: A corpus i.e a collection of documents.  
#### When we test the model and if the word is not present, then the model may specify as oov (out of vocabulary).
#### 
#### First it assign the index number to each word. 
#### Then it assigns the index number to each sentence provided. 
#### If it does not recognize then it assigns the oov tokenizer. 
#### Padding is used (pad_sequences either as post or pre) to build the array of same size.  Paddig adds 0 to each row to make it as same length as the longest sentence. 
#### If one sentence is very very long then you use truncate in the preprocessing to optimize the processor. Otherwise padding will add too many zeros for remaining sentences causing performance issue.  
#### Another option is to split the sentence into multiple sentences so that you dont need to add too many zeroes through padding. 
#### 
#### Cosine Similarity
#### On a n-dimensional space, I represent each word with a number. And somehow I use the machine to create the word embedding to distinguish two words. 
#### 
#### Embeddings 
#### 
#### Word Embeddings: 

In [2]:
import pandas as pd
import numpy as np

In [3]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
data = pd.read_csv('IMDB Dataset.csv')

In [5]:
data.shape

(50000, 2)

In [6]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
data['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [8]:
X_data = data['review']
y_data = data['sentiment']

In [9]:
X_data.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [10]:
X_data[-6:-1]

49994    This is your typical junk comedy.<br /><br />T...
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
Name: review, dtype: object

In [11]:
X_data = X_data.replace({'<.*?>': ""}, regex=True)  # Remove html tags

In [12]:
X_data = X_data.replace({'[^A-Za-z]' : ' ' }, regex =True)  # Remove non-alphabetical characters.

In [13]:
X_data = X_data.apply(lambda review: review.lower()) # Convertt to lowercase

In [14]:
y_data = y_data.map({'positive': 1,  'negative': 0})

In [15]:
from sklearn.model_selection import  train_test_split


In [16]:
X_train, X_test, y_train, y_test = train_test_split (X_data, y_data, test_size = 0.2, random_state = 12345)

In [17]:
np.random.seed(12345)
tf.random.set_seed(12345)

In [18]:
print(X_train.shape)
print(X_test.shape)

(40000,)
(10000,)


In [19]:
# To encode text to int
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [20]:
VOCAB=1000 # Limit the vocabulary to 1000 words
EMBED_DIM = 32 # n-dimension for embedding Layer
MAXLEN = 100  # Maximmum length of sentence

# Our brain cannot visualize more than 3 dimensions. 
# Embedding layer will convert word into a number in a 32 dimensional space.
## GloVe (google it...
## Count Vectorization - It will use One Hot Encoding to assign number to bag of words. 
## Bag of Words - Vector Representation Example : You are counting the words. It's not going to do any other processing.

In [21]:
token = Tokenizer(lower = True, 
                  num_words=VOCAB, 
                  oov_token='<OOV>')

In [22]:
# Convert train and test data to list
X_train_l = X_train.to_list()
X_test_l = X_test.to_list()

In [23]:
token.fit_on_texts(X_train_l)

In [24]:
X_train_s = token.texts_to_sequences(X_train_l)
X_test_s = token.texts_to_sequences(X_test_l)

In [25]:
X_train_s = pad_sequences(X_train_s, 
                          maxlen=MAXLEN,
                          padding = 'post',
                          truncating = 'post')
# Here we are basically saying to pad the sequence to 100 words but truncate at the same time if any sentence is longer than 100 words.

In [26]:
X_test_s = pad_sequences(X_test_s, 
                          maxlen=MAXLEN, # Do not forget to mention maxlen parameter for test dataset
                          padding = 'post',
                          truncating = 'post') 

In [27]:
len(X_train_s[22])

100

In [28]:
len(X_test_s[58])

100

### Embeddings

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D

In [30]:
emb_model = Sequential()

In [31]:
emb_model.add(Embedding(VOCAB,
                        EMBED_DIM,
                        input_length = MAXLEN))

In [32]:
emb_model.add(GlobalAveragePooling1D()) # Average Pooling Layer

In [33]:
emb_model.add(Dense(128, activation = 'relu')) # Dense Layer - intermediate free connector layer

In [34]:
emb_model.add(Dense(1, activation = 'sigmoid')) # Output layer

In [35]:
emb_model.compile(optimizer = 'adam', 
                  loss = 'binary_crossentropy', 
                  metrics = ['accuracy'])

In [36]:
emb_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           32000     
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 128)               4224      
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 36,353
Trainable params: 36,353
Non-trainable params: 0
_________________________________________________________________


####  100 is sentence length, 32 is dimension
####  Embedding Param# 32,000 = 1000 words multiplied by 32 dimensions i.e. it will learn 32,000 
####  Across each 
#### 33* 128 = 4224
###  129* 1 = 129

In [40]:
result = emb_model.fit(X_train_s,
                       y_train, 
                       validation_data=(X_test_s, y_test),
                       epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [41]:
new_reviews = ["I bought this for my husband who plays the piano.  He is having a wonderful time playing these old hymns.  The music  is at times hard to read because we think the book was published for singing from more than playing from.  Great purchase though!"]

In [42]:
new_reviews = token.texts_to_sequences(new_reviews)

In [50]:
new_reviews = pad_sequences(new_reviews, 
                           maxlen = MAXLEN,
                           padding = 'post',
                           truncating = 'post')

In [51]:
new_reviews # the result shows that is has predicted positive sentiment as numbers are positive. 

array([[ 10,   1,  11,  18,  60, 602,  36, 299,   2,   1,  25,   7, 261,
          4, 394,  58, 395, 134, 157,   1,   2, 208,   7,  32, 209, 251,
          6, 340,  87,  70, 103,   2, 269,  14,   1,  18,   1,  38,  53,
         74, 395,  38,  82,   1, 155,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [52]:
emb_model.predict(new_reviews)

array([[0.981613]], dtype=float32)

In [53]:
e = emb_model.layers[0]

In [56]:
e = emb_model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(1000, 32)


In [58]:
reverse_word_index = dict([(value, key) for (key, value) in token.word_index.items()])

In [59]:
import io
vectors = io.open('vectors_emb_25Mar2023.tsv', 'w', encoding='utf-8') # Vectors
metadata = io.open('metadata_emb_25Mar2023.tsv', 'w', encoding='utf-8') # Metadata
for word_num in range(1, VOCAB):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    metadata.write(word + "\n")
    vectors.write('\t'.join([str(x) for x in embeddings]) + "\n")
vectors.close()
metadata.close()

## Bidirectional LSTM

In [61]:
## They said Teddy bears are on sale
## They said Teddy Roosevelt was a great president
## In the above two sentences, first 3 words are same.  
## Only when we go towards the end, we understand the context of the sentence
##
## To address this problem, we use bidirectional LSTM that will resolve the issue 
## by starting from the beginning and at the same time from the end.


In [62]:
from tensorflow.keras.layers import LSTM, Bidirectional

In [63]:
bidi_model = Sequential()

In [64]:
bidi_model.add(Embedding(VOCAB,
                         EMBED_DIM,
                         input_length = MAXLEN))