In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential
from tensorflow.keras.utils import pad_sequences

# Sample One hot encoding of word into vectors

In [13]:
sent=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good']


In [14]:
voc = 10000
word_vec = []
for words in sent:
    word_vec.append(one_hot(words,voc))
    

In [15]:
word_vec

[[7300, 4137, 656, 3422],
 [7300, 4137, 656, 6558],
 [7300, 8893, 656, 4531],
 [8613, 9128, 9112, 8581, 8342],
 [8613, 9128, 9112, 8581, 9651],
 [3125, 7300, 9829, 656, 6198],
 [7619, 7067, 4481, 8581]]

# Embedding Layer - Padding the embedded word vec

In [17]:
padded_sent = pad_sequences(word_vec,padding='post',maxlen=len(sent))
padded_sent

array([[7300, 4137,  656, 3422,    0,    0,    0],
       [7300, 4137,  656, 6558,    0,    0,    0],
       [7300, 8893,  656, 4531,    0,    0,    0],
       [8613, 9128, 9112, 8581, 8342,    0,    0],
       [8613, 9128, 9112, 8581, 9651,    0,    0],
       [3125, 7300, 9829,  656, 6198,    0,    0],
       [7619, 7067, 4481, 8581,    0,    0,    0]], dtype=int32)

# Embedding Layer - Real Embedding based on cosine similarity/context similarity

In [18]:
model = Sequential()    # creating the NN structure sequential
dim = 20            # dimension for the embedding layer
model.add(Embedding(input_dim=voc,output_dim=dim,input_length = len(sent)))     # embedding layer creation !!!
model.compile(optimizer='sgd',loss='mse')




In [19]:
model.predict(padded_sent)  # this is the embedded word vector from the Embedding Layer

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step


array([[[ 0.01621879,  0.03800709,  0.00812382, -0.03492592,
          0.02636328,  0.0312126 , -0.03468812,  0.0440152 ,
         -0.0493655 ,  0.03855444, -0.02968282,  0.00142076,
         -0.03080897,  0.04212071,  0.03681935,  0.02050047,
         -0.00982034,  0.0114616 ,  0.01845836,  0.0243338 ],
        [ 0.00045646,  0.0308601 ,  0.0484332 ,  0.03177959,
         -0.00522889,  0.02048087, -0.03589197,  0.01722583,
         -0.04031146,  0.01052474,  0.00770257, -0.04031245,
          0.00222021,  0.03621398,  0.02087491,  0.04380565,
          0.00393962, -0.02307582, -0.03376484,  0.01412583],
        [-0.02930867,  0.02116438,  0.02434031, -0.03647938,
         -0.01953638,  0.03307468,  0.01085495, -0.02884765,
          0.02918747, -0.00509958, -0.03302293, -0.04178126,
          0.04453478,  0.00348908,  0.01646459,  0.00861697,
         -0.03066071,  0.01324768, -0.01167672,  0.01209216],
        [ 0.03145543, -0.00562654,  0.03738964,  0.0167394 ,
          0.00882548,

# IMDB dataset

In [20]:
data = pd.read_csv(r'E:\IMPORTANT STUFF\PYTHON\Notes\KRISHNAIK_GENAI_UDEMY\CODES\SIMPLE RNN\MYCODE\IMDB Dataset.csv')

In [21]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Splitting into Training and Testing datasets

In [None]:
from sklearn.model_selection import train_test_split
X = data['review']
Y = data['sentiment']
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=69)

In [36]:
X_train_list = X_train.tolist()
X_test_list = X_test.tolist()
Y_train_list = Y_train.tolist()
Y_test_list = Y_test.tolist()

# Tokenization - Preprocessing steps

In [None]:
# tokenizing the datasets & Padding them
maxLen = 300
# X_train
tokenize = Tokenizer(num_words=10000)
tokenize.fit_on_texts(X_train_list)
tokenized_X_train = tokenize.texts_to_sequences(X_train_list)
padded_X_train = pad_sequences(tokenized_X_train,maxlen = maxLen,padding = 'post')

# Y_train
tokenize = Tokenizer(num_words=10000)
tokenize.fit_on_texts(Y_train_list)
tokenized_Y_train = tokenize.texts_to_sequences(Y_train_list)
padded_Y_train = pad_sequences(tokenized_Y_train,maxlen = maxLen,padding = 'post')

# X_test
tokenize = Tokenizer(num_words=10000)
tokenize.fit_on_texts(X_test_list)
tokenized_X_test = tokenize.texts_to_sequences(X_test_list)
padded_X_test = pad_sequences(tokenized_X_test,maxlen = maxLen,padding = 'post')

# Y_test
tokenize = Tokenizer(num_words=10000)
tokenize.fit_on_texts(Y_test_list)
tokenized_Y_test = tokenize.texts_to_sequences(Y_test_list)
padded_Y_test = pad_sequences(tokenized_Y_test,maxlen = maxLen,padding = 'post')

In [None]:
# Padding the datasets



# Developing the Model

In [None]:
def create_model()

In [11]:
# Getting all the reviews as sentences in a list as elements
sent_list = []
for i in range (len(data)):
    sent = data.review[i]
    sent_list.append(sent)


In [12]:
# Tokenizing each words of each of the sentences
from tensorflow.keras.preprocessing.text import Tokenizer
tokenize = Tokenizer(num_words = 1000)
tokenize.fit_on_texts(sent_list)
sequences_text = tokenize.texts_to_sequences(sent_list)

In [13]:
# Padding and equalising the shape of the input vectors
padded_sequence = pad_sequences(sequences_text,maxlen=200,padding='post')
padded_sequence

array([[  1,  86, 148, ...,  16, 125, 486],
       [  3, 393, 120, ...,   0,   0,   0],
       [ 10, 190,  11, ...,   0,   0,   0],
       ...,
       [ 10, 235,   3, ...,   0,   0,   0],
       [145, 166,   5, ...,   0,   0,   0],
       [ 54,  27,   1, ...,   0,   0,   0]],
      shape=(50000, 200), dtype=int32)

# Prepping the Embedding Layer

In [14]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential

In [15]:
model = Sequential()
model.add(Embedding(input_dim=10000,output_dim=20,input_length = len(padded_sequence)))
model.compile(optimizer='adam',loss='mse')



In [16]:
model.predict(padded_sequence)

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


array([[[-0.0032584 ,  0.03846446, -0.00708508, ...,  0.01037114,
         -0.03384073,  0.03141378],
        [-0.04603532,  0.01881084,  0.03154302, ...,  0.02603985,
          0.01091089,  0.01461883],
        [ 0.04872518,  0.01564261,  0.00205563, ..., -0.0276661 ,
         -0.0142632 ,  0.02531422],
        ...,
        [ 0.01390792,  0.03154914,  0.03378722, ..., -0.00618058,
         -0.02937014, -0.00728397],
        [-0.03003403,  0.01274594, -0.00845401, ...,  0.0396121 ,
         -0.02985126,  0.02492431],
        [ 0.04172175,  0.02564397,  0.03675484, ...,  0.00823667,
         -0.01623418, -0.01033724]],

       [[ 0.03369761, -0.01808919, -0.02805237, ...,  0.03218969,
         -0.02850361, -0.01555448],
        [-0.02327627,  0.00764804,  0.00597836, ..., -0.01332783,
         -0.02840406, -0.02163839],
        [-0.01114633,  0.02829557,  0.01086622, ...,  0.00315038,
          0.04906798, -0.02127459],
        ...,
        [ 0.01111611,  0.04841674,  0.0130274 , ..., -