In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn as sk
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot

# Sample One hot encoding of word into vectors

In [None]:
sent=[  'the glass of milk',
     'the glass of juice',
     'the cup of tea',
    'I am a good boy',
     'I am a good developer',
     'understand the meaning of words',
     'your videos are good']


In [None]:
voc = 10000
word_vec = []
for words in sent:
    word_vec.append(one_hot(words,voc))
    

In [None]:
word_vec

[[7127, 4975, 1452, 1294],
 [7127, 4975, 1452, 8233],
 [7127, 2321, 1452, 2135],
 [1388, 4878, 9696, 7032, 4777],
 [1388, 4878, 9696, 7032, 9316],
 [3884, 7127, 8006, 1452, 6912],
 [7768, 8666, 8793, 7032]]

# Embedding Layer - Padding the embedded word vec

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential
from tensorflow.keras.utils import pad_sequences

In [None]:
padded_sent = pad_sequences(word_vec,padding='post',maxlen=len(sent))
padded_sent

array([[7127, 4975, 1452, 1294,    0,    0,    0],
       [7127, 4975, 1452, 8233,    0,    0,    0],
       [7127, 2321, 1452, 2135,    0,    0,    0],
       [1388, 4878, 9696, 7032, 4777,    0,    0],
       [1388, 4878, 9696, 7032, 9316,    0,    0],
       [3884, 7127, 8006, 1452, 6912,    0,    0],
       [7768, 8666, 8793, 7032,    0,    0,    0]], dtype=int32)

# Embedding Layer - Real Embedding based on cosine similarity/context similarity

In [None]:
model = Sequential()    # creating the NN structure sequential
dim = 20            # dimension for the embedding layer
model.add(Embedding(input_dim=voc,output_dim=dim,input_length = len(sent)))     # embedding layer creation !!!
model.compile(optimizer='sgd',loss='mse')




In [None]:
model.predict(padded_sent)  # this is the embedded word vector from the Embedding Layer

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 261ms/step


array([[[-4.87494841e-02,  1.78309418e-02, -3.05114388e-02,
          8.88448954e-03, -2.59356741e-02, -3.87987010e-02,
         -3.49156968e-02, -2.36329436e-02,  3.36800255e-02,
         -2.28322744e-02,  4.77285124e-02,  3.04271691e-02,
         -1.32724755e-02,  1.59482099e-02, -1.02779269e-02,
         -3.32386494e-02, -3.35150249e-02,  4.33909185e-02,
          7.75825977e-03,  3.42686512e-02],
        [-1.03077777e-02,  3.11327912e-02,  2.21135877e-02,
         -2.18718648e-02, -3.73913758e-02, -2.40171440e-02,
         -1.34511814e-02,  1.02429166e-02, -6.67518377e-03,
          8.06728750e-03, -4.31843512e-02, -1.03771575e-02,
         -3.00099254e-02, -7.82120973e-04,  2.58976482e-02,
         -3.22128423e-02,  3.79674509e-03,  3.62005718e-02,
         -1.08899251e-02,  1.11493245e-02],
        [ 2.01610439e-02,  4.39710133e-02,  1.17119439e-02,
         -1.31171457e-02,  2.50121020e-02, -3.09908632e-02,
          1.83920972e-02, -1.72906406e-02, -3.17806490e-02,
         -3.

# IMDB dataset

In [None]:
data = pd.read_csv(r'E:\IMPORTANT STUFF\PYTHON\Notes\KRISHNAIK_GENAI_UDEMY\CODES\SIMPLE RNN\MYCODE\IMDB Dataset.csv')

In [None]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Splitting into Training and Testing datasets

In [None]:
from sklearn.model_selection import train_test_split
X = data['review']
Y = data['sentiment']
X_train,X_temp,Y_train,Y_temp = train_test_split(X,Y,test_size=0.3,random_state=69)
X_val,X_test,Y_val,Y_test = train_test_split(X_temp,Y_temp,test_size=0.5,random_state=69)


In [11]:
# Getting all the reviews as sentences in a list as elements
sent_list = []
for i in range (len(data)):
    sent = data.review[i]
    sent_list.append(sent)


In [12]:
# Tokenizing each words of each of the sentences
from tensorflow.keras.preprocessing.text import Tokenizer
tokenize = Tokenizer(num_words = 1000)
tokenize.fit_on_texts(sent_list)
sequences_text = tokenize.texts_to_sequences(sent_list)

In [13]:
# Padding and equalising the shape of the input vectors
padded_sequence = pad_sequences(sequences_text,maxlen=200,padding='post')
padded_sequence

array([[  1,  86, 148, ...,  16, 125, 486],
       [  3, 393, 120, ...,   0,   0,   0],
       [ 10, 190,  11, ...,   0,   0,   0],
       ...,
       [ 10, 235,   3, ...,   0,   0,   0],
       [145, 166,   5, ...,   0,   0,   0],
       [ 54,  27,   1, ...,   0,   0,   0]],
      shape=(50000, 200), dtype=int32)

# Prepping the Embedding Layer

In [14]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential

In [15]:
model = Sequential()
model.add(Embedding(input_dim=10000,output_dim=20,input_length = len(padded_sequence)))
model.compile(optimizer='adam',loss='mse')



In [16]:
model.predict(padded_sequence)

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step


array([[[-0.0032584 ,  0.03846446, -0.00708508, ...,  0.01037114,
         -0.03384073,  0.03141378],
        [-0.04603532,  0.01881084,  0.03154302, ...,  0.02603985,
          0.01091089,  0.01461883],
        [ 0.04872518,  0.01564261,  0.00205563, ..., -0.0276661 ,
         -0.0142632 ,  0.02531422],
        ...,
        [ 0.01390792,  0.03154914,  0.03378722, ..., -0.00618058,
         -0.02937014, -0.00728397],
        [-0.03003403,  0.01274594, -0.00845401, ...,  0.0396121 ,
         -0.02985126,  0.02492431],
        [ 0.04172175,  0.02564397,  0.03675484, ...,  0.00823667,
         -0.01623418, -0.01033724]],

       [[ 0.03369761, -0.01808919, -0.02805237, ...,  0.03218969,
         -0.02850361, -0.01555448],
        [-0.02327627,  0.00764804,  0.00597836, ..., -0.01332783,
         -0.02840406, -0.02163839],
        [-0.01114633,  0.02829557,  0.01086622, ...,  0.00315038,
          0.04906798, -0.02127459],
        ...,
        [ 0.01111611,  0.04841674,  0.0130274 , ..., -