# Requirements

In [185]:
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding,SimpleRNN,Dense,LSTM
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Sample: encoding the words of each sentence as integer indices with `one_hot`

In [186]:
# Toy corpus used to demonstrate integer encoding, padding and embeddings.
sent = [
    'the glass of milk',
    'the glass of juice',
    'the cup of tea',
    'I am a good boy',
    'I am a good developer',
    'understand the meaning of words',
    'your videos are good',
]


In [187]:
# Encode every sentence as a list of integer word indices using Keras'
# `one_hot`, with `voc` as the size of the index space.
voc = 10000
word_vec = [one_hot(sentence, voc) for sentence in sent]
    

In [188]:
# Display the integer-encoded sentences (one list of word indices per sentence).
word_vec

[[1733, 8418, 7196, 2161],
 [1733, 8418, 7196, 9120],
 [1733, 3829, 7196, 52],
 [148, 1504, 8122, 7926, 6283],
 [148, 1504, 8122, 7926, 5027],
 [4688, 1733, 411, 7196, 6554],
 [930, 2709, 3247, 7926]]

## Embedding Layer - Padding the integer-encoded sentences to a common length

In [189]:
# Pad every encoded sentence with trailing zeros so all rows share one length.
# The target width is taken from the sequences themselves: len(sent) is the
# NUMBER of sentences, which only happened to be >= the longest sentence here
# and would truncate words for a corpus with fewer sentences than words.
sent_length = max(len(seq) for seq in word_vec)
padded_sent = pad_sequences(word_vec,padding='post',maxlen=sent_length)
padded_sent

array([[1733, 8418, 7196, 2161,    0,    0,    0],
       [1733, 8418, 7196, 9120,    0,    0,    0],
       [1733, 3829, 7196,   52,    0,    0,    0],
       [ 148, 1504, 8122, 7926, 6283,    0,    0],
       [ 148, 1504, 8122, 7926, 5027,    0,    0],
       [4688, 1733,  411, 7196, 6554,    0,    0],
       [ 930, 2709, 3247, 7926,    0,    0,    0]], dtype=int32)

## Embedding Layer - Dense word embeddings (randomly initialised here; similarity between related words only emerges after training)

In [190]:
# Minimal demonstration model: a single Embedding layer that maps each word
# index to a dense vector of size `dim`.
model = Sequential()
dim = 20  # size of each embedding vector
# `input_length` is deprecated in Keras 3 (it is ignored and triggers a
# warning); the layer accepts any sequence length, so it is omitted.
model.add(Embedding(input_dim=voc,output_dim=dim))
# The model is compiled but never fitted in this section, so the embedding
# weights keep their initial values.
model.compile(optimizer='sgd',loss='mse')




In [191]:
# Run the padded index matrix through the Embedding layer: every token position
# (padding zeros included) is replaced by its `dim`-sized vector.
model.predict(padded_sent)  # embedding vectors produced by the Embedding layer

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 79ms/step


array([[[-2.83974763e-02,  3.79369520e-02, -1.96084511e-02,
          2.34667920e-02,  4.34835665e-02, -1.71406381e-02,
          2.70891897e-02, -1.38779394e-02,  1.12969652e-02,
          1.25057958e-02, -4.60207835e-02,  1.52049772e-02,
         -9.59308073e-03, -3.10332775e-02,  4.76069488e-02,
          4.50065024e-02,  2.84823813e-02, -1.92847010e-02,
          3.13978232e-02, -1.78305507e-02],
        [-4.41440232e-02,  4.90874387e-02, -3.40690501e-02,
         -3.13991793e-02,  7.91388750e-03,  5.69830090e-03,
          4.47376817e-03,  7.79509544e-04, -1.71301961e-02,
         -2.34176964e-03, -4.99409437e-03,  3.82731222e-02,
         -3.56982835e-02, -2.42666155e-03,  4.43895794e-02,
         -3.46268415e-02, -2.19464302e-03,  1.88204683e-02,
         -2.85943151e-02,  2.17725299e-02],
        [ 3.24842073e-02, -3.81610282e-02,  3.18554379e-02,
         -8.45704228e-03,  6.87099993e-04,  1.19723678e-02,
         -1.97519306e-02,  1.10385194e-02, -4.48429585e-02,
          4.

# IMDB dataset

In [192]:
# Location of the IMDB reviews CSV. Set the IMDB_DATASET_PATH environment
# variable to run the notebook on another machine; the fallback is the
# original, machine-specific path so existing setups keep working.
DATA_PATH = os.environ.get(
    'IMDB_DATASET_PATH',
    r'E:\IMPORTANT STUFF\PYTHON\Notes\KRISHNAIK_GENAI_UDEMY\CODES\SIMPLE RNN\MYCODE\IMDB Dataset.csv',
)
data = pd.read_csv(DATA_PATH)

In [193]:
# Display the loaded reviews DataFrame (columns `review` and `sentiment`).
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Splitting into Training and Testing datasets

In [194]:

# Encode the sentiment labels as integers and hold out 30% of the reviews
# for testing (fixed random_state for a repeatable split).
le = LabelEncoder()
X = data['review']
Y = le.fit_transform(data['sentiment'])
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=69)

In [195]:
# Plain-Python list copies of each split (the review lists are what gets
# passed to the tokenizer later on).
X_train_list, X_test_list = X_train.tolist(), X_test.tolist()
Y_train_list, Y_test_list = Y_train.tolist(), Y_test.tolist()
# Labels as NumPy arrays, the form handed to model.fit.
Y_train_array, Y_test_array = np.array(Y_train_list), np.array(Y_test_list)

In [None]:

# NOTE: this cell has never been executed (no execution count) and repeats the
# split / tokenize / build / train steps that also exist as separate cells
# further down. Running both trains the model twice.

# --- Split: encode labels, hold out 30% of the reviews for testing ---
X = data['review']
Y = data['sentiment']
le = LabelEncoder()
Y = le.fit_transform(Y)
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=69)
X_train_list = X_train.tolist()
X_test_list = X_test.tolist()
Y_train_list = Y_train.tolist()
Y_test_list = Y_test.tolist()
Y_train_array = np.array(Y_train_list)
Y_test_array = np.array(Y_test_list)

# --- Tokenize and pad: every review becomes a fixed-length index sequence ---
maxLen = 300
# Fit the tokenizer on the training reviews only.
tokenize = Tokenizer(num_words=10000)
tokenize.fit_on_texts(X_train_list)
tokenized_X_train = tokenize.texts_to_sequences(X_train_list)
padded_X_train = pad_sequences(tokenized_X_train,maxlen = maxLen,padding = 'post')

# The test reviews reuse the tokenizer fitted on the training reviews.
tokenized_X_test = tokenize.texts_to_sequences(X_test_list)
padded_X_test = pad_sequences(tokenized_X_test,maxlen = maxLen,padding = 'post')

# --- Model: Embedding -> LSTM -> single sigmoid unit ---
model = Sequential()
# `input_length` is deprecated in Keras 3 (ignored with a warning); the input
# shape is fixed by model.build() below instead.
model.add(Embedding(input_dim=10000,output_dim=150))
model.add(LSTM(64, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(1,activation='sigmoid',kernel_regularizer=regularizers.l2(0.01)))
model.compile(optimizer='adam',loss='binary_crossentropy')
# Build the model for inputs of shape (None, maxLen).
model.build(input_shape=(None, maxLen))

# --- Training: stop when val_loss has not improved for 4 epochs and restore
# the best weights seen ---
earlystopper = EarlyStopping(monitor='val_loss',patience=4,restore_best_weights=True)
model.fit(padded_X_train,Y_train_array,batch_size=30,epochs=30,callbacks=[earlystopper],validation_split=0.2)

def sentimentAnalyze(sentence):
    """Classify a single review as positive or negative.

    Uses the notebook-level `tokenize`, `model` and `maxLen` objects.

    Args:
        sentence: Raw review text (one string).

    Returns:
        A ``(label, score)`` tuple. ``score`` is the model's sigmoid output
        for the review; ``label`` is "Positive Comment!!!" when
        ``score > 0.5`` and "Negative Comment!!!" otherwise.
    """
    # texts_to_sequences expects a LIST of texts. A bare string is iterated
    # character by character, producing one sequence per character, so only
    # the first character would end up being scored.
    tokenized_sentence = tokenize.texts_to_sequences([sentence.lower()])
    # Pad to the same length that was used for the training data.
    padded_sentence = pad_sequences(tokenized_sentence,maxlen = maxLen,padding = 'post')
    result = model.predict(padded_sentence)
    score = result[0][0]
    if score > 0.5:
        return ("Positive Comment!!!",score)
    # Neutral label for the negative class (replaces an abusive message).
    return ('Negative Comment!!!',score)

print(sentimentAnalyze("The movie was good indeed"))

# Tokenization - Preprocessing steps

In [196]:
# Turn every review into a fixed-length sequence of word indices.
maxLen = 300  # length every sequence is padded / cut to

# Fit the tokenizer on the training reviews only (vocabulary capped via num_words).
tokenize = Tokenizer(num_words=10000)
tokenize.fit_on_texts(X_train_list)

# Training reviews -> index sequences -> post-padded matrix.
tokenized_X_train = tokenize.texts_to_sequences(X_train_list)
padded_X_train = pad_sequences(tokenized_X_train, padding='post', maxlen=maxLen)

# Test reviews go through the tokenizer that was fitted on the training reviews.
tokenized_X_test = tokenize.texts_to_sequences(X_test_list)
padded_X_test = pad_sequences(tokenized_X_test, padding='post', maxlen=maxLen)



In [197]:
# Check class balance: print the distinct training labels and how often each occurs.
print(np.unique(Y_train, return_counts=True))

(array([0, 1]), array([17527, 17473]))


# Developing the Model

In [198]:
# Build the sentiment classifier: Embedding -> LSTM -> single sigmoid unit.
model = Sequential()
# `input_length` is deprecated in Keras 3 (ignored with a warning), so it is
# omitted; the layer accepts the padded sequences without it.
model.add(Embedding(input_dim=10000,output_dim=150))
# LSTM with 64 units; dropout is applied to both inputs and recurrent state.
model.add(LSTM(64, dropout=0.3, recurrent_dropout=0.3))
# One sigmoid output with L2 weight regularisation.
model.add(Dense(1,activation='sigmoid',kernel_regularizer=regularizers.l2(0.01)))
model.compile(optimizer='adam',loss='binary_crossentropy')



In [199]:
# Build the model for inputs of shape (None, maxLen), then print its
# layer-by-layer summary.
model.build(input_shape=(None, maxLen))
model.summary()

In [200]:
# Creating a EarlyStopper for avoiding Overfitting
earlystopper = EarlyStopping(monitor='val_loss',patience=4,restore_best_weights=True)
model.fit(padded_X_train,Y_train_array,batch_size=30,epochs=30,callbacks=[earlystopper],validation_split=0.2)

Epoch 1/30
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 108ms/step - loss: 0.6915 - val_loss: 0.6604
Epoch 2/30
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 108ms/step - loss: 0.6615 - val_loss: 0.6525
Epoch 3/30
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 108ms/step - loss: 0.4964 - val_loss: 0.3398
Epoch 4/30
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 108ms/step - loss: 0.2795 - val_loss: 0.2931
Epoch 5/30
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 109ms/step - loss: 0.2110 - val_loss: 0.3020
Epoch 6/30
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 109ms/step - loss: 0.1683 - val_loss: 0.3051
Epoch 7/30
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 109ms/step - loss: 0.1351 - val_loss: 0.3384
Epoch 8/30
[1m934/934[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 108ms/step - loss: 0.1094 - val_loss: 0.3539


<keras.src.callbacks.history.History at 0x1f4845496d0>

In [201]:
# Save the trained model to 'modelLSTMRNN.h5' in the current working directory.
# NOTE(review): the `.h5` suffix selects the legacy HDF5 format; newer Keras
# versions recommend `.keras` — confirm nothing else loads this file name
# before changing it. The fitted tokenizer is not saved by this cell.
model.save('modelLSTMRNN.h5')



# Model Prediction

In [267]:
def sentimentAnalyze(sentence):
    """Classify a single review as positive or negative.

    Uses the notebook-level `tokenize`, `model` and `maxLen` objects.

    Args:
        sentence: Raw review text (one string).

    Returns:
        A ``(label, score)`` tuple. ``score`` is the model's sigmoid output
        for the review; ``label`` is "Positive Comment!!!" when
        ``score > 0.5`` and "Negative Comment!!!" otherwise.
    """
    # The text is wrapped in a list because texts_to_sequences works on a
    # collection of texts, not on a single string.
    tokenized_sentence = tokenize.texts_to_sequences([sentence.lower()])
    # Pad to the training sequence length instead of repeating the literal 300.
    padded_sentence = pad_sequences(tokenized_sentence,maxlen = maxLen,padding = 'post')
    result = model.predict(padded_sentence)
    score = result[0][0]
    if score > 0.5:
        return ("Positive Comment!!!",score)
    # Neutral label for the negative class (replaces an abusive message).
    return ('Negative Comment!!!',score)

print(sentimentAnalyze("Not my type!!"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
('Maa chuda, mat dekh film', np.float32(0.44543988))
