In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Step 1: Dataset Import (Dependant and Independant variable)

In [2]:
# importing the Dataset
import pandas as pd
reviews = pd.read_csv('/content/drive/MyDrive/NLP DataSet/imdb_master.csv',usecols=["review", "label"],
                      encoding='ISO-8859-1').sample(n=500)
reviews.head()

Unnamed: 0,review,label
84620,"I've seen other Olsen Twins movies, and most o...",unsup
94998,OK. First things first. When I watch a film I ...,unsup
785,... but had to see just how bad it could get. ...,neg
41753,It must have been several years after it was r...,pos
38027,"A friend of mine recommended this movie, citin...",pos


In [3]:
reviews['label'].value_counts()

unsup    243
neg      142
pos      115
Name: label, dtype: int64

# Step 2: Cleaning the Dataset ( Tokenization, Stopwords, Stemmimg or Lemmitization)

In [4]:
#Data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()
lemma=WordNetLemmatizer()

In [6]:
# For Lemmatizer
corpus = []
for i in range(0, len(reviews)):
    review = re.sub('[^a-zA-Z]', ' ', str(reviews['review'].iloc[i]))
    review = review.lower()
    review = review.split()

    review = [lemma.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
#  just print to see Corpus
corpus

# Step 3: One Hot Encoding via Vocabulary size

In [9]:
import tensorflow as tf
print(tf.__version__)

2.15.0


In [10]:
##tensorflow >2.0
from tensorflow.keras.preprocessing.text import one_hot

One Hot Reprensentation

In [35]:
### Vocabulary size
voc_size=100

In [36]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
print(onehot_repr)

[[31, 22, 69, 69, 20, 52, 27, 38, 37, 12, 16, 68, 33, 52, 96, 95, 69, 36, 23, 93, 18, 69, 23, 37, 12, 84, 32, 66, 5, 84, 42, 76, 52, 28, 32, 25, 82, 24, 69, 12, 29], [20, 43, 28, 43, 48, 54, 31, 92, 44, 61, 92, 64, 39, 90, 85, 44, 44, 43, 96, 54, 91, 95, 48, 11, 33, 12, 78, 41, 12, 1, 65, 76, 44, 17, 41, 27, 49, 79, 35, 87, 84, 44, 21, 84, 69, 5, 5, 15, 95, 59, 85, 5, 44, 44, 13, 38, 72, 43, 68, 53, 15, 45, 28, 13, 89, 85, 2, 16, 54, 20, 34, 10, 9, 71, 52, 33, 68, 29, 26, 10, 11, 83, 35, 90, 54, 31, 25, 52, 23, 44, 44, 54, 82, 63, 10, 66, 59, 26, 7, 15, 84, 8, 44, 31, 85, 53, 59, 46, 3, 62, 57, 63, 15, 72, 89, 30, 17, 92, 68, 18, 44, 44, 75, 56, 54, 48, 44, 89, 85, 44, 44, 8, 2, 45, 8, 88, 58, 54, 49, 55, 35, 55, 52, 10, 44, 54, 26, 23, 22, 29, 62, 53, 28, 69, 90, 23, 64, 31, 4, 96, 44, 44, 20, 44, 8, 54, 52, 53, 22, 19, 27, 60, 50, 58, 87, 48, 69, 13, 89, 50, 38], [45, 69, 2, 19, 22, 53, 92, 9, 31, 44, 85, 87, 92, 37, 62, 71, 51, 67, 90, 64, 17, 11, 5, 60, 3, 12, 50, 65, 11, 35, 60, 1

##### Word Embedding Representation

In [37]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
import numpy as np

# Step 4: Pre or Post Padding

In [38]:
## post padding
sent_length=50
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[31 22 69 ...  0  0  0]
 [ 8  2 45 ... 89 50 38]
 [69 39 59 ... 67 52 88]
 ...
 [66 78 82 ... 74 69 64]
 [90 93 99 ... 95  4 91]
 [54 68 44 ... 40 92  6]]


# Step 5: Feature Representation
##### That is 40 features for each word of a sentences that is 500 sentences

In [43]:
##features representation
embedding_vector_features=40

# Step 6: Model Building LSTM

In [44]:
##Creating model
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(3,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 40)            4000      
                                                                 
 lstm_2 (LSTM)               (None, 100)               56400     
                                                                 
 dense_2 (Dense)             (None, 3)                 303       
                                                                 
Total params: 60703 (237.12 KB)
Trainable params: 60703 (237.12 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [45]:
## For Dependant Variable
y=pd.get_dummies(reviews['label'])
# y=y.iloc[:,2].values
print(y)

       neg  pos  unsup
84620    0    0      1
94998    0    0      1
785      1    0      0
41753    0    1      0
38027    0    1      0
...    ...  ...    ...
43329    0    1      0
7223     1    0      0
49666    0    1      0
47266    0    1      0
47603    0    1      0

[500 rows x 3 columns]


In [58]:
# Converting it to arrays
X_final=np.array(embedded_docs)
y_final=np.array(y)
X_final.shape,y_final.shape
# here 500 means 500 sentences which is imported and 50 is the independant features also the sent_length (which was set) and 3 catagories

((500, 50), (500, 3))

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

##### Model Training

In [49]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d63c1745bd0>

# Step 7: Prediction and Performation for LSTM

In [51]:
y_pred=model.predict(X_test)
# y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve
y_pred = (y_pred > 0.2)



In [52]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.32      0.98      0.48        53
           1       0.22      0.91      0.35        32
           2       0.48      1.00      0.65        80

   micro avg       0.35      0.98      0.52       165
   macro avg       0.34      0.96      0.50       165
weighted avg       0.38      0.98      0.54       165
 samples avg       0.36      0.98      0.52       165



## End of LSTM

# Step 6: Model Building Bidirectional LSTM

In [60]:
##Creating model
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(3,activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 40)            4000      
                                                                 
 bidirectional_2 (Bidirecti  (None, 200)               112800    
 onal)                                                           
                                                                 
 dense_4 (Dense)             (None, 3)                 603       
                                                                 
Total params: 117403 (458.61 KB)
Trainable params: 117403 (458.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [61]:
## Model Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7d63ac3216c0>

# Step 7: Prediction and Performation for Bidirectional LSTM

In [62]:
y_pred_bi=model.predict(X_test)
# y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve
y_pred_bi = (y_pred_bi > 0.2)



In [63]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_bi))

              precision    recall  f1-score   support

           0       0.31      0.49      0.38        53
           1       0.24      0.44      0.31        32
           2       0.50      0.82      0.62        80

   micro avg       0.39      0.64      0.48       165
   macro avg       0.35      0.58      0.44       165
weighted avg       0.39      0.64      0.48       165
 samples avg       0.41      0.64      0.48       165



# End of BiDirectional LSTM