In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# importing the Dataset
import pandas as pd
reviews = pd.read_csv('/content/drive/MyDrive/NLP DataSet/imdb_master.csv',usecols=["review", "label"],
                      encoding='ISO-8859-1').sample(n=500)
reviews.head()

Unnamed: 0,review,label
85847,The reason I chose to rate this movie so low i...,unsup
1209,"Very odd, this seems like a very average movie...",neg
87767,"After surfing through this site, I came across...",unsup
44113,"If this movie proves only one thing, it's that...",pos
40156,Minor Spoilers<br /><br />Alison Parker (Crist...,pos


In [13]:
reviews['label'].value_counts

<bound method IndexOpsMixin.value_counts of 85847    unsup
1209       neg
87767    unsup
44113      pos
40156      pos
         ...  
24531      pos
17167      pos
85868    unsup
24184      pos
60663    unsup
Name: label, Length: 500, dtype: object>

In [3]:
#Data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()
lemma=WordNetLemmatizer()

In [5]:
# For Lemmatizer
corpus = []
for i in range(0, len(reviews)):
    review = re.sub('[^a-zA-Z]', ' ', str(reviews['review'].iloc[i]))
    review = review.lower()
    review = review.split()

    review = [lemma.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [7]:
#  just print to see Corpus

## Create Word Embedding on the reviews

In [6]:
import tensorflow as tf
print(tf.__version__)

2.15.0


In [7]:
##tensorflow >2.0
from tensorflow.keras.preprocessing.text import one_hot

One Hot Reprensentation

In [8]:
### Vocabulary size
voc_size=500

In [9]:
onehot_repr=[one_hot(words,voc_size)for words in corpus]
print(onehot_repr)

[[279, 55, 29, 441, 277, 298, 417, 287, 59, 23, 74, 145, 473, 42, 145, 472, 98, 461, 420, 258, 381, 274, 377, 320, 183, 44, 77, 261, 379, 115, 352, 482, 169, 106, 234, 130, 461, 64, 111, 42, 418, 441, 299, 18, 346, 461, 276, 180, 484, 277, 372, 441, 273, 307, 325, 160, 437, 106, 142, 42, 460, 247, 441], [340, 331, 117, 13, 441, 120, 452, 357, 51, 384, 282, 253, 495, 111, 293, 380, 441, 441, 203, 371, 88, 130, 355, 355, 152, 130, 499, 119, 101, 473, 461, 75, 117, 239, 499, 33, 62, 360, 441, 174, 30, 297, 401, 159, 87, 114, 253, 98, 322, 296, 277, 222, 211, 198, 282, 180, 394, 249, 230, 317, 134, 436, 372, 210, 344, 176, 203, 127, 296, 144, 352, 272, 473, 456, 284, 481, 415, 383, 430, 296, 259, 61, 50, 236, 116, 106, 355, 355, 276, 416, 105, 408, 471, 224, 486, 408, 355, 355, 284, 130, 355, 355, 24, 325, 388, 431, 140, 496, 119, 236, 40, 234, 50, 62, 152, 269, 206, 441, 388, 375, 59, 322, 425, 197, 116, 455, 129, 147, 487, 80, 454, 352, 487, 32, 212, 197, 401, 117, 371, 32, 70, 226, 255,

## Word Embedding Representation

In [10]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
import numpy as np

In [11]:
## post padding
sent_length=50
embedded_docs=pad_sequences(onehot_repr,padding='post',maxlen=sent_length)
print(embedded_docs)

[[ 42 145 472 ... 460 247 441]
 [431 339 387 ... 108 355 355]
 [174 172 220 ... 355 274 382]
 ...
 [183 176 441 ...   0   0   0]
 [468  79  83 ... 292  59 217]
 [296 355 355 ... 122 198 279]]


In [77]:
## Creating model
embedding_vector_features=40 ##features representation
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(LSTM(100))
model.add(Dense(2,activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 50, 40)            20000     
                                                                 
 lstm_9 (LSTM)               (None, 100)               56400     
                                                                 
 dense_9 (Dense)             (None, 2)                 202       
                                                                 
Total params: 76602 (299.23 KB)
Trainable params: 76602 (299.23 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [78]:
## For Dependant Variable
y=pd.get_dummies(reviews['label'])
y=y.iloc[:,2].values

In [79]:
len(embedded_docs),y.shape # Here embedded_docs is the independant variable

(500, (500,))

In [80]:
X_final=np.array(embedded_docs)
y_final=np.array(y)
X_final.shape,y_final.shape

((500, 50), (500,))

In [81]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

## Model Training

In [82]:
### Finally Training
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7916772d79a0>

Performance Matrix

In [83]:
y_pred=model.predict(X_test)



In [84]:
y_pred=np.where(y_pred > 0.6, 1,0) ##AUC ROC Curve

In [87]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

ValueError: Classification metrics can't handle a mix of binary and multilabel-indicator targets