# 1. Essentials

In [2]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import warnings
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding
from tensorflow.keras.optimizers import Adam,SGD
from tensorflow.keras.losses import BinaryCrossentropy,SparseCategoricalCrossentropy,mse
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

from collections import Counter


warnings.filterwarnings('ignore')

# 2. Data Loading

In [4]:

df = pd.read_csv(r"C:\Users\aredd\Downloads\IMDB Dataset.csv\IMDB Dataset.csv")
df.sample(2)

Unnamed: 0,review,sentiment
13617,where would one start a review of the film Sni...,positive
20108,Scenarist Frederick Fox's sometimes memorable ...,negative


# 3. Data Outlook

In [5]:
df['sentiment'] = pd.get_dummies(df.sentiment)['positive']
print("Null Values\n", df.isnull().sum())

print("\n \t Review Balance count\n", Counter(df.sentiment))
print('')
df.sample(2)


Null Values
 review       0
sentiment    0
dtype: int64

 	 Review Balance count
 Counter({1: 25000, 0: 25000})



Unnamed: 0,review,sentiment
1097,Lame rip-off of THE QUATERMASS XPERIMENT (1955...,0
16680,This is a 100% improvement over the dross of a...,1


# 4. Pre-processing

In [6]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()


def process_text(df,name):
    corpus = []
    for i,txt in enumerate(df[name]):
        text = txt.lower()
        text = re.sub(r'<.*?>',' ',text)
        text = re.sub('[^a-zA-Z0-9]',' ',text)
        df[name][i] = [lemmatizer.lemmatize(x) for x in word_tokenize(text) if x not in stop_words and len(x) > 2]
        corpus.append(' '.join(df[name][i]))
    return corpus

In [7]:
cp = process_text(df,'review')
total_words = len(cp)
maxlen = 128

# 5. Data Splitting (Train & Validation)

In [19]:
X_train,X_test,y_train,y_test = train_test_split(tmp,df.sentiment,test_size=0.3,random_state=1042)

# 6. Text Vectorization

In [20]:
tokenizer = Tokenizer(maxlen,oov_token='<OOV>',)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train,maxlen=maxlen,padding='post')

In [35]:
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test,maxlen=maxlen,padding='post')

In [30]:
# For visualization (TensorBoard) and early stoping
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
)


# 7. Defining LSTM Model

In [32]:
model = Sequential([
    Embedding(input_dim=total_words,output_dim=100,input_length=maxlen,),
    LSTM(128,dropout=0.2,name="Main-LSTM",return_sequences=True),
    LSTM(128,dropout=0.1,name='Second-Lstm'),
    Dense(1,activation='sigmoid')
])

In [33]:
model.compile(Adam(),BinaryCrossentropy(),'accuracy')

# 8. Training Model

In [36]:
model.fit(X_train,y_train,epochs=20,validation_data=(X_test,y_test),callbacks=[tensorboard_callback,stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20


<keras.callbacks.History at 0x21549bb8b80>

# 9. Testing the model with custom review

In [81]:
# positive review
test_sent = 'This is once in a life-time movie i loved the movie must watch'

In [82]:
cp = []
test = test_sent.lower()
test = re.sub(r'<.*?>',' ',test)
test = re.sub('[^a-zA-Z0-9]',' ',test)
test = [lemmatizer.lemmatize(x) for x in word_tokenize(test) if x not in stop_words and len(x) > 2]
cp.append(' '.join(test))

In [83]:
test_ = tokenizer.texts_to_sequences(cp)
test_ = pad_sequences(test_,maxlen=maxlen,padding='post')

In [84]:
test_

array([[ 29,   6,   2,   1,   2, 109,  32,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

In [85]:
pred = model.predict(test_)
pred = [1 if pred > 0.5 else 0]



In [92]:
print("1 ==> Positive Review and 0 ==> Negative Review\n")
print(f"predicted sentiment of the review : '{test_sent}' is = {pred}")

1 ==> Positive Review and 0 ==> Negative Review

predicted sentiment of the review : 'This is once in a life-time movie i loved the movie must watch' is = [1]


In [43]:
# END #