# Mounting the drive to Google colab

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
!pip install -q -U keras-tuner
import keras_tuner as kt
import heapq

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[K     |████████████████████████████████| 97 kB 3.8 MB/s 
[?25h

In [3]:
stemmer = PorterStemmer()

In [4]:
Movie_review_df = pd.read_csv("/content/drive/MyDrive/NLP/MovieReviewNLP/IMDB-Dataset.csv", nrows = 5000)
Movie_review_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
Movie_review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     5000 non-null   object
 1   sentiment  5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB


#The dataset is a perfectly Balanced dataset

In [None]:
Movie_review_df["sentiment"].value_counts()

negative    2532
positive    2468
Name: sentiment, dtype: int64

# The reviews contain some html tags. We need to remove them along with the other symbols except alphebets and '.'

In [6]:
cleaned = re.compile(r'<.*?>')
review_corpus = []
for i in range(len(Movie_review_df['review'])):
    sentences = nltk.sent_tokenize(Movie_review_df['review'][i])
    sentences = [''.join(re.sub(cleaned,'',sentence)) for sentence in sentences]
    sentences = [''.join(re.sub('[^.a-zA-Z]',' ',sentence)) for sentence in sentences]
    sent_list = []
    for sentence in sentences:
        sentence = sentence.lower()
        words = sentence.split()
        #words = [stemmer.stem(word) for word in words if not word in stopwords.words('english')] 
        sent_list.append(' '.join(words))
    review_corpus.append(''.join(sent_list))

In [7]:
review_corpus[0]

'one of the other reviewers has mentioned that after watching just oz episode you ll be hooked.they are right as this is exactly what happened with me.the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go.trust me this is not a show for the faint hearted or timid.this show pulls no punches with regards to drugs sex or violence.its is hardcore in the classic use of the word.it is called oz as that is the nickname given to the oswald maximum security state penitentary.it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda.em city is home to many..aryans muslims gangstas latinos christians italians irish and more....so scuffles death stares dodgy dealings and shady agreements are never far away.i would say the main appeal of the show is due to the fact that it goes where other shows wouldn t dare.forget pret

In [8]:
Movie_review_df["sentiment"][0]

'positive'

In [9]:
len(review_corpus)

5000

In [10]:
len(review_corpus[0])

1686

# Summarization of the reviews as the reviews are are too long. Summarizing technique used is Extractive Summarization.

In [11]:
stopwords = nltk.corpus.stopwords.words('english')
for i in range(len(review_corpus)):
    #sentences = nltk.sent_tokenize(review_corpus[i])
    sentences = review_corpus[i].split(".")
    #print(sentences)
    sentence_scores = {}
    for unsummarised_sentence in sentences: 
        #print(unsummarised_sentence)
        if len(unsummarised_sentence) >0:    
            word_frequencies = {}
            for word in nltk.word_tokenize(unsummarised_sentence):
                if word not in stopwords:
                    if word not in word_frequencies.keys():
                        word_frequencies[word] = 1
                    else:
                        word_frequencies[word] += 1
            #print(word_frequencies)

            if len(word_frequencies) > 0:
                maximum_frequncy = max(word_frequencies.values())
                for word in word_frequencies.keys():
                    word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)
                # print(word_frequencies)

            #print(word_frequencies)  
            #split_sentence = sentences.split('.')
            for sent in sentences:
                #print(sent)
                for word in nltk.word_tokenize(sent.lower()):
                    if word in word_frequencies.keys():
                        #if len(sent.split(' ')) < 30:
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_frequencies[word]
                        else:
                            sentence_scores[sent] += word_frequencies[word]
            
            
    #print("sentence_scores : ")
    #print(sentence_scores)        
    summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(summary_sentences)
    #print(summary)
    review_corpus[i] = summary


# Model creation

In [12]:
from tensorflow.keras.layers import Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM, Dense, Flatten, Dropout
from tensorflow import keras

In [13]:
vocabulary_size = 500
one_hot_representation = [one_hot(words, vocabulary_size) for words in review_corpus]

In [14]:
print(len(one_hot_representation[0]))
print(one_hot_representation[0][0])

212
127


In [15]:
sent_len = 1000
embeded_docs = pad_sequences(one_hot_representation, padding="pre", maxlen=sent_len)

In [16]:
embedding_vector_feature = 400
model = Sequential()
#model.add(Flatten())
model.add(Embedding(vocabulary_size, embedding_vector_feature, input_length=sent_len))
model.add(LSTM(80, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(80))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
#model.compile(le,loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-4),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [17]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 400)         200000    
_________________________________________________________________
lstm (LSTM)                  (None, 1000, 80)          153920    
_________________________________________________________________
dropout (Dropout)            (None, 1000, 80)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 80)                51520     
_________________________________________________________________
dropout_1 (Dropout)          (None, 80)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 81        
Total params: 405,521
Trainable params: 405,521
Non-trainable params: 0
__________________________________________________

In [18]:
X_final = np.array(embeded_docs)
y_final_df = pd.get_dummies(Movie_review_df['sentiment'], drop_first = True)
y_final = np.array(y_final_df)

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = 0.33, random_state=0)

# Model Training

In [20]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=250)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f11500f0bd0>

# Model Testing with shorter reviews

In [35]:
negative_test_review = "I must admit that this is one of the worst movies I've ever seen. I thought Dennis Hopper had a little more taste than to appear in this kind of yeeeecchh. If this is supposed to be funny then I gotta look around for a new sense of humor. If you're thinking of buying this movie you'd better think again."

In [36]:
cleaned = re.compile(r'<.*?>')
test_review_corpus = []
test_sentences = nltk.sent_tokenize(negative_test_review)
test_sentences = [''.join(re.sub(cleaned,'',sentence)) for sentence in test_sentences]
test_sentences = [''.join(re.sub('[^a-zA-Z]',' ',sentence)) for sentence in test_sentences]
test_sent_list = []
for sentence in test_sentences:
    sentence = sentence.lower()
    words = sentence.split()
    #words = [stemmer.stem(word) for word in words if not word in stopwords.words('english')] 
    sent_list.append(' '.join(words))
test_review_corpus.append(''.join(sent_list))

In [37]:
vocabulary_size = 500
test_one_hot_representation = [one_hot(words, vocabulary_size) for words in test_review_corpus]

In [38]:
sent_len = 1000
test_embeded_docs = pad_sequences(test_one_hot_representation, padding="pre", maxlen=sent_len)

In [39]:
if(model.predict(test_embeded_docs)[0] <= 0.5):
  print("Negative review")
else:
  print("Positive review")

Negative review


In [40]:
positive_test_review = "Halloween is one of the best examples of independent film. It's very well made and has more psychological elements to it than you might realize at first glance. It is a simple movie told very well. The music is perfect and is one of the most haunting scores... If you haven't seen this movie yet, you must check it out. The cast is all terrific. I wish they had never made sequel after sequel. The first one was by far the best and should have ended like it did without having a sequel. It was fun to see Jamie Lee Curtis in the movie. She hasn't seemed to age (she's just as gorgeous today, without the hairdo and seventies clothes). The scenes through the mask are one of the scariest things ever!"

In [41]:
cleaned = re.compile(r'<.*?>')
test_review_corpus = []
test_sentences = nltk.sent_tokenize(positive_test_review)
test_sentences = [''.join(re.sub(cleaned,'',sentence)) for sentence in test_sentences]
test_sentences = [''.join(re.sub('[^a-zA-Z]',' ',sentence)) for sentence in test_sentences]
test_sent_list = []
for sentence in test_sentences:
    sentence = sentence.lower()
    words = sentence.split()
    #words = [stemmer.stem(word) for word in words if not word in stopwords.words('english')] 
    sent_list.append(' '.join(words))
test_review_corpus.append(''.join(sent_list))

In [42]:
vocabulary_size = 500
test_one_hot_representation = [one_hot(words, vocabulary_size) for words in test_review_corpus]

In [43]:
sent_len = 1000
test_embeded_docs = pad_sequences(test_one_hot_representation, padding="pre", maxlen=sent_len)

In [44]:
if(model.predict(test_embeded_docs)[0] <= 0.5):
  print("Negative review")
else:
  print("Positive review")

Positive review


# Saving the Deep Learning model

In [45]:
!pip install h5py

# serialize model to JSON
model_json = model.to_json()
with open("movie_review_model.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights("movie_review_model_weights.h5")

# saving the model in h5 format
model.save("movie_review_model.h5")



# Hyperparameter tuning with Keras 

In [None]:
def model_builder(hp):
  model = Sequential()
  model.add(Flatten())
  model.add(Embedding(vocabulary_size, embedding_vector_feature, input_length=sent_len))
  model.add(keras.layers.Dropout(0.2))

  # Tune the number of units in the first Dense layer
  # Choose an optimal value between 32-512
  hp_units = hp.Int('units', min_value=80, max_value=120, step=10)
  model.add(LSTM(units=hp_units))
  model.add(keras.layers.Dropout(0.4))
  model.add(Dense(1, activation='sigmoid'))

  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-3, 1e-4])

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy'])

  return model

In [None]:
tuner = kt.RandomSearch(model_builder,
                     objective='val_accuracy',
                     max_trials=5,
                     executions_per_trial=3,
                     overwrite=True,
                     directory='/content/drive/MyDrive/NLP/MovieReviewNLP',
                     project_name='MovieReview')

In [None]:
tuner.search(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Trial 5 Complete [00h 06m 52s]
val_accuracy: 0.7056565483411154

Best val_accuracy So Far: 0.7391919294993082
Total elapsed time: 00h 29m 00s

Search: Running Trial #6

Hyperparameter    |Value             |Best Value So Far 
units             |90                |100               
learning_rate     |0.0001            |0.0001            

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
 15/105 [===>..........................] - ETA: 8s - loss: 0.6894 - accuracy: 0.6000

KeyboardInterrupt: ignored

In [None]:
tuner.results_summary()

Results summary
Results in /content/drive/MyDrive/NLP/MovieReviewNLP/MovieReview
Showing 10 best trials
Objective(name='val_accuracy', direction='max')
Trial summary
Hyperparameters:
units: 100
learning_rate: 0.0001
Score: 0.7391919294993082
Trial summary
Hyperparameters:
units: 110
learning_rate: 0.0001
Score: 0.7387878894805908
Trial summary
Hyperparameters:
units: 100
learning_rate: 0.001
Score: 0.7056565483411154
Trial summary
Hyperparameters:
units: 80
learning_rate: 0.001
Score: 0.7054545481999716
