## Header Files

In [3]:
import pandas as pd
import numpy as np
import random

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from collections import Counter

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras import layers, regularizers
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D,Dropout

lemmatizer = WordNetLemmatizer()
stop = stopwords.words('english')
stop.extend(['@', '.', '#', 'user'])

## Loading Train and Test Data

In [24]:
train_data_add = "./Data/IMDB Dataset.csv"
train_data = pd.read_csv(train_data_add)
one_hot = []
for i in range(len(train_data["sentiment"])):
    if(train_data.iloc[i,1]=="positive"):
        one_hot.append(1)
    else:
        one_hot.append(0)
train_data["encoded"] = one_hot

## Checking Data

In [25]:
print("Training Data")
print(train_data.head(3))
print("Length of Test data = ", len(train_data))

Training Data
                                              review sentiment  encoded
0  One of the other reviewers has mentioned that ...  positive        1
1  A wonderful little production. <br /><br />The...  positive        1
2  I thought this was a wonderful way to spend ti...  positive        1
Length of Test data =  50000


In [26]:
# Converting to Lower Case
for i in range(len(train_data)):
    train_data.iloc[i, 0] = train_data.iloc[i, 0].lower()
print(train_data.head(5))

                                              review sentiment  encoded
0  one of the other reviewers has mentioned that ...  positive        1
1  a wonderful little production. <br /><br />the...  positive        1
2  i thought this was a wonderful way to spend ti...  positive        1
3  basically there's a family where a little boy ...  negative        0
4  petter mattei's "love in the time of money" is...  positive        1


## Tokenization

In [28]:
tokens = []
for i in range(len(train_data)):
    tokens.append(nltk.word_tokenize(train_data.iloc[i, 0]))
print("After Tokenizing          : ", tokens[0])

After Tokenizing          :  ['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'oz', 'episode', 'you', "'ll", 'be', 'hooked', '.', 'they', 'are', 'right', ',', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', ',', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go', '.', 'trust', 'me', ',', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid', '.', 'this', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', ',', 'sex', 'or', 'violence', '.', 'its', 'is', 'hardcore', ',', 'in', 'the', 'classic', 'use', 'of', 'the', 'word.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'it', 'is', 'called', 'oz', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'oswald', 'maximum', 'security', 

## Stop Word Removal

In [29]:
filtered_tokens = []
for i in range(len(tokens)):
    filtered_tokens.append([word for word in tokens[i] if word not in stop])
print("After Removing Stop Words : ", filtered_tokens[0])

After Removing Stop Words :  ['one', 'reviewers', 'mentioned', 'watching', '1', 'oz', 'episode', "'ll", 'hooked', 'right', ',', 'exactly', 'happened', 'me.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scenes', 'violence', ',', 'set', 'right', 'word', 'go', 'trust', ',', 'show', 'faint', 'hearted', 'timid', 'show', 'pulls', 'punches', 'regards', 'drugs', ',', 'sex', 'violence', 'hardcore', ',', 'classic', 'use', 'word.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focuses', 'mainly', 'emerald', 'city', ',', 'experimental', 'section', 'prison', 'cells', 'glass', 'fronts', 'face', 'inwards', ',', 'privacy', 'high', 'agenda', 'em', 'city', 'home', 'many', '..', 'aryans', ',', 'muslims', ',', 'gangstas', ',', 'latinos', ',', 'christians', ',', 'italians', ',', 'irish', '....', 'scuffles', ',', 'death', 'stares', ',', 'dodgy', 'dealings', 

## Lemmatization

In [30]:
lemmatizers = []
for i in range(len(filtered_tokens)):
    lemmatizers.append([lemmatizer.lemmatize(word) for word in filtered_tokens[i]])
print("After Lemmatization       : ", lemmatizers[0])

After Lemmatization       :  ['one', 'reviewer', 'mentioned', 'watching', '1', 'oz', 'episode', "'ll", 'hooked', 'right', ',', 'exactly', 'happened', 'me.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scene', 'violence', ',', 'set', 'right', 'word', 'go', 'trust', ',', 'show', 'faint', 'hearted', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', ',', 'sex', 'violence', 'hardcore', ',', 'classic', 'use', 'word.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focus', 'mainly', 'emerald', 'city', ',', 'experimental', 'section', 'prison', 'cell', 'glass', 'front', 'face', 'inwards', ',', 'privacy', 'high', 'agenda', 'em', 'city', 'home', 'many', '..', 'aryan', ',', 'muslim', ',', 'gangsta', ',', 'latino', ',', 'christian', ',', 'italian', ',', 'irish', '....', 'scuffle', ',', 'death', 'stare', ',', 'dodgy', 'dealing', 'shady', 'agreement'

In [31]:
"""
 The next step currently is converting these words into numbers based on occurances and padding with zeros for length  
"""
a=0
for i in range(len(lemmatizers)):
    a=max(a,len(lemmatizers[i]))

In [32]:
total_words=[]
for i in range(len(lemmatizers)):
    total_words.extend(lemmatizers[i])


tokenizer = Tokenizer(
    num_words = total_words,
    filters = '"#$%&()*+-/:;<=>@[\]^_`{|}~'
)
tokenizer.fit_on_texts(lemmatizers)

In [33]:
print(a)

1706


## Finding Highest frequency of words and creating the int dict

In [34]:
lemmatizers_to_int = Counter(total_words)
total_word_count = len(total_words)
sorted_order = lemmatizers_to_int.most_common(total_word_count)
print(lemmatizers_to_int)
vocab_to_index = {w: i+1 for i, (w, c) in enumerate(sorted_order)}
print(vocab_to_index)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Encoding to numbers

In [35]:
num_encoded_reviews = []
for i in range(len(lemmatizers)):
    num_encoded_reviews.append([vocab_to_index[word] for word in lemmatizers[i]])
print(num_encoded_reviews)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Padding and truncating

In [20]:
padded_reviews = pad_sequences(num_encoded_reviews, maxlen=90)
print(padded_reviews)
y_train = np.asarray(np.array(train_data.iloc[:,1])).astype('float32').reshape((-1,1))


[[    0     0     0 ...   149  7456   361]
 [    0     0     0 ...  7458 15346  9704]
 [    0     0     0 ...     0    33  3159]
 ...
 [    0     0     0 ...  7208    44    82]
 [    0     0     0 ...  1490  1491   491]
 [    0     0     0 ...     0   121   131]]


In [21]:
ping=[]
for i in range(len(num_encoded_reviews)):
    ping.append((num_encoded_reviews[i],y_train[i]))
random.shuffle(ping)

training_set = ping[:20000]
testing_set = ping[20000:]

In [22]:
# Now we are ready to train our model through LSTM network
model = Sequential()
model.add(layers.Embedding(len(vocab_to_index)+1,output_dim = 10,input_length = 90))
model.add(SpatialDropout1D(0.3))
model.add(layers.LSTM(90))
model.add(layers.Dense(1,activation='softmax', input_shape=(50,)))

model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy'])

checkpoint1 = ModelCheckpoint("best_mode1_11-02-2021.hdf5",monitor='val-accuracy',save_best_only = True, save_weights_only= False)

# Our vectorized labels
model.fit(np.array(padded_reviews), y_train, validation_split=0.2 , epochs=50, callbacks=[checkpoint1], verbose=1 )

Epoch 1/50
Epoch 2/50

KeyboardInterrupt: 

In [None]:
count=0
for i in range(len(train_data.iloc[:,1])):
    if(train_data.iloc[i,1]==1):
        count=count+1

In [None]:
count,len(train_data.iloc[:,1])-count

In [None]:
len(train_data.iloc[:,1])


In [None]:
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim = len(vocab_to_index)+1, output_dim = 256, input_length = 70))
model_lstm.add(SpatialDropout1D(0.3))
model_lstm.add(LSTM(256, dropout = 0.3, recurrent_dropout = 0.3))
model_lstm.add(Dense(256, activation = 'relu'))
model_lstm.add(Dropout(0.3))
model_lstm.add(Dense(1, activation = 'softmax'))
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy']
)

In [None]:
batch_size = 512
epochs = 8
X_train= np.array(padded_reviews)
y_train = np.array(train_data.iloc[:,1])

In [None]:
history = model_lstm.fit(
    X_train,
    y_train,
    validation_split = 0.1,
    epochs = 8,
    batch_size = 512
)

In [None]:
X_train[0]