In [None]:
import numpy as np
import pandas as pd
import time, os, nltk
from functools import reduce

from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = (30, 10)

from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

## If you are using Google Colab, use the following code to download the dataset from Kaggle

## If you are using a local machine, follow the steps below

In [None]:
# Root folder that holds the dataset; every later path is built from it
folder_path = "data/"

# exist_ok=True keeps this cell re-runnable (a plain os.mkdir raises
# FileExistsError when the folder is already there)
os.makedirs(folder_path, exist_ok=True)

In [None]:
# Create the train/ and test/ folders under the dataset root.
# exist_ok=True makes the cell safe to run again after the folders exist.
os.makedirs(folder_path + "train/", exist_ok=True)
os.makedirs(folder_path + "test/", exist_ok=True)

In [None]:
# Collect all positive review files.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the sample order (and therefore the seeded train/validation split) reproducible.
pos_files = sorted(os.listdir(folder_path + "train/pos/"))

In [None]:
print(pos_files[:10])

In [None]:
# Collect all negative review files.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the sample order (and therefore the seeded train/validation split) reproducible.
neg_files = sorted(os.listdir(folder_path + "train/neg/"))

In [None]:
print(neg_files[:10])

In [None]:
# Total number of files positive & negative i.e. equals to total number of reviews
print(len(pos_files), len(neg_files))

In [None]:
# Containers filled while the review files are read:
#   X         -> tokenized review texts
#   Y         -> labels (positive = 1 & negative = 0)
#   unq_words -> every token encountered (reduced to unique words further down)
X, Y, unq_words = [], [], []

In [None]:
# Download the 'punkt' tokenizer data of the nltk library
nltk.download('punkt')

In [None]:
# Read all positive reviews & put them inside X & their respective label inside Y.
# - encoding is given explicitly so decoding does not depend on the platform default
# - read() takes the whole file; readline() would silently drop any text after
#   the first line break
for file in pos_files:
    with open(folder_path + "train/pos/" + file, "r", encoding="utf-8") as f:
        sentence = f.read().lower()
    # The file is no longer needed once its text is in memory
    tokens = nltk.word_tokenize(sentence)
    unq_words.extend(tokens)
    X.append(tokens)
    Y.append(1)

In [None]:
len(X), len(Y), len(unq_words)

In [None]:
print(X[:2])

In [None]:
print(unq_words[:10])

In [None]:
# Read all negative reviews & put them inside X & their respective label inside Y.
# - encoding is given explicitly so decoding does not depend on the platform default
# - read() takes the whole file; readline() would silently drop any text after
#   the first line break
for file in neg_files:
    with open(folder_path + "train/neg/" + file, "r", encoding="utf-8") as f:
        sentence = f.read().lower()
    # The file is no longer needed once its text is in memory
    tokens = nltk.word_tokenize(sentence)
    unq_words.extend(tokens)
    X.append(tokens)
    Y.append(0)

In [None]:
len(X), len(Y), len(unq_words)

In [None]:
# Wrap the tokenized reviews and their labels in a DataFrame for quick analysis
dataset = pd.DataFrame(dict(Sentence=X, Label=Y))

In [None]:
dataset.head(10)

In [None]:
dataset.tail(10)

In [None]:
# Class balance check: horizontal bar chart of how often each label occurs
label_counts = dataset["Label"].value_counts()
label_counts.plot.barh()

In [None]:
# Let's now find out unique words.
# The set is sorted because set iteration order for strings changes between
# kernel sessions; without it every word index derived from this list would
# differ from run to run.
unq_words = sorted(set(unq_words))

# Number of unique words
len(unq_words)

In [None]:
unq_words[:10]

In [None]:
# Add 2 more words i.e. PAD for padding (index 0) & UNK for unknown words (index 1).
# The guard keeps the cell idempotent: running it twice would otherwise insert
# the tokens again and shift every word index.
if unq_words[:2] != ["PAD", "UNK"]:
    unq_words.insert(0, "PAD")
    unq_words.insert(1, "UNK")

In [None]:
unq_words[:10]

In [None]:
# Number of unique words after adding PAD & UNK
len(unq_words)

In [None]:
# Two lookup tables built from the vocabulary list:
#   word2idx encodes a token as its position in unq_words
#   idx2word decodes a position back into the token
word2idx = {token: position for position, token in enumerate(unq_words)}
idx2word = dict(enumerate(unq_words))

In [None]:
word2idx.get("inning")

In [None]:
idx2word.get(7877)

In [None]:
# Token count of every review, in the same order as X
sentences_len = list(map(len, X))

In [None]:
# Maximum size of any particular sentence
max_sentence_size = max(sentences_len)
max_sentence_size

In [None]:
# Minimum size of any particular sentence
min_sentence_size = min(sentences_len)
min_sentence_size

In [None]:
# Average size of any particular sentence (floor division, so the result is an int)
avg_sentence_size = sum(sentences_len) // len(sentences_len)
avg_sentence_size

In [None]:
# Plot each review's token count against its position in the dataset
review_positions = range(len(sentences_len))
plt.plot(review_positions, sentences_len)

In [None]:
# This function converts tokenized sentences into lists of word indices
def num_sentence(sentences_tokens, vocab=None, unk_idx=1):
    """Map every token of every sentence to its integer index.

    Args:
        sentences_tokens: iterable of sentences, each an iterable of tokens.
        vocab: mapping from token to index. When omitted, the notebook-level
            ``word2idx`` dictionary is used, as before.
        unk_idx: index assigned to tokens missing from ``vocab``. The default
            of 1 is the position of "UNK" in the vocabulary list.

    Returns:
        A list of lists of integer indices with the same nesting as the input.
    """
    if vocab is None:
        # Resolved at call time so a rebuilt word2idx is always picked up
        vocab = word2idx

    return [[vocab.get(word, unk_idx) for word in sent] for sent in sentences_tokens]

In [None]:
num_X = num_sentence(X)

In [None]:
len(num_X)

In [None]:
print(num_X[:2])

In [None]:
# Now divide data into training & validation data:
# 10% of the samples are held out (test_size = 0.10) and random_state = 10
# fixes the shuffle so the same split is produced on every run
train_x, valid_x, train_y, valid_y = train_test_split(num_X, Y, test_size = 0.10, random_state = 10)

In [None]:
# Bring every training sentence to a length of exactly 750:
# shorter sentences are padded at the front with 0 (the index of "PAD"),
# longer ones are truncated at the front (padding = truncating = "pre")
train_x_pad = pad_sequences(train_x, maxlen = 750, padding = "pre", truncating = "pre", value = 0)

In [None]:
plt.plot(list(range(len(train_x_pad))), [len(sent) for sent in train_x_pad])

In [None]:
# Model input data dimensions
timesteps = 750  # sequence length; same value as maxlen used for pad_sequences
data_dim = 1  # one value (the word index) per timestep
output_len = 1  # size of the final Dense layer

In [None]:
# Neural network architecture: two stacked LSTM layers followed by a single
# sigmoid unit for the binary positive/negative decision
model = Sequential()

# The first LSTM returns the whole sequence so the second LSTM receives one
# vector per timestep
model.add(LSTM(50, input_shape=(timesteps, data_dim), return_sequences = True))
model.add(Dropout(0.2))

# The second LSTM only returns its last output, which feeds the classifier head
model.add(LSTM(100, return_sequences = False))
model.add(Dropout(0.2))

model.add(Dense(output_len))
model.add(Activation("sigmoid"))

start = time.time()
# metrics=["accuracy"] makes fit() report classification accuracy next to the
# loss; without it only the loss value is shown during training
model.compile(loss = "binary_crossentropy", optimizer = "rmsprop", metrics = ["accuracy"])
print ('compilation time : ', time.time() - start)

In [None]:
model.summary()

In [None]:
# let's check our data is in proper shape
train_x_pad = np.array(train_x_pad)
train_x_pad.shape

In [None]:
# Append a trailing axis of size 1 so every timestep carries exactly one value
n_samples, n_steps = train_x_pad.shape[:2]
train_x_pad = np.reshape(train_x_pad, (n_samples, n_steps, 1))
train_x_pad.shape

In [None]:
print(train_x_pad[:2])

In [None]:
# Now data is in proper format let's fit it.
# train_y is a plain Python list; it is converted to a NumPy array because the
# TensorFlow-backed Keras data adapters reject a list of labels when the inputs
# are a NumPy array. 5% of the training data is held back for validation.
model.fit(train_x_pad, np.array(train_y), batch_size = 128, epochs = 10, validation_split = 0.05, verbose = 2)