In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.layers import Dense, InputLayer
from tensorflow.keras.models import Sequential
import nltk
import re
import tensorflow as tf
from bs4 import BeautifulSoup
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [4]:
# Remove the html strips from the dataset
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
# Removing the square brackets
def remove_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

# Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_square_brackets(text)
    return text
df['review'] = df['review'].apply(denoise_text)

# Remove the special characters from the dataset
def remove_special_chars(text, remove_digits = True):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern,'',text)
    return text
df['review'] = df['review'].apply(remove_special_chars)

In [5]:
# Remove the stop words from the review
tokenizer = ToktokTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, is_lower_case = False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stop_words]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

df['review'] = df['review'].apply(remove_stopwords)
df.head()

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,Basically theres family little boy Jake thinks...,negative
4,Petter Matteis Love Time Money visually stunni...,positive


In [6]:
df['sentiment'] = df['sentiment'].map({'negative':0, 'positive':1})

In [7]:
test_reviews = df.review[:10000]
test_sentiments = df.sentiment[:10000].values

train_reviews = df.review[10000:]
train_sentiments = df.sentiment[10000:].values
print(train_reviews.shape, test_reviews.shape)
print(train_sentiments.shape, test_sentiments.shape)

(40000,) (10000,)
(40000,) (10000,)


In [8]:
#Count vectorizer for bag of words
cv = CountVectorizer(max_features = 10000)

train_reviews = cv.fit_transform(train_reviews)
test_reviews = cv.transform(test_reviews)

print('Train:', train_reviews.shape)
print('Test:', test_reviews.shape)
train_reviews = train_reviews.toarray()
test_reviews = test_reviews.toarray()
print(train_reviews.dtype)
print(train_sentiments.dtype)

Train: (40000, 10000)
Test: (10000, 10000)
int64
int64


In [9]:
n_words = train_reviews.shape[1]
print(n_words)

10000


In [10]:
model = Sequential()
model.add(InputLayer(input_shape = (n_words,)))
model.add(Dense(16, activation = 'relu'))
model.add(Dense(16, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))

In [11]:
model.compile(optimizer = 'rmsprop',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

In [12]:
model.fit(train_reviews, train_sentiments,
          epochs = 20, batch_size = 100, validation_data = (test_reviews, test_sentiments))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x15fb55974c0>