# Importing Libraries

In [1]:
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model
import pickle

Using TensorFlow backend.


# Importing Dataset

In [2]:
# Read the review table and the item catalogue from the working directory.
dataset, items = (
    pd.read_csv(csv_name)
    for csv_name in ('20191226-reviews.csv', '20191226-items.csv')
)

In [3]:
# Preview the review table as loaded from the CSV.
dataset

Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0
1,B0000SX2UC,Luke Wyatt,1,"January 7, 2004",False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0
2,B0000SX2UC,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0
3,B0000SX2UC,amy m. teague,3,"March 18, 2004",False,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1.0
4,B0000SX2UC,tristazbimmer,4,"August 28, 2005",False,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1.0
...,...,...,...,...,...,...,...,...
67981,B081H6STQQ,jande,5,"August 16, 2019",False,"Awesome Phone, but finger scanner is a big mis...",I love the camera on this phone. The screen is...,1.0
67982,B081H6STQQ,2cool4u,5,"September 14, 2019",False,Simply Amazing!,I've been an Xperia user for several years and...,1.0
67983,B081H6STQQ,simon,5,"July 14, 2019",False,"great phon3, but many bugs need to fix. still ...",buy one more for my cousin,
67984,B081TJFVCJ,Tobiasz Jedrysiak,5,"December 24, 2019",True,Phone is like new,Product looks and works like new. Very much re...,


In [4]:
# Keep only the rating and the review text; the remaining metadata columns are
# not used by the sentiment model.
dataset = dataset.drop(
    columns=['asin', 'name', 'date', 'verified', 'helpfulVotes']
)

In [5]:
# Preview the table after the metadata columns were dropped.
dataset

Unnamed: 0,rating,title,body
0,3,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...
1,1,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...
2,5,Love This Phone,"This is a great, reliable phone. I also purcha..."
3,3,"Love the Phone, BUT...!","I love the phone and all, because I really did..."
4,4,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...
...,...,...,...
67981,5,"Awesome Phone, but finger scanner is a big mis...",I love the camera on this phone. The screen is...
67982,5,Simply Amazing!,I've been an Xperia user for several years and...
67983,5,"great phon3, but many bugs need to fix. still ...",buy one more for my cousin
67984,5,Phone is like new,Product looks and works like new. Very much re...


In [6]:
# Binary target: 1 for ratings above 3 stars (4-5), 0 for ratings of 3 or less.
dataset['sentiment'] = (dataset['rating'] > 3).astype('int64')

In [7]:
# Preview the table including the derived sentiment column.
dataset

Unnamed: 0,rating,title,body,sentiment
0,3,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,0
1,1,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,0
2,5,Love This Phone,"This is a great, reliable phone. I also purcha...",1
3,3,"Love the Phone, BUT...!","I love the phone and all, because I really did...",0
4,4,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1
...,...,...,...,...
67981,5,"Awesome Phone, but finger scanner is a big mis...",I love the camera on this phone. The screen is...,1
67982,5,Simply Amazing!,I've been an Xperia user for several years and...,1
67983,5,"great phon3, but many bugs need to fix. still ...",buy one more for my cousin,1
67984,5,Phone is like new,Product looks and works like new. Very much re...,1


In [8]:
# Per-column flag: True when the column contains at least one missing value.
dataset.isna().any()

rating       False
title         True
body          True
sentiment    False
dtype: bool

In [9]:
# Replace missing titles/bodies with the most frequent value of the column.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which emits a FutureWarning in
# pandas 2.x and leaves `dataset` unchanged when Copy-on-Write is enabled.
dataset["title"] = dataset["title"].fillna(dataset["title"].mode()[0])
dataset["body"] = dataset["body"].fillna(dataset["body"].mode()[0])

In [10]:
# Re-check for missing values after imputation; every column should be False.
dataset.isna().any()

rating       False
title        False
body         False
sentiment    False
dtype: bool

# Text Cleaning or Preprocessing

In [11]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Make sure the English stop-word list is available locally before it is read.
nltk.download("stopwords")

# Stemmer used during cleaning, and the list that collects the cleaned titles.
ps = PorterStemmer()
review = []

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Number of rows (reviews) in the table.
len(dataset)

67986

# Removing Punctuation and Numbers, Removing Stop Words, and Stemming

In [13]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every cleaned title to `review`.
review = []

# Build the stop-word set once. Rebuilding it inside the comprehension re-reads
# the word list for every single token, which dominates the cell's runtime.
stop_words = set(stopwords.words('english'))

for title in dataset["title"]:
    # Replace every non-letter with a space, lowercase, and split into tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    # Drop stop words and reduce the remaining tokens to their Porter stems.
    stems = [ps.stem(word) for word in tokens if word not in stop_words]
    review.append(' '.join(stems))

# Creating Independent and Dependent Variables

In [14]:
# Bag-of-words features over the cleaned titles, limited to the 3000 most
# frequent terms; the sparse result is converted to a dense array.
cv = CountVectorizer(max_features = 3000)
x = cv.fit_transform(review).toarray()
# Select the label by column name rather than by position so that a change in
# column order cannot silently pick the wrong column. The double brackets keep
# the 2-D (n_samples, 1) shape.
y = dataset[["sentiment"]].values

# Splitting Data into Training and Test set

In [15]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [16]:
# Dimensions of the training feature matrix: (samples, vocabulary terms).
x_train.shape

(54388, 3000)

In [17]:
# Dimensions of the test feature matrix: (samples, vocabulary terms).
x_test.shape

(13598, 3000)

In [18]:
# Dimensions of the training labels: (samples, 1).
y_train.shape

(54388, 1)

In [19]:
# Dimensions of the test labels: (samples, 1).
y_test.shape

(13598, 1)

# Initializing the model

In [20]:
# Create an empty Sequential model; layers are appended to it one at a time.
model = Sequential()

# Adding Input Layer (first Dense layer; its input size is inferred from the training data)

In [21]:
# First fully connected layer: 3000 ReLU units, uniformly initialised weights.
model.add(
    Dense(3000, activation="relu", kernel_initializer="random_uniform")
)

# Adding Hidden Layer

In [22]:
# Hidden fully connected layer: 4500 ReLU units, uniformly initialised weights.
model.add(
    Dense(4500, activation="relu", kernel_initializer="random_uniform")
)

# Adding Output Layer

In [23]:
# Output layer: a single sigmoid unit producing a value between 0 and 1.
model.add(
    Dense(1, activation="sigmoid", kernel_initializer="random_uniform")
)

# Configuring the learning process

In [24]:
# Binary cross-entropy matches the single sigmoid output; accuracy is reported
# alongside the loss during training.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Training the model

In [25]:
# Train for 20 passes over the training split in mini-batches of 32 samples.
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x7ff55709b9e8>

# Saving Model

In [26]:
# Write the trained model to the file project.h5 in the working directory.
model.save('project.h5')

# Prediction

In [27]:
# Load the model back from project.h5, replacing the in-memory `model`.
model = load_model('project.h5')

In [29]:
# Serialise the fitted vectorizer so the same vocabulary can be reused outside
# this notebook.
with open('CountVectorizer', mode='wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

In [33]:
# Example review text to classify with the trained model.
entered_input = "It is a very bad product"

In [34]:
# Clean the input the same way the training titles were cleaned (letters only,
# lowercase, stop words removed, Porter-stemmed). Without this step, inflected
# words may not match the stemmed vocabulary held by the CountVectorizer.
stop_words = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', entered_input).lower().split()
cleaned_input = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
# Convert to a dense array: the network was trained on dense input.
x_intent = cv.transform([cleaned_input]).toarray()
y_pred = model.predict(x_intent)
# y_pred holds one probability for the single input row; compare the scalar.
if y_pred[0][0] > 0.5:
  print("It is a positive review")
else:
  print("It is a negative review")

It is a negative review


In [35]:
# Example review text to classify with the trained model.
entered_input = "It is a bad product"

In [36]:
# Clean the input the same way the training titles were cleaned (letters only,
# lowercase, stop words removed, Porter-stemmed). Without this step, inflected
# words may not match the stemmed vocabulary held by the CountVectorizer.
stop_words = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', entered_input).lower().split()
cleaned_input = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
# Convert to a dense array: the network was trained on dense input.
x_intent = cv.transform([cleaned_input]).toarray()
y_pred = model.predict(x_intent)
# y_pred holds one probability for the single input row; compare the scalar.
if y_pred[0][0] > 0.5:
  print("It is a positive review")
else:
  print("It is a negative review")

It is a negative review


In [37]:
# Example review text to classify with the trained model.
entered_input = "It is a very good product"

In [38]:
# Clean the input the same way the training titles were cleaned (letters only,
# lowercase, stop words removed, Porter-stemmed). Without this step, inflected
# words may not match the stemmed vocabulary held by the CountVectorizer.
stop_words = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', entered_input).lower().split()
cleaned_input = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
# Convert to a dense array: the network was trained on dense input.
x_intent = cv.transform([cleaned_input]).toarray()
y_pred = model.predict(x_intent)
# y_pred holds one probability for the single input row; compare the scalar.
if y_pred[0][0] > 0.5:
  print("It is a positive review")
else:
  print("It is a negative review")

It is a positive review


In [39]:
# Example review text to classify with the trained model.
entered_input = "Awesome product"

In [40]:
# Clean the input the same way the training titles were cleaned (letters only,
# lowercase, stop words removed, Porter-stemmed). Without this step, inflected
# words may not match the stemmed vocabulary held by the CountVectorizer.
stop_words = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', entered_input).lower().split()
cleaned_input = ' '.join(ps.stem(word) for word in tokens if word not in stop_words)
# Convert to a dense array: the network was trained on dense input.
x_intent = cv.transform([cleaned_input]).toarray()
y_pred = model.predict(x_intent)
# y_pred holds one probability for the single input row; compare the scalar.
if y_pred[0][0] > 0.5:
  print("It is a positive review")
else:
  print("It is a negative review")

It is a positive review
