<a href="https://colab.research.google.com/github/summit99/Major_ml/blob/main/Review_Analysis_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis of Cell Phone Reviews

***************************************************

## Importing required libraries

In [1]:
from google.colab import drive

In [2]:
# Mount Google Drive inside the Colab VM; the review CSV loaded below is read
# from a path under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing and cleaning the dataset

In [5]:
# Location of the raw review export on the mounted Drive.
REVIEWS_CSV = '/content/drive/MyDrive/review_data/data/20191226-reviews.csv'

# Raw reviews frame; later cells derive the cleaned dataset from it.
df = pd.read_csv(REVIEWS_CSV)

In [6]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0
1,B0000SX2UC,Luke Wyatt,1,"January 7, 2004",False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0
2,B0000SX2UC,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0
3,B0000SX2UC,amy m. teague,3,"March 18, 2004",False,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1.0
4,B0000SX2UC,tristazbimmer,4,"August 28, 2005",False,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1.0


In [7]:
# (rows, columns) of the raw frame.
df.shape

(67986, 8)

In [8]:
# Per-column dtype and non-null count, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67986 entries, 0 to 67985
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   asin          67986 non-null  object 
 1   name          67984 non-null  object 
 2   rating        67986 non-null  int64  
 3   date          67986 non-null  object 
 4   verified      67986 non-null  bool   
 5   title         67972 non-null  object 
 6   body          67965 non-null  object 
 7   helpfulVotes  27215 non-null  float64
dtypes: bool(1), float64(1), int64(1), object(5)
memory usage: 3.7+ MB


In [9]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

asin                0
name                2
rating              0
date                0
verified            0
title              14
body               21
helpfulVotes    40771
dtype: int64

In [10]:
# Rows that have no review title, kept aside for inspection.
title_miss = df.loc[df['title'].isna()]

In [12]:
# Show the reviews that are missing a title.
title_miss

Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
30010,B01NB1KG8U,Sylvester Ofosuhene,5,"December 24, 2019",True,,,
30949,B06XR1K6HR,MOHAMED ALI,5,"January 17, 2019",True,,Almost like pretty new,
32883,B06XSF5C42,Candice,5,"June 13, 2019",True,,Love this phone. Everything's worked great. So...,
35016,B071H9KKKF,Wauany,5,"June 17, 2018",True,,Like the phone so far!!! Never had an expensiv...,
42935,B077T4MVZ6,Evaldina,4,"November 14, 2018",True,,Love it,
45899,B079X7DQ4Q,Roberto,5,"November 25, 2019",True,,,
45905,B079X7DQ4Q,Mahmood al rahawi,5,"December 7, 2018",True,,"I get that phone I needed ,, thanks .",
46470,B07BHT4KGM,Roberto,5,"November 25, 2019",True,,,
46476,B07BHT4KGM,Mahmood al rahawi,5,"December 7, 2018",True,,"I get that phone I needed ,, thanks .",
50404,B07FZH9BGV,Henry,5,"November 1, 2018",True,,Great phone...A++,1.0


In [13]:
# Narrow the title-less rows down to the fields relevant for sentiment.
title_miss.loc[:, ['rating', 'title', 'body']]

Unnamed: 0,rating,title,body
30010,5,,
30949,5,,Almost like pretty new
32883,5,,Love this phone. Everything's worked great. So...
35016,5,,Like the phone so far!!! Never had an expensiv...
42935,4,,Love it
45899,5,,
45905,5,,"I get that phone I needed ,, thanks ."
46470,5,,
46476,5,,"I get that phone I needed ,, thanks ."
50404,5,,Great phone...A++


In [14]:
# Keep only the reviews that actually have a title (the text used as model input).
dataset = df[df['title'].notna()]

In [15]:
# Renumber the title-bearing rows 0..n-1 so the positional lookups in the
# cleaning loop (reviews[i]) line up with the row labels.
# The frame is rebuilt from `df` instead of calling reset_index(inplace=True)
# on the filtered slice: the in-place form mutates a slice of `df` and inserts
# yet another index column every time the cell is re-run. reset_index() still
# keeps the original row labels in an 'index' column, as before.
dataset = df[df.title.notna()].reset_index()

In [16]:
# Missing values per column after dropping the title-less rows.
dataset.isna().sum()

index               0
asin                0
name                2
rating              0
date                0
verified            0
title               0
body               16
helpfulVotes    40758
dtype: int64

In [17]:
# Model input text (review titles) and the raw star ratings used for labels.
reviews = dataset['title']
ratings = dataset['rating']

In [21]:
# Binary sentiment label: 1 for ratings of 3 stars and above, 0 otherwise.
# The threshold is applied to dataset.rating rather than to `ratings` itself so
# that the cell is idempotent: re-running the self-referential form thresholds
# the already binarised 0/1 array and silently turns every label into 0.
ratings = np.where(dataset.rating >= 3, 1, 0)

In [22]:
# (rows, columns) after removing the rows without a title.
dataset.shape

(67972, 9)

In [23]:
# Peek at the raw review titles before any text cleaning.
reviews.head()

0                     Def not best, but not worst
1                     Text Messaging Doesn't Work
2                                 Love This Phone
3                         Love the Phone, BUT...!
4    Great phone service and options, lousy case!
Name: title, dtype: object

 ## Text Preprocessing

### Import required libraries

In [24]:
import nltk #Natural Language Toolkit library
from nltk.corpus import stopwords #Library to remove stopwords
from nltk.stem.porter import PorterStemmer #Library to stem words
import re 

In [25]:
# Fetch the NLTK stop-word corpus that stopwords.words('english') reads from
# in the cleaning loop below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [26]:
# Stemmer instance applied to every token in the cleaning loop below.
ps = PorterStemmer()

In [27]:
# Number of review titles to process.
m = reviews.shape[0]

In [28]:
# Clean every review title into a space-joined string of stemmed tokens.
#
# Steps per title: replace every non-letter with a space, lower-case, split on
# whitespace, drop English stop words, stem what is left.
#
# The stop-word set and the regex are built once up front; rebuilding
# set(stopwords.words('english')) for every token reloads the word list for
# each word of each title and dominates the runtime.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

data = []
# Iterate over the values directly so the loop does not depend on the Series
# carrying a 0..n-1 label index.
for title in reviews:
    tokens = non_letters.sub(' ', title).lower().split()
    data.append(' '.join(ps.stem(token) for token in tokens if token not in stop_words))

### Creating bag of words model

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [30]:
# Bag-of-words vectorizer with default settings; it is fitted on the cleaned
# titles below and pickled at the end of the notebook.
cv = CountVectorizer()

In [31]:
# Learn the vocabulary from the cleaned titles and count term occurrences.
bow_sparse = cv.fit_transform(data)

# Densify the count matrix: one row per title, one column per vocabulary term.
X = bow_sparse.toarray()

## Creating train and test set

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# Hold out 25% of the samples for the final evaluation; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ratings.ravel(),
    test_size=0.25,
    random_state=42,
)

In [34]:
# Width of the bag-of-words matrix; used as the input size of the network.
max_features = X.shape[1]

In [35]:
# Show the number of input features (columns of X).
max_features

6984

# Model Building

## Libraries to build model

In [36]:
import tensorflow.keras as keras

In [37]:
# Start from an empty Sequential model; the following cells append its layers
# one at a time.
model = keras.Sequential() #Model Instantiation

In [38]:
# Each sample is a vector of length max_features (one entry per column of X).
model.add(keras.layers.Input(shape = (max_features,))) #Input Layer

In [39]:
# Single hidden layer: 200 ReLU units followed by dropout at rate 0.8.
hidden_layer = keras.layers.Dense(units=200, activation='relu')
dropout_layer = keras.layers.Dropout(rate=0.8)

model.add(hidden_layer)
model.add(dropout_layer)

In [40]:
# One sigmoid unit to match the binary 0/1 labels and the binary_crossentropy
# loss used at compile time.
model.add(keras.layers.Dense(units = 1, activation = 'sigmoid')) #Output Layer

In [41]:
# Binary classification setup: Adam optimiser, cross-entropy loss, and
# accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [42]:
# Callback that writes the model to 'sentiment' during training, keeping only
# the best version seen so far.
model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='sentiment',
    save_best_only=True,
)

In [43]:
# Train for 100 epochs, holding back 20% of the training data for validation;
# the checkpoint callback saves the model as training progresses.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    callbacks=[model_checkpoint],
)

Epoch 1/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 2/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 3/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 4/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 5/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 6/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 7/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 8/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 9/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 10/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 11/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 12/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 13/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 14/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 15/100
INFO:tensorflow:Assets written to: sentiment/assets
Epoch 16/100
INFO:tensorflow:Asset

In [44]:
# Score the held-out test split; with the compile settings above the result is
# [loss, accuracy].
# NOTE(review): this evaluates the in-memory `model` as it stands after the
# last epoch, not the checkpoint written by ModelCheckpoint.
# NOTE(review): a loss of ~0 together with accuracy 1.0 is suspicious -
# confirm that y_train / y_test contain both classes before trusting it.
model.evaluate(X_test, y_test)



[5.153904316466799e-18, 1.0]

In [45]:
# Reload the model that the ModelCheckpoint callback saved under 'sentiment'.
best = keras.models.load_model('sentiment')

In [46]:
# Export the checkpointed model (reloaded above as `best`) to a single HDF5
# file. Saving `model` instead would export the weights from the final epoch
# and make the save_best_only checkpoint and the reload above pointless.
best.save('sentiment.h5')

## Save CountVectorizer object

In [47]:
import pickle

In [48]:
# Persist the fitted vectorizer so the same vocabulary can be reused when
# scoring new text with the saved model.
serialized_cv = pickle.dumps(cv)
with open('countvectorizer', 'wb') as fout:
    fout.write(serialized_cv)