In [1]:
## Steps
# 1. Data cleaning and Preprocessing
# 2. Applying Train Test Split
# 3. Text Processing ( sentences -> vectors )
# 4. Model Training
# 5. Prediction

In [2]:
## Loading Dataset

import pandas as pd
data = pd.read_csv("all_kindle_review.csv")
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [3]:
## We doesn't need all columns(features) , so we are selecting specific columns
data = data[["reviewText","rating"]]
data.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [4]:
## Input Feature = reviewText
## Output Feature = rating

In [12]:
## Data Insights
def data_insights_fun():
  print("Dimensions = ",data.shape)
  print("Null Values =\n",data.isnull().sum())
  print("Summary = ",data.info())

data_insights_fun()

Dimensions =  (12000, 2)
Null Values =
 reviewText    0
rating        0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  12000 non-null  object
 1   rating      12000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 187.6+ KB
Summary =  None


In [13]:
## Unique values in "rating" column
data["rating"].unique()

array([3, 5, 4, 2, 1])

In [14]:
## checking for imbalance dataset
data['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,3000
4,3000
3,2000
2,2000
1,2000


In [15]:
## It is not an Imbalance Dataset

In [16]:
## We have 5 categories of rating in "rating" column (1,2,3,4,5) but we need only two values in output
## positive rating = 1 ( rating(x) > 3 )
## negative rating = 0 ( rating(x) < 3 )
data["rating"] = data["rating"].apply(lambda x : 0 if x < 3 else 1)
data["rating"]

Unnamed: 0,rating
0,1
1,1
2,1
3,1
4,1
...,...
11995,1
11996,1
11997,1
11998,0


In [17]:
data["rating"].unique(),data["rating"].value_counts()

(array([1, 0]),
 rating
 1    8000
 0    4000
 Name: count, dtype: int64)

In [18]:
## Now our output feature is divided into 2 categories and it is still balanced

In [21]:
## Data Cleaning and Preprocessing

## importing required libraries
import nltk
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
def data_cleaning():
  ## lowering case of "reviewText" column
  data["reviewText"] = data["reviewText"].str.lower()
  ## removing special characters
  data["reviewText"] = data["reviewText"].apply(lambda x : re.sub("[^a-z A-Z 0-9- ]+"," ",x)) ## values except expression will be replaced by " "
  ## removing stopwords
  data["reviewText"] = data["reviewText"].apply(lambda x : " ".join([ word for word in x.split() if word not in stopwords.words("english")]))
  ## removing url
  data["reviewText"] = data["reviewText"].apply(lambda x : re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'," ",str(x)))
  ## removing html tags
  data["reviewText"] = data["reviewText"].apply(lambda x : BeautifulSoup(x,"lxml").get_text())
  ## removing any additional spaces
  data["reviewText"] = data["reviewText"].apply(lambda x : " ".join(x.split()))
  return None

data_cleaning()

In [24]:
## Applying Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

## lemmatization function
def lemmatize_fun(x):
    return " ".join([lemmatizer.lemmatize(word) for word in x.split()])

## applying function
data["reviewText"] = data["reviewText"].apply(lambda x : lemmatize_fun(x))

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [25]:
data["reviewText"].head()

Unnamed: 0,reviewText
0,jace rankin may short nothing mess man hauled ...
1,great short read want put read one sitting sex...
2,start saying first four book expecting 34 conc...
3,aggie angela lansbury carry pocketbook instead...
4,expect type book library pleased find price right


In [26]:
## Applying Train Test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(data["reviewText"],data["rating"],test_size=0.20)

In [27]:
## Applying BOW and TF-DF
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer()
X_train_bow = bow.fit_transform(X_train).toarray()
X_test_bow = bow.transform(X_test).toarray()

## fit_transform for training data and only transform for test data to prevent Data leakage
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [28]:
X_train_bow,X_train_bow.shape,X_train_tfidf,X_train_tfidf.shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]),
 (9600, 24803),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 (9600, 24803))

In [29]:
## Applying ML Algorithm
from sklearn.naive_bayes import GaussianNB ## we can also use multinomial naive bayes
gaussian_nb = GaussianNB()

## training the model
bow_model = gaussian_nb.fit(X_train_bow,y_train)
tfidf_model = gaussian_nb.fit(X_train_tfidf,y_train)

In [31]:
## Prediction
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

y_pred_bow = bow_model.predict(X_test_bow)
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)

In [33]:
## confusion matrix
print(confusion_matrix(y_test,y_pred_bow))
print(confusion_matrix(y_test,y_pred_tfidf))

[[ 475  353]
 [ 571 1001]]
[[513 315]
 [732 840]]


In [34]:
## accuracy score
print("BOW accuracy score = " ,accuracy_score(y_test,y_pred_bow))
print("TF-IDF accuracy score = ",accuracy_score(y_test,y_pred_tfidf))

BOW accuracy score =  0.615
TF-IDF accuracy score =  0.56375


In [35]:
## Not too good accuracy so we can use Word2vec model for better accuracy👌