In [32]:
import pandas as pd
data = pd.read_csv('all_kindle_review.csv')
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [20]:
data = data[['reviewText','rating']]
data.head()

Unnamed: 0,reviewText,rating
0,Jace Rankin may short nothing mess man hauled ...,1
1,Great short read I want put I read one sitting...,1
2,I start saying first four books I expecting 34...,1
3,Aggie Angela Lansbury carries pocketbooks inst...,1
4,I expect type book library pleased find price ...,1


In [21]:
data.isnull().sum()

reviewText    0
rating        0
dtype: int64

In [22]:
data.shape

(12000, 2)

In [23]:
data['rating'].unique()

array([1, 0])

In [24]:
data['rating'].value_counts()

1    8000
0    4000
Name: rating, dtype: int64

In [25]:
## Preprocessing and Cleaning

In [26]:
data['rating'] = data['rating'].apply(lambda x: 0 if x<3 else 1)

In [27]:
data['rating']

0        0
1        0
2        0
3        0
4        0
        ..
11995    0
11996    0
11997    0
11998    0
11999    0
Name: rating, Length: 12000, dtype: int64

In [28]:
data.rating.unique()

array([0])

In [29]:
data.rating.value_counts()

0    12000
Name: rating, dtype: int64

In [30]:
data['reviewText'].str.lower()

0        jace rankin may short nothing mess man hauled ...
1        great short read i want put i read one sitting...
2        i start saying first four books i expecting 34...
3        aggie angela lansbury carries pocketbooks inst...
4        i expect type book library pleased find price ...
                               ...                        
11995    valentine cupid vampire jena ian another vampi...
11996    i read seven books series apocalyptic adventur...
11997    this book really cuppa the situation man captu...
11998    tried use charge kindle even register charging...
11999    taking instruction look often hidden world sex...
Name: reviewText, Length: 12000, dtype: object

In [31]:
## Removing Special Characters
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [34]:
data['reviewText'] = data['reviewText'].str.replace('[^a-zA-Z0-9 ]', ' ', regex=True)
## remove the stopwords
data['reviewText'] = data['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in set(stopwords.words('english'))]))
## Remove url
data['reviewText'] = data['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',' ',str(x)))
## remove html tags
data['reviewText'] = data['reviewText'].apply(lambda x: BeautifulSoup(x,'lxml').get_text())
## remoce any additional spaces
data['reviewText'] = data['reviewText'].apply(lambda x:" ".join(x.split()))

In [19]:
data.head()

Unnamed: 0,reviewText,rating
0,Jace Rankin may short nothing mess man hauled ...,1
1,Great short read I want put I read one sitting...,1
2,I start saying first four books I expecting 34...,1
3,Aggie Angela Lansbury carries pocketbooks inst...,1
4,I expect type book library pleased find price ...,1


In [35]:
from nltk.stem import WordNetLemmatizer
wd = WordNetLemmatizer()

In [36]:
def lemmatize_words(text):
    return " ".join([wd.lemmatize(word) for word in text.split()])

In [37]:
data['reviewText'] = data['reviewText'].apply(lambda x:lemmatize_words(x))

In [38]:
## train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(data['reviewText'],data['rating'],test_size=0.20)

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x_train_cv = cv.fit_transform(x_train).toarray()
x_test_cv = cv.transform(x_test).toarray()

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
x_train_tfidf = tfidf.fit_transform(x_train).toarray()
x_test_tfidf = tfidf.transform(x_test).toarray()

In [45]:
x_train_cv.shape

(9600, 25468)

In [46]:
from sklearn.naive_bayes import GaussianNB
nb_model_bow = GaussianNB().fit(x_train_cv,y_train)
nb_model_tfidf = GaussianNB().fit(x_train_tfidf,y_train)


In [47]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


In [48]:
y_pred_bow = nb_model_bow.predict(x_test_cv)
y_pred_tfidf = nb_model_tfidf.predict(x_test_tfidf)

In [49]:
print(classification_report(y_test,y_pred_bow))
print(confusion_matrix(y_test,y_pred_bow))
print(accuracy_score(y_test,y_pred_bow))

              precision    recall  f1-score   support

           1       0.23      0.48      0.31       408
           2       0.21      0.22      0.21       389
           3       0.20      0.19      0.19       381
           4       0.36      0.24      0.29       601
           5       0.50      0.33      0.40       621

    accuracy                           0.29      2400
   macro avg       0.30      0.29      0.28      2400
weighted avg       0.33      0.29      0.29      2400

[[196  83  50  52  27]
 [163  84  59  52  31]
 [147  70  71  55  38]
 [182  81  90 143 105]
 [158  75  93  92 203]]
0.29041666666666666


In [50]:
print(classification_report(y_test,y_pred_tfidf))
print(confusion_matrix(y_test,y_pred_tfidf))
print(accuracy_score(y_test,y_pred_tfidf))

              precision    recall  f1-score   support

           1       0.25      0.44      0.32       408
           2       0.23      0.21      0.22       389
           3       0.22      0.24      0.23       381
           4       0.35      0.27      0.31       601
           5       0.48      0.34      0.40       621

    accuracy                           0.30      2400
   macro avg       0.31      0.30      0.29      2400
weighted avg       0.33      0.30      0.31      2400

[[180  78  54  61  35]
 [140  82  70  63  34]
 [126  60  90  64  41]
 [154  72  93 165 117]
 [128  63  97 122 211]]
0.30333333333333334


In [51]:
!pip install gensim

4012.68s - pydevd: Sending message related to process being replaced timed-out after 5 seconds




In [None]:
import gensim
from gensim.models import word2vec, keyedvectors
# import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')

In [61]:
import gensim

In [76]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [79]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return words

data['tokens'] = data['reviewText'].apply(preprocess)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vaibhavkale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [80]:
from sklearn.model_selection import train_test_split

X_tokens = data['tokens']
y = data['rating']

X_train_tokens, X_test_tokens, y_train, y_test = train_test_split(
    X_tokens, y, test_size=0.2, random_state=42
)


In [81]:
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=X_train_tokens.tolist(),  # only use training tokens
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)


In [82]:
import numpy as np

def avg_word2vec(doc, model):
    vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Convert training and test data separately
X_train_vec = np.vstack(X_train_tokens.apply(lambda doc: avg_word2vec(doc, model)))
X_test_vec = np.vstack(X_test_tokens.apply(lambda doc: avg_word2vec(doc, model)))


In [83]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train_vec, y_train)

print("Test accuracy:", clf.score(X_test_vec, y_test))


Test accuracy: 0.395


In [84]:
y_pred_wd2vec = clf.predict(X_test_vec)

In [85]:
from sklearn.metrics import classification_report,accuracy_score

In [86]:
accuracy_score(y_test,y_pred_wd2vec)

0.395

In [88]:
print(classification_report(y_test,y_pred_wd2vec))

              precision    recall  f1-score   support

           1       0.42      0.45      0.43       404
           2       0.31      0.27      0.29       399
           3       0.24      0.13      0.16       387
           4       0.36      0.45      0.40       587
           5       0.51      0.55      0.53       623

    accuracy                           0.40      2400
   macro avg       0.37      0.37      0.36      2400
weighted avg       0.38      0.40      0.38      2400

