In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore")

In [None]:
# load the dataset
import pandas as pd
df=pd.read_csv("all_kindle_review.csv")
df

In [None]:
df.columns

In [None]:
df=df[["reviewText", "rating"]]
df.head()

In [None]:
df.shape

In [None]:
#check for missing values
df.isnull().sum()

In [None]:
df["rating"].unique()

In [None]:
df["rating"].value_counts()

In [None]:
#Preprocessing and cleaning

In [None]:
#positive review is 1 and negative review is 0

df.loc[df["rating"]<=3,"rating"] = 0
df.loc[df["rating"]>=3,"rating"] = 1
# df.loc[:, "rating"] = df["rating"].apply(lambda x: 0 if x < 3 else 1)

In [None]:
df.head()

In [None]:
df["rating"].unique()

In [None]:
# to check how many counts are there for positive and negative sentiments
df["rating"].value_counts()

In [None]:
## lower all the cases 
df["reviewText"].str.lower()

In [None]:
df.loc[:, "reviewText"] = df["reviewText"].str.lower()

In [None]:
df.head()

### Cleaning the data


In [None]:
import nltk
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
nltk.download("stopwords")

In [None]:
# Function to clean review text
def clean_text(text):
    # Remove special characters
    text = re.sub('[^a-zA-Z0-9-]+', ' ', text)
    # Remove URLs
    text = re.sub(r'(http|https|ftp|ssh)://\S+', '', text)
    # Remove HTML tags
    text = BeautifulSoup(text, 'lxml').get_text()
    # Remove stopwords
    text = " ".join([word for word in text.split() if word.lower() not in stopwords.words('english')])
    # Remove any additional spaces
    text = " ".join(text.split())
    
    return text

# Apply the function to the reviewText column
df['reviewText'] = df['reviewText'].apply(clean_text)


In [None]:
df.head()

In [None]:
##lemmatizer
from nltk.stem import WordNetLemmatizer

In [None]:
lemmatizer=WordNetLemmatizer()

In [None]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word)for word in text.split()])
df.loc[:,'reviewText'] = df['reviewText'].apply(lemmatize_words)   

In [None]:
df.head()

In [None]:
consolidated=' '.join(word for word in df['rating'][df['reviewText']==0].astype(str))
wordCloud=WordCloud(width=1600,height=800,random_state=21,max_font_size=110)
plt.figure(figsize=(15,10))
plt.imshow(wordCloud.generate(consolidated),interpolation='bilinear')
plt.axis('off')
plt.show(

In [33]:
#train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(df["reviewText"], df["rating"],test_size=0.2)

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow=bow.fit_transform(X_train).toarray()
X_test_bow=bow.transform(X_test).toarray()

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=CountVectorizer()
X_train_tfidf=tfidf.fit_transform(X_train).toarray()
X_test_tfidf=tfidf.transform(X_test).toarray()

In [56]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [57]:
X_train_tfidf

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [58]:
from sklearn.naive_bayes import GaussianNB
nb_model_bow=GaussianNB().fit(X_train_bow,y_train)
nb_model_tfidf=GaussianNB().fit(X_train_tfidf,y_train)

In [59]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [60]:
y_pred_tfidf=nb_model_bow.predict(X_test_bow)

In [50]:
y_pred_bow=nb_model_tfidf.predict(X_test_tfidf)

In [66]:
print("BOW accuracy:",accuracy_score(y_test,y_pred_bow))
print("BOW confusion matrix:",confusion_matrix(y_test,y_pred_bow))
# print("BOW classification report:",classification_report(y_test,y_pred_bow))

BOW accuracy: 0.60875
BOW confusion matrix: [[563 246]
 [693 898]]


In [67]:
print("TFIDF accuracy:",accuracy_score(y_test,y_pred_tfidf))
print("TFIDF confusion matrix:",confusion_matrix(y_test,y_pred_tfidf))

TFIDF accuracy: 0.60875
TFIDF confusion matrix: [[563 246]
 [693 898]]
