CB.EN.U4CSE20211


In [None]:
import pandas as pd
import nltk 
nltk.download('stopwords')                 # download the stopwords from NLTK
                                 # library for regular expression operations
import string                              # for string operations

from nltk.corpus import stopwords          # module for stop words tht come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

from sklearn.linear_model import LogisticRegression  
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt            # library for visualization
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import spacy
import pickle

import plotly.express as px
import re
import numpy as np
from wordcloud import WordCloud, STOPWORDS

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1. Read the dataset

In [None]:
# Load the Amazon Alexa reviews CSV into the working DataFrame.
reviews_csv_path = '/content/amazon_alexa_data.csv'
df = pd.read_csv(reviews_csv_path)

2. Handle (remove) null values, if any.


In [None]:
# Report, per column, whether any value is missing.
df.isna().any()

Unnamed: 0          False
rating              False
date                False
variation           False
verified_reviews    False
feedback            False
dtype: bool

3. Preprocess the Amazon Alexa reviews based on the following parameters:


In [None]:
# Lower-case every review and keep the result in a separate column.
lowercase_reviews = df['verified_reviews'].str.lower()
df['new_reviews'] = lowercase_reviews
df['new_reviews']

0                                           love my echo!
1                                               loved it!
2       sometimes while playing a game, you can answer...
3       i have had a lot of fun with this thing. my 4 ...
4                                                   music
                              ...                        
3145    perfect for kids, adults and everyone in betwe...
3146    listening to music, searching locations, check...
3147    i do love these things, i have them running my...
3148    only complaint i have is that the sound qualit...
3149                                                 good
Name: new_reviews, Length: 3150, dtype: object

In [None]:
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` removed."""
    # A deletion table drops all punctuation characters in a single pass.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)
# Strip punctuation from the lower-cased reviews. The pattern has to be applied
# as a regular expression: with literal matching (the default in pandas >= 2.0)
# the text '[^\w\s]' never occurs in a review and nothing would be removed.
# Chaining on 'new_reviews' keeps the lower-casing from the previous step
# instead of overwriting it with the original-case text.
df['new_reviews'] = df['new_reviews'].str.replace(r'[^\w\s]', '', regex=True)
df['new_reviews']

  """


0                                            Love my Echo
1                                                Loved it
2       Sometimes while playing a game you can answer ...
3       I have had a lot of fun with this thing My 4 y...
4                                                   Music
                              ...                        
3145      Perfect for kids adults and everyone in between
3146    Listening to music searching locations checkin...
3147    I do love these things i have them running my ...
3148    Only complaint I have is that the sound qualit...
3149                                                 Good
Name: new_reviews, Length: 3150, dtype: object

In [None]:
def process_rev(rev):
    """Turn a raw review into a list of stemmed, filtered tokens.

    Input:
        rev: a string containing a review
    Output:
        rev_clean: a list of stemmed words; English stop words and tokens
            found in string.punctuation are left out
    """
    porter = PorterStemmer()
    english_stopwords = stopwords.words('english')
    # Tokenizer configured to ignore case, strip handles and reduce lengthened words.
    tweet_tokenizer = TweetTokenizer(
        preserve_case=False, strip_handles=True, reduce_len=True
    )

    def keep(token):
        # A token survives only if it is neither a stop word nor punctuation.
        return not (token in english_stopwords or token in string.punctuation)

    return [
        porter.stem(token)
        for token in tweet_tokenizer.tokenize(rev)
        if keep(token)
    ]

In [None]:
# Run process_rev over every review, which performs:
# 1. Removing stop words
# 2. Tokenization
# 3. Stemming
# The column is overwritten in place, so only string values are processed:
# re-running this cell once the reviews are already token lists would
# otherwise hand lists to process_rev, which expects a string.
df['verified_reviews'] = df['verified_reviews'].apply(
    lambda rev: process_rev(rev) if isinstance(rev, str) else rev
)
df

Unnamed: 0.1,Unnamed: 0,rating,date,variation,verified_reviews,feedback,new_reviews
0,0,5,31-Jul-18,Charcoal Fabric,"[love, echo]",1,Love my Echo
1,1,5,31-Jul-18,Charcoal Fabric,[love],1,Loved it
2,2,4,31-Jul-18,Walnut Finish,"[sometim, play, game, answer, question, correc...",1,Sometimes while playing a game you can answer ...
3,3,5,31-Jul-18,Charcoal Fabric,"[lot, fun, thing, 4, yr, old, learn, dinosaur,...",1,I have had a lot of fun with this thing My 4 y...
4,4,5,31-Jul-18,Charcoal Fabric,[music],1,Music
...,...,...,...,...,...,...,...
3145,3145,5,30-Jul-18,Black Dot,"[perfect, kid, adult, everyon]",1,Perfect for kids adults and everyone in between
3146,3146,5,30-Jul-18,Black Dot,"[listen, music, search, locat, check, time, lo...",1,Listening to music searching locations checkin...
3147,3147,5,30-Jul-18,Black Dot,"[love, thing, run, entir, home, tv, light, the...",1,I do love these things i have them running my ...
3148,3148,5,30-Jul-18,White Dot,"[complaint, sound, qualiti, great, mostli, use...",1,Only complaint I have is that the sound qualit...


4. Transform the words into vectors using Count Vectorizer 

In [None]:
cv = CountVectorizer(max_features=1500, analyzer='word', lowercase=False)
# Join each token list back into one space-separated string. Values that are
# already strings are left untouched: " ".join on a string would insert a
# space between every character if this cell were executed a second time.
df['verified_reviews'] = df['verified_reviews'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)
X = cv.fit_transform(df['verified_reviews'])  # predictor variable 'X'
df

Unnamed: 0.1,Unnamed: 0,rating,date,variation,verified_reviews,feedback,new_reviews
0,0,5,31-Jul-18,Charcoal Fabric,love echo,1,Love my Echo
1,1,5,31-Jul-18,Charcoal Fabric,love,1,Loved it
2,2,4,31-Jul-18,Walnut Finish,sometim play game answer question correctli al...,1,Sometimes while playing a game you can answer ...
3,3,5,31-Jul-18,Charcoal Fabric,lot fun thing 4 yr old learn dinosaur control ...,1,I have had a lot of fun with this thing My 4 y...
4,4,5,31-Jul-18,Charcoal Fabric,music,1,Music
...,...,...,...,...,...,...,...
3145,3145,5,30-Jul-18,Black Dot,perfect kid adult everyon,1,Perfect for kids adults and everyone in between
3146,3146,5,30-Jul-18,Black Dot,listen music search locat check time look weat...,1,Listening to music searching locations checkin...
3147,3147,5,30-Jul-18,Black Dot,love thing run entir home tv light thermostat ...,1,I do love these things i have them running my ...
3148,3148,5,30-Jul-18,White Dot,complaint sound qualiti great mostli use comma...,1,Only complaint I have is that the sound qualit...


5. Split data into training and test data.

In [None]:
# Response variable 'y': the feedback label as a one-column DataFrame.
y = df['feedback'].to_frame()
y.head()

Unnamed: 0,feedback
0,1
1,1
2,1
3,1
4,1


In [None]:
# Hold out 20% of the reviews for testing. The seed is fixed (same value as the
# rating-based split further down) so the split, and every metric computed
# from it, is reproducible across kernel restarts.
x_train,x_test,y_train,y_test = train_test_split(
    df['verified_reviews'], df['feedback'], test_size=0.2, random_state=0
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training reviews only, then encode those same
# reviews as a document-term count matrix.
cv = CountVectorizer().fit(x_train)
x_train_count = cv.transform(x_train)

In [None]:
# Show the shape of each of the four splits, one per line.
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(2520,)
(2520,)
(630,)
(630,)


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training count matrix;
# fit() returns the estimator itself, which is displayed as the cell output.
model = MultinomialNB().fit(x_train_count, y_train)
model

MultinomialNB()

In [None]:
# Encode the held-out reviews with the vocabulary fitted on the training set,
# then predict their feedback labels.
data_list = x_test.tolist()
data_test = cv.transform(data_list)
predict = model.predict(data_test)

In [None]:
# Display the labels predicted for the test reviews.
predict

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
from sklearn.metrics import confusion_matrix

# confusion_matrix expects (y_true, y_pred). With the arguments in that order
# the rows are the actual feedback labels and the columns the predicted ones;
# swapping them transposes the matrix and mixes up false positives/negatives.
cm = confusion_matrix(y_test, predict)
cm


array([[ 18,   6],
       [ 41, 565]])

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

# Fraction of test reviews whose predicted label equals the true label.
nb_accuracy = accuracy_score(y_test, predict)
nb_accuracy

0.9253968253968254

In [None]:
# Logistic regression that predicts feedback from the star rating alone.
# NOTE(review): the perfect scores reported for this model suggest 'feedback'
# may be derived directly from 'rating' — confirm before comparing it with the
# text-based classifier.
X = np.array(df['rating']).reshape(-1,1)   # single feature as a 2-D column
# The target stays 1-D: passing a column vector makes scikit-learn emit a
# DataConversionWarning when fitting.
Y = np.array(df['feedback'])

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)

modelLogistic = LogisticRegression()
modelLogistic.fit(x_train,y_train)

print("The intercept b0= ", modelLogistic.intercept_)

print("The coefficient b1= ", modelLogistic.coef_)

The intercept b0=  [-13.31264046]
The coefficient b1=  [[5.42434842]]


  y = column_or_1d(y, warn=True)


In [None]:
# y_pred has to be computed here: nothing earlier in the notebook defines it,
# so on a fresh kernel this cell would raise NameError (or, after out-of-order
# execution, silently score another model's predictions).
y_pred = modelLogistic.predict(x_test)
ConfusionMatrix = confusion_matrix(y_test, y_pred)
print(ConfusionMatrix)

[[ 71   0]
 [  0 717]]


In [None]:
# ROC AUC is computed from the predicted probability of the positive class
# rather than from hard 0/1 labels, and taken from the logistic model directly
# so the cell does not depend on a y_pred left over from another cell.
roc_auc_score(y_test, modelLogistic.predict_proba(x_test)[:, 1])

1.0

In [None]:
# Rebuild the rating -> feedback split for the KNN model from the DataFrame.
X = np.array(df['rating']).reshape(-1,1)
# Keep the target 1-D: a column vector makes scikit-learn estimators emit a
# DataConversionWarning when fitting.
Y = np.array(df['feedback'])
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.25, random_state = 0)



In [None]:

from sklearn.preprocessing import StandardScaler

# Standardise the feature with statistics learned from the training split only,
# then apply the same transformation to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)



In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit an 8-nearest-neighbours classifier on the training split and predict
# labels for the test split.
classifier = KNeighborsClassifier(n_neighbors=8).fit(x_train, y_train)
y_pred = classifier.predict(x_test)
y_pred



  return self._fit(X, y)


array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix for y_pred against the true test labels
# (rows: true labels, columns: predicted labels).
knn_confusion = confusion_matrix(y_test, y_pred)
print(knn_confusion)


[[ 71   0]
 [  0 717]]


In [None]:
# Per-class precision, recall and F1 for the predicted labels.
print(classification_report(y_test, y_pred))
from sklearn.metrics import accuracy_score
# Score ROC AUC on the predicted probability of the positive class; hard 0/1
# predictions collapse the ROC curve to a single threshold.
roc_auc_score(y_test, classifier.predict_proba(x_test)[:, 1])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        71
           1       1.00      1.00      1.00       717

    accuracy                           1.00       788
   macro avg       1.00      1.00      1.00       788
weighted avg       1.00      1.00      1.00       788



1.0

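From the above analysis, I infer that both the logistic regression and KNN models achieve higher accuracy than the Multinomial Naïve Bayes classification model.
Note, however, that the logistic regression and KNN models were trained on the `rating` column, whereas the Naïve Bayes model was trained on the review text, so the comparison is not like-for-like.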