### "Amazon-Alexa" text classification using "Bag of Words" and "TF-IDF"

#### Dataset source: https://www.kaggle.com/datasets/sid321axn/amazon-alexa-reviews

### Import the required libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
#from sklearn.base import TransformerMixin
#from sklearn.pipeline import Pipeline

### Load the input data ("Amazon Alexa reviews" dataset)

In [None]:
# Read the tab-separated Alexa reviews file into a DataFrame
df_amazon = pd.read_csv("amazon_alexa.tsv", delimiter="\t")

In [None]:
# Preview the first five reviews
df_amazon.head(n=5)

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


### Vectorization using a small sample of the dataset

In [None]:
# Work on the first five rows only, so the vectorizer output stays small enough to read
df_amazon_sample = df_amazon.head(n=5)
df_amazon_sample.shape

(5, 5)

In [None]:
# Review texts of the sample as a plain Python list (one string per document)
data_sample = df_amazon_sample["verified_reviews"].tolist()
data_sample

['Love my Echo!',
 'Loved it!',
 'Sometimes while playing a game, you can answer a question correctly but Alexa says you got it wrong and answers the same as you.  I like being able to turn lights on and off while away from home.',
 'I have had a lot of fun with this thing. My 4 yr old learns about dinosaurs, i control the lights and play games like categories. Has nice sound when playing music as well.',
 'Music']

### Steps to follow:
#### 1. Instantiate the vectorizer object
#### 2. Convert the documents into a document-term matrix
#### 3. Get the feature names (the vocabulary learned from the corpus)

In [None]:
# Build one vectorizer of each kind with identical settings:
# word-level tokens, with the built-in English stop-word list applied.
vectorizer_options = {"analyzer": "word", "stop_words": "english"}
countvectorizer = CountVectorizer(**vectorizer_options)
tfidfvectorizer = TfidfVectorizer(**vectorizer_options)

In [None]:
# Learn the vocabulary from the sample and encode it as two document-term matrices
# (raw counts first, then TF-IDF weights)
count_wm, tfidf_wm = (
    vectorizer.fit_transform(data_sample)
    for vectorizer in (countvectorizer, tfidfvectorizer)
)

In [None]:
# Show the bag-of-words matrix as a dense array: one row per sample review,
# one column per vocabulary term, each cell holding that term's count.
count_wm.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
        1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Show the TF-IDF matrix as a dense array: same row/column layout as the
# count matrix, but each cell holds a TF-IDF weight instead of a raw count.
tfidf_wm.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.25860653, 0.25860653, 0.25860653, 0.25860653, 0.25860653,
        0.        , 0.    

In [None]:
pip install -U scikit-learn




In [None]:
# Vocabulary learned by each vectorizer; these arrays are used as the
# column labels of the count and TF-IDF tables built further down.
count_tokens, tfidf_tokens = (
    vectorizer.get_feature_names_out()
    for vectorizer in (countvectorizer, tfidfvectorizer)
)


In [None]:
# Display the vocabulary terms extracted by the count vectorizer.
count_tokens

array(['able', 'alexa', 'answer', 'answers', 'away', 'categories',
       'control', 'correctly', 'dinosaurs', 'echo', 'fun', 'game',
       'games', 'got', 'home', 'learns', 'lights', 'like', 'lot', 'love',
       'loved', 'music', 'nice', 'old', 'play', 'playing', 'question',
       'says', 'sound', 'thing', 'turn', 'wrong', 'yr'], dtype=object)

In [None]:
# Dense copies of both matrices (tfidf_array is consumed by the TF-IDF table cell)
tfidf_array = tfidf_wm.toarray()
bow_array = count_wm.toarray()

# Label the count matrix with its vocabulary: one row per document, one column per term
df = pd.DataFrame(data=bow_array, columns=count_tokens)
df

Unnamed: 0,able,alexa,answer,answers,away,categories,control,correctly,dinosaurs,echo,...,old,play,playing,question,says,sound,thing,turn,wrong,yr
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,0,0,1,0,0,...,0,0,1,1,1,0,0,1,1,0
3,0,0,0,0,0,1,1,0,1,0,...,1,1,1,0,0,1,1,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# TF-IDF weights labelled with their vocabulary terms (one row per document)
df2 = pd.DataFrame(data=tfidf_array, columns=tfidf_tokens)
df2

Unnamed: 0,able,alexa,answer,answers,away,categories,control,correctly,dinosaurs,echo,...,old,play,playing,question,says,sound,thing,turn,wrong,yr
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.258607,0.258607,0.258607,0.258607,0.258607,0.0,0.0,0.258607,0.0,0.0,...,0.0,0.0,0.208642,0.258607,0.258607,0.0,0.0,0.258607,0.258607,0.0
3,0.0,0.0,0.0,0.0,0.0,0.253155,0.253155,0.0,0.253155,0.0,...,0.253155,0.253155,0.204244,0.0,0.0,0.253155,0.253155,0.0,0.0,0.253155
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Data processing and modeling / classification on 'Amazon-Alexa' full dataset

In [None]:
# Drop the rows whose review text is missing
df_amazon = df_amazon.loc[df_amazon["verified_reviews"].notna()]

# Refit both vectorizers on the complete review column (this replaces the sample matrices).
# NOTE(review): the vocabulary is learned here, before the train/test split further down,
# so the held-out reviews influence the features — consider fitting on the training rows only.
count_wm = countvectorizer.fit_transform(df_amazon["verified_reviews"])
tfidf_wm = tfidfvectorizer.fit_transform(df_amazon["verified_reviews"])


In [None]:
# Feature matrix for the classifier: the bag-of-words counts of the full
# dataset as a dense array (one row per review, one column per vocabulary term).
# Only the count features are used for modelling; tfidf_wm is not consumed below.
X = count_wm.toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Target vector: the `feedback` column as a standalone NumPy array
y = df_amazon["feedback"].to_numpy(copy=True)
y

array([1, 1, 1, ..., 1, 1, 1])

### Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for evaluation.
# random_state fixes the shuffle so the split (and every metric computed from it)
# is reproducible across kernel restarts; stratify=y keeps the class ratio of
# `feedback` the same in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
print(f'X_train dimension: {X_train.shape}')
print(f'y_train dimension: {y_train.shape}')
print(f'X_test dimension: {X_test.shape}')
# The original label said "y_train" here although the value printed is y_test.shape
print(f'y_test dimension: {y_test.shape}')

X_train dimension: (2204, 3784)
y_train dimension: (2204,)
X_test dimension: (945, 3784)
y_train dimension: (945,)


### Modeling - Classification

In [None]:
# Fit a logistic-regression classifier (default hyperparameters) on the training split
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out reviews
predicted = classifier.predict(X_test)

In [None]:
from sklearn import metrics

# Score the held-out predictions against the true labels, one metric per line
for report_label, score_fn in (
    ("Logistic Regression Accuracy:", metrics.accuracy_score),
    ("Logistic Regression Precision:", metrics.precision_score),
    ("Logistic Regression Recall:", metrics.recall_score),
):
    print(report_label, score_fn(y_test, predicted))

Logistic Regression Accuracy: 0.9566137566137566
Logistic Regression Precision: 0.9647964796479648
Logistic Regression Recall: 0.989841986455982
