### "Amazon-Alexa" text classification using "Bag of Words" and "TF-IDF" (with spaCy and sklearn pipelines)

#### https://www.kaggle.com/datasets/sid321axn/amazon-alexa-reviews

In [None]:


!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


### Import the required libraries

In [None]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

### Load the input data (the "Amazon Alexa reviews" dataset)

In [None]:
# Read the tab-separated Alexa review file into a DataFrame
df_amazon = pd.read_csv(
    "amazon_alexa.tsv",
    sep="\t",
)

In [None]:
# Preview the first five records to check which columns were loaded
df_amazon.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [None]:
import string
import spacy

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Punctuation characters that the tokenizer filters out
punctuations = string.punctuation

# Load the small English spaCy pipeline. The tokenizer below only reads token
# lemmas, so the dependency parser and the named-entity recognizer are switched
# off; this makes every nlp() call cheaper without changing the lemmas.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stop-word set used by the tokenizer (the STOP_WORDS object imported above)
stop_words = STOP_WORDS

In [None]:
# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Turn a sentence into a list of cleaned, lemmatized tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    token is replaced by its lowercased, stripped lemma; lemmas that are stop
    words (``stop_words``) or that consist solely of punctuation characters
    (``punctuations``) are dropped.

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: The remaining lemmas, in document order.
    """
    doc = nlp(sentence)

    # Older spaCy releases (v2) used the placeholder '-PRON-' as the lemma of
    # personal pronouns; in that case fall back to the lowercased token text.
    # Newer releases never produce it, so the guard is harmless there.
    lemmas = [
        token.lower_ if token.lemma_ == "-PRON-" else token.lemma_.lower().strip()
        for token in doc
    ]

    # `punctuations` is a plain string, so `lemma in punctuations` would be a
    # substring test and would let runs such as "..." or "!!" through. Checking
    # every character instead drops any all-punctuation lemma. An empty lemma
    # (e.g. from a whitespace token) is dropped too, because all() of an empty
    # sequence is True.
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words and not all(ch in punctuations for ch in lemma)
    ]

In [None]:
from sklearn.base import TransformerMixin

# Custom transformer using spaCy
class predictors(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that normalises raw text before vectorization.

    Inheriting from ``BaseEstimator`` (listed after the mixin) supplies
    ``get_params`` and ``set_params``, so the step supports the full estimator
    API instead of a hand-written ``get_params`` stub. The class takes no
    constructor arguments, so ``get_params()`` still returns an empty dict.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; this step learns nothing from the data."""
        return self

    def transform(self, X, **transform_params):
        """Apply ``clean_text`` to every item of ``X``.

        Args:
            X: Iterable of texts (each item is passed through ``clean_text``).

        Returns:
            list[str]: One cleaned string per input item, in the same order.
        """
        return [clean_text(text) for text in X]

# Basic function to clean the text
def clean_text(text):
    """Coerce ``text`` to ``str``, trim surrounding whitespace and lowercase it."""
    return str(text).strip().lower()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# Both vectorizers delegate tokenization to spacy_tokenizer. Passing
# token_pattern=None states that the default regex pattern is intentionally
# unused, which avoids scikit-learn's "token_pattern will not be used"
# UserWarning that is emitted when a custom tokenizer is supplied.
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, token_pattern=None, ngram_range=(1, 1))
tfidf_vector = TfidfVectorizer(tokenizer=spacy_tokenizer, token_pattern=None)

In [None]:
from sklearn.model_selection import train_test_split

X = df_amazon['verified_reviews']  # the review text we want to analyze
ylabels = df_amazon['feedback']  # the labels we want to predict

# A fixed random_state makes the split (and every metric computed from it)
# reproducible across runs; stratify keeps the ratio of `feedback` labels the
# same in the training and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, ylabels, test_size=0.3, random_state=42, stratify=ylabels
)
print(f'X_train dimension: {X_train.shape}')
print(f'y_train dimension: {y_train.shape}')
print(f'X_test dimension: {X_test.shape}')
print(f'y_test dimension: {y_test.shape}')

X_train dimension: (2205,)
y_train dimension: (2205,)
X_test dimension: (945,)
y_train dimension: (945,)


In [None]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Bag-of-words pipeline: clean the text -> count spaCy lemmas -> logistic regression
pipe = Pipeline(
    steps=[
        ("cleaner", predictors()),
        ("vectorizer", bow_vector),
        ("classifier", classifier),
    ]
)

# Fit every stage of the pipeline on the training split
pipe.fit(X_train, y_train)



In [None]:
from sklearn import metrics
# Predict labels for the held-out reviews and print the pipeline's own score
predicted = pipe.predict(X_test)
print(pipe.score(X_test, y_test))

# Report accuracy, precision and recall of the predictions against y_test
for _label, _metric in (
    ("Logistic Regression Accuracy:", metrics.accuracy_score),
    ("Logistic Regression Precision:", metrics.precision_score),
    ("Logistic Regression Recall:", metrics.recall_score),
):
    print(_label, _metric(y_test, predicted))

0.9248677248677248
Logistic Regression Accuracy: 0.9248677248677248
Logistic Regression Precision: 0.9274725274725275
Logistic Regression Recall: 0.9941107184923439


In [None]:
import string
import spacy

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Punctuation characters that the tokenizer filters out
punctuations = string.punctuation

# Load the small English spaCy pipeline. The functions in this notebook only
# read token lemmas, so the dependency parser and the named-entity recognizer
# are switched off; this makes every nlp() call cheaper without changing the lemmas.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Stop-word set used when filtering lemmas (the STOP_WORDS object imported above)
stop_words = STOP_WORDS

def test(sentence):
    """Debug variant of the tokenizer that prints its intermediate results.

    Runs ``sentence`` through the module-level spaCy pipeline ``nlp``, prints
    the parsed document, prints the list of lowercased lemmas, and returns the
    lemmas that are neither stop words (``stop_words``) nor made up solely of
    punctuation characters (``punctuations``).

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: The remaining lemmas, in document order.
    """
    doc = nlp(sentence)
    print(doc)

    # Older spaCy releases (v2) used the placeholder '-PRON-' as the lemma of
    # personal pronouns; in that case fall back to the lowercased token text.
    lemmas = [
        token.lower_ if token.lemma_ == "-PRON-" else token.lemma_.lower().strip()
        for token in doc
    ]
    print(lemmas)

    # `punctuations` is a plain string, so `lemma in punctuations` would be a
    # substring test and would let runs such as "..." or "!!" through. Checking
    # every character drops any all-punctuation lemma, and also empty lemmas
    # (all() of an empty sequence is True).
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words and not all(ch in punctuations for ch in lemma)
    ]

In [None]:
# Try the debug tokenizer on a sample sentence: it prints the parsed document
# and the lemma list, and the cell displays the returned (filtered) tokens.
test('This is the sentence to work')

This is the sentence to work
['this', 'be', 'the', 'sentence', 'to', 'work']


['sentence', 'work']

In [None]:
from sklearn.base import TransformerMixin

# Custom transformer using spaCy
class predictors2(TransformerMixin, BaseEstimator):
    """Stateless pipeline step that normalises raw text before vectorization.

    The class defines no ``get_params`` of its own; inheriting from
    ``BaseEstimator`` (listed after the mixin) supplies ``get_params`` and
    ``set_params``, which scikit-learn needs in order to clone or
    reconfigure a pipeline containing this step.
    """

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; this step learns nothing from the data."""
        return self

    def transform(self, X, **transform_params):
        """Apply ``clean_text`` to every item of ``X``.

        Args:
            X: Iterable of texts (each item is passed through ``clean_text``).

        Returns:
            list[str]: One cleaned string per input item, in the same order.
        """
        return [clean_text(text) for text in X]

# Basic function to clean the text
def clean_text(text):
    """Return ``text`` as a lowercase string without leading/trailing whitespace.

    Non-string inputs are converted with ``str()`` first.
    """
    trimmed = str(text).strip()
    return trimmed.lower()

In [None]:
# Quick check: the trailing space is stripped and the text is lowercased
clean_text('This is the 4 sentence ')

'this is the 4 sentence'

In [None]:
# Same three stages as before, but with predictors2 as the text cleaner.
# NOTE(review): bow_vector and classifier are the same objects that were passed
# to `pipe`; presumably fitting here refits them for `pipe` as well - confirm.
pipe2 = Pipeline(
    steps=[
        ("cleaner", predictors2()),
        ("vectorizer", bow_vector),
        ("classifier", classifier),
    ]
)

# Fit every stage of the pipeline on the training split
pipe2.fit(X_train, y_train)



In [None]:
from sklearn import metrics
# Predicting with a test dataset, using pipe2 (the pipeline fitted in the
# previous cell) rather than the earlier `pipe`
predicted = pipe2.predict(X_test)
print(pipe2.score(X_test, y_test))

# Accuracy, precision and recall of pipe2's predictions against y_test
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

0.9248677248677248
Logistic Regression Accuracy: 0.9248677248677248
Logistic Regression Precision: 0.9274725274725275
Logistic Regression Recall: 0.9941107184923439
