In [None]:
import pandas as pd

# Load the Amazon Alexa reviews; the file is tab-separated
data = pd.read_csv('/content/amazon_alexa.tsv', sep='\t')

# (rows, columns) of the loaded frame
data.shape

(3150, 5)

In [None]:
# Preview the first rows to see the available columns and typical values
data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


In [None]:
# Text-cleaning toolkit: regular expressions plus NLTK's stop-word list
# and Porter stemmer.
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the English stop-word corpus needed by stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Build `corpus`: one cleaned, stemmed string per review, in row order.

# Loop invariants: create the stop-word set and the stemmer once instead of
# rebuilding them for every word / every review.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []

# Iterate over the column itself so the loop always matches the number of
# rows that were actually loaded (no hard-coded row count).
for raw_review in data['verified_reviews']:
    if isinstance(raw_review, str):
        review = re.sub('[^a-zA-Z]', ' ', raw_review)  # keep letters only
        review = review.lower().split()                # lower-case, then tokenize
        # Drop stop words and stem whatever is left
        review = [ps.stem(word) for word in review if word not in stop_words]
        corpus.append(' '.join(review))
    else:
        # Missing / non-string review: keep an empty placeholder so that
        # `corpus` stays aligned row-for-row with `data`.
        corpus.append("")

# `corpus` now holds the cleaned text, one entry per row of `data`


In [None]:
# Peek at the first few cleaned reviews rather than echoing the whole list
corpus[:10]

['love echo',
 'love',
 'sometim play game answer question correctli alexa say got wrong answer like abl turn light away home',
 'lot fun thing yr old learn dinosaur control light play game like categori nice sound play music well',
 'music',
 'receiv echo gift need anoth bluetooth someth play music easili access found smart speaker wait see els',
 'without cellphon cannot use mani featur ipad see use great alarm u r almost deaf hear alarm bedroom live room reason enough keep fun ask random question hear respons seem smartbon polit yet',
 'think th one purchas work get one everi room hous realli like featur offer specifili play music echo control light throughout hous',
 'look great',
 'love listen song heard sinc childhood get news weather inform great',
 'sent year old dad talk constantli',
 'love learn knew thing eveyday still figur everyth work far easi use understand make laugh time',
 'purchas mother knee problem give someth tri come get around fast like enjoy littl big thing ale

In [None]:
# Extracting the features using the TF-IDF vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 2500 terms
tf = TfidfVectorizer(max_features = 2500)

# NOTE(review): the vectorizer is fitted on every review before the
# train/test split, so its vocabulary and IDF weights also see the test rows.
x = tf.fit_transform(corpus).toarray()

# Select the target by column name rather than by position, so a change in
# column order cannot silently pick the wrong label.
y = data['feedback'].values

print(x.shape)
print(y.shape)

(3150, 2500)
(3150,)


In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=15
)

# Report the shape of each piece: train features/labels, then test features/labels
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)


(2205, 2500)
(2205,)
(945, 2500)
(945,)


In [None]:
# Support Vector Machine
from sklearn.svm import SVC

# SVC() is created with no arguments (library defaults) and fitted on the
# training split.
model = SVC()
model.fit(x_train, y_train)



In [None]:
# Mean accuracy of the fitted model on each split
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)

print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

Training Accuracy : 0.982766439909297
Testing Accuracy : 0.9291005291005291


In [None]:
# Predict labels for the held-out test split with the fitted model
y_pred = model.predict(x_test)

In [None]:
# Model evaluation: confusion matrix (the classification report is printed separately)
from sklearn.metrics import classification_report, confusion_matrix

# Rows are the true labels, columns are the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[  8  67]
 [  0 870]]


In [None]:
# Per-class precision, recall and F1 for the test-set predictions
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.11      0.19        75
           1       0.93      1.00      0.96       870

    accuracy                           0.93       945
   macro avg       0.96      0.55      0.58       945
weighted avg       0.93      0.93      0.90       945



In [None]:
# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the same split; fit() hands back the estimator itself
model = MultinomialNB().fit(x_train, y_train)

# Mean accuracy on each split
train_accuracy = model.score(x_train, y_train)
test_accuracy = model.score(x_test, y_test)

print("Training Accuracy :", train_accuracy)
print("Testing Accuracy :", test_accuracy)

Training Accuracy : 0.9170068027210885
Testing Accuracy : 0.9185185185185185


In [None]:
# Predict labels for the held-out test split
y_pred = model.predict(x_test)

# Show how many test rows were assigned to each class instead of echoing
# the entire prediction array.
pd.Series(y_pred).value_counts()

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# Model evaluation: confusion matrix (the classification report is printed separately)
from sklearn.metrics import classification_report, confusion_matrix

# Rows are the true labels, columns are the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[  0  75]
 [  2 868]]


In [None]:
# Per-class precision, recall and F1 for the test-set predictions
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        75
           1       0.92      1.00      0.96       870

    accuracy                           0.92       945
   macro avg       0.46      0.50      0.48       945
weighted avg       0.85      0.92      0.88       945



In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Seed for the estimators that draw random numbers while fitting, so that
# re-running the notebook reproduces the same scores.
MODEL_SEED = 15

# Candidate classifiers, keyed by the label used in the printed report.
# Everything except the seed is left at the library defaults
# (LogisticRegression only gets a larger iteration budget).
models = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_SEED),
    "AdaBoost": AdaBoostClassifier(random_state=MODEL_SEED),
}

# Fit every candidate on the training split and report test-set accuracy.
# NOTE(review): the test labels are heavily skewed towards class 1, so
# accuracy alone says little about how well class 0 is recognised.
for model_name, model in models.items():
    # Train the model
    model.fit(x_train, y_train)

    # Make predictions
    y_pred = model.predict(x_test)

    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")


SVM Accuracy: 92.91%
Random Forest Accuracy: 93.65%
Logistic Regression Accuracy: 92.17%
Naive Bayes Accuracy: 91.85%
K-Nearest Neighbors Accuracy: 92.06%
Decision Tree Accuracy: 91.96%
Gradient Boosting Accuracy: 93.44%
AdaBoost Accuracy: 92.17%


Implementation with bag of words

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Make sure the NLTK stop-word corpus is available locally
import nltk
nltk.download('stopwords')

# Read the tab-separated reviews file
data = pd.read_csv('/content/amazon_alexa.tsv', sep='\t')

# Shared helpers read by preprocess(): the English stop-word set and a Porter stemmer
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Preprocess the reviews
def preprocess(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    Every character outside a-z / A-Z is replaced by a space, the text is
    lower-cased and split on whitespace, tokens found in the notebook-level
    ``stop_words`` set are dropped, and the remaining tokens are stemmed
    with the notebook-level ``ps`` stemmer.

    Parameters
    ----------
    text : object
        The raw review. Anything that is not a ``str`` (e.g. a missing
        value) is treated as an empty review.

    Returns
    -------
    str
        The cleaned review, or ``""`` for non-string input.
    """
    if isinstance(text, str):
        tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        stems = [ps.stem(token) for token in tokens if token not in stop_words]
        return ' '.join(stems)
    return ""

# Apply preprocessing
data['processed_reviews'] = data['verified_reviews'].apply(preprocess)

# Features and target
X = data['processed_reviews']
y = data['feedback']  # Replace with the actual column name for sentiment (binary label)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Bag of words

In [None]:
# Bag-of-words counts, with the vocabulary capped at 5000 terms
vectorizer = CountVectorizer(max_features=5000)

# Fit on the cleaned reviews and turn the sparse count matrix into a dense array
X_bow = vectorizer.fit_transform(X).toarray()


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_bow,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Seed for the estimators that draw random numbers while fitting, so that
# re-running the notebook reproduces the same scores.
MODEL_SEED = 42

# Candidate classifiers, keyed by the label used in the printed report.
# Everything except the seed is left at the library defaults
# (LogisticRegression only gets a larger iteration budget).
models = {
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_SEED),
    "AdaBoost": AdaBoostClassifier(random_state=MODEL_SEED),
}

# Fit every candidate on the bag-of-words training split and report
# test-set accuracy.
for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Accuracy: {accuracy * 100:.2f}%")


SVM Accuracy: 91.59%
Random Forest Accuracy: 94.13%
Logistic Regression Accuracy: 93.33%
Naive Bayes Accuracy: 91.59%
K-Nearest Neighbors Accuracy: 91.11%
Decision Tree Accuracy: 92.22%
Gradient Boosting Accuracy: 91.59%
AdaBoost Accuracy: 91.75%
