In [1]:
# import libraries

import pandas as pd
import numpy as np

In [2]:
# Read data

data = pd.read_csv('Amazon_customer_Reviews.csv')
data.drop('Unnamed: 0',axis=1,inplace=True)
data.head()

Unnamed: 0,Ratings,Reviews
0,Good,"I use Amazon.com often, and 99% of the time, ..."
1,Good,With Amazon you can leisurely shop in the com...
2,Good,I am very happy with all my purchases since I...
3,Good,"I absolutely love Amazon. Their selections, pr..."
4,Good,We have shopped Amazon for years and always fi...


In [3]:
#Data cleaning and preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()

In [4]:
# Function for preprocessing
def preprocessing(data):    
    
    reviews =  []
    for i in range(0, len(data)):
        # Replacing values in rows
        review = data['Reviews'][i]
        review = review.replace('$','dollars')
    
        # keeping only text and numbers in reviews
        review = re.sub('%', ' percent', review)
        review = re.sub('[^a-zA-Z0-9/]', ' ', review)
        review = review.lower()
        review = review.split()
    
        # Removing Stopwords
        review = [wordnet.lemmatize(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
    
        reviews.append(review)
    
    return reviews
        
reviews_1 = preprocessing(data)

<h1>TF-IDF</h3>

In [5]:
# TFIDF using TFIDFVectorizer    
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(reviews_1).toarray()

#dummyfying output variable
y=pd.get_dummies(data['Ratings'],drop_first=True)

In [6]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

<h3>Naive Bayes Classifier</h3>

In [11]:
# Training model using Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB(alpha=0.01).fit(X_train, y_train)

y_pred=nb_model.predict(X_test)

  return f(**kwargs)


In [12]:
# Training Evaluation

y_pred_t = nb_model.predict(X_train)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_train = confusion_matrix(y_pred_t, y_train)
print("Confusion Matrix:")
print(cm_train)

report_train = classification_report(y_pred_t, y_train)
print("Classification Report:",)
print (report_train)

accuracy_train = accuracy_score(y_pred_t,y_train)
print("Accuracy:",accuracy_train)

Confusion Matrix:
[[3654  111]
 [  54 1477]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3765
           1       0.93      0.96      0.95      1531

    accuracy                           0.97      5296
   macro avg       0.96      0.97      0.96      5296
weighted avg       0.97      0.97      0.97      5296

Accuracy: 0.9688444108761329


In [13]:
# Testing Evaluation Evaluation

y_pred = nb_model.predict(X_test)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_test = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:")
print(cm_test)

report_test = classification_report(y_pred, y_test)
print("Classification Report:",)
print (report_test)

accuracy_test = accuracy_score(y_pred,y_test)
print("Accuracy:",accuracy_test)

Confusion Matrix:
[[928  79]
 [ 18 300]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.92      0.95      1007
           1       0.79      0.94      0.86       318

    accuracy                           0.93      1325
   macro avg       0.89      0.93      0.91      1325
weighted avg       0.94      0.93      0.93      1325

Accuracy: 0.9267924528301886


In [14]:
# Hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV

# Create the random grid
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0 ]}



# create classifier
nb_model= MultinomialNB()
nb_randomcv = RandomizedSearchCV(estimator=nb_model,param_distributions=params,n_iter=300,cv=10,verbose=2,
                               random_state=100,n_jobs=-1)

### fit the randomized model
nb_randomcv.fit(X_train,y_train)

# Provides best parameters
nb_randomcv.best_params_


best_random_grid=nb_randomcv.best_estimator_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.6min finished
  return f(**kwargs)


In [15]:
# Provides best parameters
print(nb_randomcv.best_params_)

{'alpha': 0.1}


<h3>Linear Classifier</h3>

In [16]:
# Training model using Logistic regression
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression().fit(X_train, y_train)

y_pred=log_model.predict(X_test)

  return f(**kwargs)


In [17]:
# Training Evaluation

y_pred_t = log_model.predict(X_train)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_train = confusion_matrix(y_pred_t, y_train)
print("Confusion Matrix:")
print(cm_train)

report_train = classification_report(y_pred_t, y_train)
print("Classification Report:",)
print (report_train)

accuracy_train = accuracy_score(y_pred_t,y_train)
print("Accuracy:",accuracy_train)

Confusion Matrix:
[[3656  218]
 [  52 1370]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.94      0.96      3874
           1       0.86      0.96      0.91      1422

    accuracy                           0.95      5296
   macro avg       0.92      0.95      0.94      5296
weighted avg       0.95      0.95      0.95      5296

Accuracy: 0.9490181268882175


In [18]:
# Testing Evaluation Evaluation

y_pred = log_model.predict(X_test)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_test = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:")
print(cm_test)

report_test = classification_report(y_pred, y_test)
print("Classification Report:",)
print (report_test)

accuracy_test = accuracy_score(y_pred,y_test)
print("Accuracy:",accuracy_test)

Confusion Matrix:
[[932  77]
 [ 14 302]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1009
           1       0.80      0.96      0.87       316

    accuracy                           0.93      1325
   macro avg       0.89      0.94      0.91      1325
weighted avg       0.94      0.93      0.93      1325

Accuracy: 0.9313207547169812


<h3>Support Vector Machine

In [19]:
# Training model using Support Vector Machine

from sklearn import svm
svm_model = svm.SVC()
svm_model.fit(X_train,y_train)

y_pred=svm_model.predict(X_test)

  return f(**kwargs)


In [20]:
# Training Evaluation

y_pred_t = svm_model.predict(X_train)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_train = confusion_matrix(y_pred_t, y_train)
print("Confusion Matrix:")
print(cm_train)

report_train = classification_report(y_pred_t, y_train)
print("Classification Report:",)
print (report_train)

accuracy_train = accuracy_score(y_pred_t,y_train)
print("Accuracy:",accuracy_train)

Confusion Matrix:
[[3680   49]
 [  28 1539]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3729
           1       0.97      0.98      0.98      1567

    accuracy                           0.99      5296
   macro avg       0.98      0.98      0.98      5296
weighted avg       0.99      0.99      0.99      5296

Accuracy: 0.9854607250755287


In [21]:
# Testing Evaluation Evaluation

y_pred = svm_model.predict(X_test)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_test = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:")
print(cm_test)

report_test = classification_report(y_pred, y_test)
print("Classification Report:",)
print (report_test)

accuracy_test = accuracy_score(y_pred,y_test)
print("Accuracy:",accuracy_test)

Confusion Matrix:
[[930  71]
 [ 16 308]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.96      1001
           1       0.81      0.95      0.88       324

    accuracy                           0.93      1325
   macro avg       0.90      0.94      0.92      1325
weighted avg       0.94      0.93      0.94      1325

Accuracy: 0.9343396226415094


<h3>NGram TF-IDF</h3>

In [23]:

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
X = tfidf_vect_ngram.fit_transform(reviews_1).toarray()

#dummyfying output variable
y=pd.get_dummies(data['Ratings'],drop_first=True)

In [27]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [33]:
# Training model using Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB(alpha=0.5).fit(X_train, y_train)

y_pred=nb_model.predict(X_test)

  return f(**kwargs)


In [34]:
# Training Evaluation

y_pred_t = nb_model.predict(X_train)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_train = confusion_matrix(y_pred_t, y_train)
print("Confusion Matrix:")
print(cm_train)

report_train = classification_report(y_pred_t, y_train)
print("Classification Report:",)
print (report_train)

accuracy_train = accuracy_score(y_pred_t,y_train)
print("Accuracy:",accuracy_train)

Confusion Matrix:
[[3609  214]
 [  99 1374]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96      3823
           1       0.87      0.93      0.90      1473

    accuracy                           0.94      5296
   macro avg       0.92      0.94      0.93      5296
weighted avg       0.94      0.94      0.94      5296

Accuracy: 0.9408987915407855


In [35]:
# Testing Evaluation Evaluation

y_pred = nb_model.predict(X_test)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_test = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:")
print(cm_test)

report_test = classification_report(y_pred, y_test)
print("Classification Report:",)
print (report_test)

accuracy_test = accuracy_score(y_pred,y_test)
print("Accuracy:",accuracy_test)

Confusion Matrix:
[[915  76]
 [ 31 303]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.92      0.94       991
           1       0.80      0.91      0.85       334

    accuracy                           0.92      1325
   macro avg       0.88      0.92      0.90      1325
weighted avg       0.92      0.92      0.92      1325

Accuracy: 0.9192452830188679


In [31]:
# Hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV

# Create the random grid
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0 ]}



# create classifier
nb_model= MultinomialNB()
nb_randomcv = RandomizedSearchCV(estimator=nb_model,param_distributions=params,n_iter=300,cv=10,verbose=2,
                               random_state=100,n_jobs=-1)

### fit the randomized model
nb_randomcv.fit(X_train,y_train)

# Provides best parameters
nb_randomcv.best_params_


best_random_grid=nb_randomcv.best_estimator_



Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   58.9s finished
  return f(**kwargs)


In [32]:
print(nb_randomcv.best_params_)

{'alpha': 0.5}
