In [1]:
# import libraries

import pandas as pd
import numpy as np

In [2]:
# Read data

data = pd.read_csv('Amazon_customer_Reviews.csv')
data.drop('Unnamed: 0',axis=1,inplace=True)
data.head()

Unnamed: 0,Ratings,Reviews
0,Good,"I use Amazon.com often, and 99% of the time, ..."
1,Good,With Amazon you can leisurely shop in the com...
2,Good,I am very happy with all my purchases since I...
3,Good,"I absolutely love Amazon. Their selections, pr..."
4,Good,We have shopped Amazon for years and always fi...


In [4]:
data['Ratings'].value_counts()

Bad     4654
Good    1967
Name: Ratings, dtype: int64

In [3]:
#Data cleaning and preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()

In [4]:
# Function for preprocessing
def preprocessing(data):    
    
    reviews =  []
    for i in range(0, len(data)):
        # Replacing values in rows
        review = data['Reviews'][i]
        review = review.replace('$','dollars')
    
        # keeping only text and numbers in reviews
        review = re.sub('%', ' percent', review)
        review = re.sub('[^a-zA-Z0-9/]', ' ', review)
        review = review.lower()
        review = review.split()
    
        # Removing Stopwords
        review = [wordnet.lemmatize(word) for word in review if not word in stopwords.words('english')]
        review = ' '.join(review)
    
        reviews.append(review)
    
    return reviews
        
reviews_1 = preprocessing(data)

In [22]:
# Converting to dataframe and then csv file of pre processed data

reviews_pre = pd.DataFrame(reviews_1)

reviews_pre.to_csv('preprocessed.csv')

<h3>Bag of Words Model</h3>

In [5]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(reviews_1).toarray()

#dummyfying output variable
y=pd.get_dummies(data['Ratings'],drop_first=True)


In [7]:
# Save Count vectorizer
#from sklearn.externals import joblib
import pickle

pickle.dump(cv, open('tranform.pkl', 'wb'))

In [8]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)


<h3>Naive Bayes Classifier</h3>

In [93]:
# Training model using Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=0.01).fit(X_train, y_train)

y_pred=model.predict(X_test)

  return f(**kwargs)


In [94]:
# Training Evaluation

y_pred_t = model.predict(X_train)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_train = confusion_matrix(y_pred_t, y_train)
print("Confusion Matrix:")
print(cm_train)

report_train = classification_report(y_pred_t, y_train)
print("Classification Report:",)
print (report_train)

accuracy_train = accuracy_score(y_pred_t,y_train)
print("Accuracy:",accuracy_train)


Confusion Matrix:
[[3590  103]
 [ 118 1485]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3693
           1       0.94      0.93      0.93      1603

    accuracy                           0.96      5296
   macro avg       0.95      0.95      0.95      5296
weighted avg       0.96      0.96      0.96      5296

Accuracy: 0.9582703927492447


In [95]:
# Testing Evaluation Evaluation

y_pred = model.predict(X_test)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_test = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:")
print(cm_test)

report_test = classification_report(y_pred, y_test)
print("Classification Report:",)
print (report_test)

accuracy_test = accuracy_score(y_pred,y_test)
print("Accuracy:",accuracy_test)

Confusion Matrix:
[[916  59]
 [ 30 320]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       975
           1       0.84      0.91      0.88       350

    accuracy                           0.93      1325
   macro avg       0.91      0.93      0.92      1325
weighted avg       0.94      0.93      0.93      1325

Accuracy: 0.9328301886792453


In [62]:
# Hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV

# Create the random grid
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0 ]}



# create classifier
nb_model= MultinomialNB()
nb_randomcv = RandomizedSearchCV(estimator=nb_model,param_distributions=params,n_iter=300,cv=10,verbose=2,
                               random_state=100,n_jobs=-1)

### fit the randomized model
nb_randomcv.fit(X_train,y_train)

# Provides best parameters
nb_randomcv.best_params_


best_random_grid=nb_randomcv.best_estimator_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.1min finished
  return f(**kwargs)


In [63]:
print(nb_randomcv.best_params_)

{'alpha': 0.01}


<h3>Linear Classifier</h3>

In [9]:
# Training model using Logistic regression
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(solver= 'newton-cg', penalty= 'l2', C=0.1).fit(X_train, y_train)

y_pred=log_model.predict(X_test)

  return f(**kwargs)


In [110]:
# Training Evaluation

y_pred_t = log_model.predict(X_train)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_train = confusion_matrix(y_pred_t, y_train)
print("Confusion Matrix:")
print(cm_train)

report_train = classification_report(y_pred_t, y_train)
print("Classification Report:",)
print (report_train)

accuracy_train = accuracy_score(y_pred_t,y_train)
print("Accuracy:",accuracy_train)


Confusion Matrix:
[[3655   98]
 [  53 1490]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      3753
           1       0.94      0.97      0.95      1543

    accuracy                           0.97      5296
   macro avg       0.96      0.97      0.97      5296
weighted avg       0.97      0.97      0.97      5296

Accuracy: 0.971487915407855


In [111]:
# Testing Evaluation Evaluation

y_pred = log_model.predict(X_test)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_test = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:")
print(cm_test)

report_test = classification_report(y_pred, y_test)
print("Classification Report:",)
print (report_test)

accuracy_test = accuracy_score(y_pred,y_test)
print("Accuracy:",accuracy_test)

Confusion Matrix:
[[918  55]
 [ 28 324]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       973
           1       0.85      0.92      0.89       352

    accuracy                           0.94      1325
   macro avg       0.91      0.93      0.92      1325
weighted avg       0.94      0.94      0.94      1325

Accuracy: 0.9373584905660377


In [106]:
# Hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV

# Create the random grid
params = {'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
         'penalty':['none','l1','l2','elasticnet'],
         'C' : [100, 10, 1.0, 0.1, 0.01]}



# create classifier
nb_model= MultinomialNB()
nb_randomcv = RandomizedSearchCV(estimator=log_model,param_distributions=params,n_iter=300,cv=10,verbose=2,
                               random_state=100,n_jobs=-1)

### fit the randomized model
nb_randomcv.fit(X_train,y_train)

# Provides best parameters
nb_randomcv.best_params_


best_random_grid=nb_randomcv.best_estimator_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 28.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 56.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 90.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 132.5min finished
  return f(**kwargs)


In [107]:
print(nb_randomcv.best_params_)

{'solver': 'newton-cg', 'penalty': 'l2', 'C': 0.1}


In [11]:
# Save Count vectorizer
#from sklearn.externals import joblib
import pickle

pickle.dump(log_model, open('clf.pkl', 'wb'))

<h3>Support Vector Machine</h3>

In [8]:
# Training model using Support Vector Machine

from sklearn import svm
svm_model = svm.SVC()
svm_model.fit(X_train,y_train)

y_pred=svm_model.predict(X_test)

  return f(**kwargs)


In [26]:
# Training Evaluation

y_pred_t = svm_model.predict(X_train)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_train = confusion_matrix(y_pred_t, y_train)
print("Confusion Matrix:")
print(cm_train)

report_train = classification_report(y_pred_t, y_train)
print("Classification Report:",)
print (report_train)

accuracy_train = accuracy_score(y_pred_t,y_train)
print("Accuracy:",accuracy_train)

Confusion Matrix:
[[3644  158]
 [  64 1430]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      3802
           1       0.90      0.96      0.93      1494

    accuracy                           0.96      5296
   macro avg       0.94      0.96      0.95      5296
weighted avg       0.96      0.96      0.96      5296

Accuracy: 0.9580815709969789


In [27]:
# Testing Evaluation Evaluation

y_pred = svm_model.predict(X_test)

# Classification metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
cm_test = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:")
print(cm_test)

report_test = classification_report(y_pred, y_test)
print("Classification Report:",)
print (report_test)

accuracy_test = accuracy_score(y_pred,y_test)
print("Accuracy:",accuracy_test)

Confusion Matrix:
[[920  63]
 [ 26 316]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.94      0.95       983
           1       0.83      0.92      0.88       342

    accuracy                           0.93      1325
   macro avg       0.90      0.93      0.92      1325
weighted avg       0.94      0.93      0.93      1325

Accuracy: 0.9328301886792453


In [None]:
# Hyperparameter tuning

from sklearn.model_selection import RandomizedSearchCV

# Create the random grid
params = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['linear', 'poly', 'rbf']} 


# create classifier
svm_model = svm.SVC()
svm_randomcv = RandomizedSearchCV(estimator=svm_model,param_distributions=params,n_iter=300,cv=10,verbose=2,
                               random_state=100,n_jobs=-1)

### fit the randomized model
svm_randomcv.fit(X_train,y_train)

# Provides best parameters
svm_randomcv.best_params_


best_random_grid=svm_randomcv.best_estimator_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 10 folds for each of 75 candidates, totalling 750 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 31.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 135.9min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 324.0min
