In [3]:
import pandas as pd
import re
from nltk import word_tokenize
from nltk.classify import NaiveBayesClassifier
import nltk.classify.util
from nltk.corpus import stopwords

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\e-rbnunez\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
df_reviews = pd.read_csv('amazon-cell-phones-reviews/20190928-reviews.csv')
df_reviews.loc[df_reviews['rating'] <= 3, 'liked'] = False
df_reviews.loc[df_reviews['rating'] > 3, 'liked'] = True
stop_words = set(stopwords.words('english'))


In [7]:
df_reviews.shape

(82815, 9)

In [9]:
threshold_factor = 0.7
model_index = int(threshold_factor * len(df_reviews))
print(f'model_index:{model_index}')
df_model_data = df_reviews.iloc[:model_index]

model_index:57970


In [10]:
threshold_factor = 0.8

positive_reviews = df_model_data[df_model_data['liked'] == True]
negative_reviews = df_model_data[df_model_data['liked'] == False]


In [11]:
m_str = 'this is a sentence.and this other 12.90'
re.sub("[^\w]", " ",  m_str).split()
def get_list_words(reviews_str):
    token = str(reviews_str)
    return re.sub("[^\w]", " ",  token).split()

def extract_features(word_list):
    return dict([(word, True) for word in word_list if word.lower() not in stop_words])
#if word not in stop_words

In [12]:
positive_series = [get_list_words(review) for review in positive_reviews['body'].values]
negative_series = [get_list_words(review) for review in negative_reviews['body'].values]

In [13]:
positive_features = [(extract_features(a_review), 'Positive') for a_review in positive_series]
negative_features = [(extract_features(a_review), 'Negative') for a_review in negative_series]

In [14]:
print(len(positive_features))
print(len(negative_features))

36984
20986


In [15]:
# Split the data into train and test (80/20)
threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(positive_features))
threshold_negative = int(threshold_factor * len(negative_features))

In [16]:
features_train = positive_features[:threshold_positive] + negative_features[:threshold_negative]
features_test = positive_features[threshold_positive:] + negative_features[threshold_negative:]  
print("\nNumber of training datapoints:", len(features_train))
print("Number of test datapoints:", len(features_test))


Number of training datapoints: 46375
Number of test datapoints: 11595


In [17]:
# Train a Naive Bayes classifier
classifier = NaiveBayesClassifier.train(features_train)
print("\nAccuracy of the classifier:", nltk.classify.util.accuracy(classifier, features_test))


Accuracy of the classifier: 0.8146614920224234


In [18]:
classifier.show_most_informative_features(15)

Most Informative Features
                Horrible = True           Negati : Positi =     88.7 : 1.0
                   Waste = True           Negati : Positi =     74.6 : 1.0
            Disappointed = True           Negati : Positi =     66.4 : 1.0
                   Worst = True           Negati : Positi =     65.2 : 1.0
               Excelente = True           Positi : Negati =     57.7 : 1.0
               Returning = True           Negati : Positi =     55.8 : 1.0
                Terrible = True           Negati : Positi =     51.1 : 1.0
                  Return = True           Negati : Positi =     45.2 : 1.0
                  LOCKED = True           Negati : Positi =     42.9 : 1.0
                 Perfect = True           Positi : Negati =     35.9 : 1.0
                   Avoid = True           Negati : Positi =     35.8 : 1.0
                  BEWARE = True           Negati : Positi =     32.5 : 1.0
             paperweight = True           Negati : Positi =     26.9 : 1.0

In [23]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


In [25]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, features_test))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(features_train)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, features_test))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(features_train)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, features_test))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(features_train)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, features_test))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(features_train)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, features_test))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(features_train)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, features_test))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(features_train)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, features_test))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(features_train)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, features_test))*100)

Original Naive Bayes Algo accuracy percent: 81.46614920224235
Most Informative Features
                Horrible = True           Negati : Positi =     88.7 : 1.0
                   Waste = True           Negati : Positi =     74.6 : 1.0
            Disappointed = True           Negati : Positi =     66.4 : 1.0
                   Worst = True           Negati : Positi =     65.2 : 1.0
               Excelente = True           Positi : Negati =     57.7 : 1.0
               Returning = True           Negati : Positi =     55.8 : 1.0
                Terrible = True           Negati : Positi =     51.1 : 1.0
                  Return = True           Negati : Positi =     45.2 : 1.0
                  LOCKED = True           Negati : Positi =     42.9 : 1.0
                 Perfect = True           Positi : Negati =     35.9 : 1.0
                   Avoid = True           Negati : Positi =     35.8 : 1.0
                  BEWARE = True           Negati : Positi =     32.5 : 1.0
            



LogisticRegression_classifier accuracy percent: 87.79646399310047
SGDClassifier_classifier accuracy percent: 88.09831824062096




SVC_classifier accuracy percent: 63.79473911168607




LinearSVC_classifier accuracy percent: 86.3130659767141




NuSVC_classifier accuracy percent: 84.62268219059939


In [26]:
from nltk.classify import ClassifierI
from statistics import mode

In [31]:
class UnoceroClassifier(ClassifierI):
    
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for a_classifier in self._classifiers:
            a_vote = a_classifier.classify(features)
            votes.append(a_vote)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for a_classifier in self._classifiers:
            a_vote = a_classifier.classify(features)
            votes.append(a_vote)
        choice_votes = votes.count(mode(votes))
        return choice_votes / len(votes)

In [32]:
unocero_classifier = UnoceroClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("unocero_classifier accuracy percent:", (nltk.classify.accuracy(unocero_classifier, features_test))*100)

unocero_classifier accuracy percent: 88.21043553255714


In [42]:
print(features_test[11][0])
print("Classification:", unocero_classifier.classify(features_test[0][0]), "Confidence %:",unocero_classifier.confidence(features_test[0][0])*100)
print("Classification:", unocero_classifier.classify(features_test[1][0]), "Confidence %:",unocero_classifier.confidence(features_test[1][0])*100)

{'far': True, 'good': True, 'goes': True, 'crazy': True, 'sometimes': True, 'gets': True, 'wet': True, 'still': True, 'working': True}
Classification: Positive Confidence %: 100.0
Classification: Positive Confidence %: 100.0
