In [50]:
import pandas as pd
import re
import numpy as np
from nltk import word_tokenize
from nltk.classify import NaiveBayesClassifier
import nltk.classify.util
from nltk.corpus import stopwords

## Exploración y transformación de datos

In [2]:
df_cell_phones = pd.read_csv('amazon-cell-phones-reviews/20190928-items.csv')
df_cell_phones.head(2)

Unnamed: 0,asin,brand,title,url,image,rating,reviewUrl,totalReviews,prices
0,B0000SX2UC,Nokia,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,
1,B0009N5L7K,Motorola,Motorola I265 phone,https://www.amazon.com/Motorola-i265-I265-phon...,https://m.media-amazon.com/images/I/419WBAVDAR...,2.9,https://www.amazon.com/product-reviews/B0009N5L7K,7,$49.95


In [3]:
df_reviews = pd.read_csv('amazon-cell-phones-reviews/20190928-reviews.csv')
df_reviews.head(2)

Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0
1,B0000SX2UC,Luke Wyatt,1,"January 7, 2004",False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0


In [4]:
df_reviews_merge = pd.merge(df_reviews, df_cell_phones, how="left", left_on="asin", right_on="asin")
print(df_reviews_merge.shape)
df_reviews_merge.head(2)

(82815, 16)


Unnamed: 0,asin,name,rating_x,date,verified,title_x,body,helpfulVotes,brand,title_y,url,image,rating_y,reviewUrl,totalReviews,prices
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0,Nokia,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,
1,B0000SX2UC,Luke Wyatt,1,"January 7, 2004",False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0,Nokia,Dual-Band / Tri-Mode Sprint PCS Phone w/ Voice...,https://www.amazon.com/Dual-Band-Tri-Mode-Acti...,https://m.media-amazon.com/images/I/2143EBQ210...,3.0,https://www.amazon.com/product-reviews/B0000SX2UC,14,


In [5]:
## Agregar columna 'appreciation'
df_reviews_merge.loc[df_reviews['rating'] < 3, 'appreciation'] = -1
df_reviews_merge.loc[df_reviews['rating'] > 3, 'appreciation'] = 1
df_reviews_merge.loc[df_reviews['rating'] == 3, 'appreciation'] = 0
stop_words = set(stopwords.words('english'))


## Obtener datos para el modelo(entrenamiento y test)

In [15]:
threshold_factor = 0.7
model_index = int(threshold_factor * len(df_reviews))
df_model_data = df_reviews_merge.iloc[:model_index]

threshold_factor = 0.8

positive_reviews = df_model_data[df_model_data['appreciation'] == 1]
negative_reviews = df_model_data[df_model_data['appreciation'] <= 0]
#neutral_reviews = df_model_data[df_model_data['appreciation'] <= 0]


In [16]:
# definir funciones
def get_list_words(reviews_str):
    token = str(reviews_str)
    return re.sub("[^\w]", " ",  token).split()

def extract_features(word_list):
    return dict([(word, True) for word in word_list if word.lower() not in stop_words])


In [17]:
positive_series = [get_list_words(review) for review in positive_reviews['body'].values]
negative_series = [get_list_words(review) for review in negative_reviews['body'].values]
neutral_series = [get_list_words(review) for review in neutral_reviews['body'].values]

positive_features = [(extract_features(a_review), 'Positive') for a_review in positive_series]
negative_features = [(extract_features(a_review), 'Negative') for a_review in negative_series]
neutral_features = [(extract_features(a_review), 'Neutral') for a_review in neutral_series]

### Separar dataset para entrenamiento y pruebas

In [18]:
# Split the data into train and test (80/20)
threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(positive_features))
threshold_negative = int(threshold_factor * len(negative_features))
threshold_neutral = int(threshold_factor * len(neutral_features))

features_train = positive_features[:threshold_positive] + negative_features[:threshold_negative]
features_test = positive_features[threshold_positive:] + negative_features[threshold_negative:]

print("\nNumber of training datapoints:", len(features_train))
print("Number of test datapoints:", len(features_test))


Number of training datapoints: 46375
Number of test datapoints: 11595


### Entrenar con NaiveBayes

In [19]:
# Train a Naive Bayes classifier
classifier = NaiveBayesClassifier.train(features_train)
print("\nAccuracy of the classifier:", nltk.classify.util.accuracy(classifier, features_test))
classifier.show_most_informative_features(15)


Accuracy of the classifier: 0.8146614920224234
Most Informative Features
                Horrible = True           Negati : Positi =     88.7 : 1.0
                   Waste = True           Negati : Positi =     74.6 : 1.0
            Disappointed = True           Negati : Positi =     66.4 : 1.0
                   Worst = True           Negati : Positi =     65.2 : 1.0
               Excelente = True           Positi : Negati =     57.7 : 1.0
               Returning = True           Negati : Positi =     55.8 : 1.0
                Terrible = True           Negati : Positi =     51.1 : 1.0
                  Return = True           Negati : Positi =     45.2 : 1.0
                  LOCKED = True           Negati : Positi =     42.9 : 1.0
                 Perfect = True           Positi : Negati =     35.9 : 1.0
                   Avoid = True           Negati : Positi =     35.8 : 1.0
                  BEWARE = True           Negati : Positi =     32.5 : 1.0
             paperweight =

### Entrenar con otros clasificadores

In [20]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC


In [21]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, features_test))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(features_train)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, features_test))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(features_train)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, features_test))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression(solver='lbfgs', multi_class='auto'))
LogisticRegression_classifier.train(features_train)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, features_test))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(features_train)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, features_test))*100)

#SVC_classifier = SklearnClassifier(SVC())
#SVC_classifier.train(features_train)
#print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, features_test))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(features_train)
LinearSVC_classifier.
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, features_test))*100)

#NuSVC_classifier = SklearnClassifier(NuSVC(gamma='auto'))
#NuSVC_classifier.train(features_train)
#print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, features_test))*100)

Original Naive Bayes Algo accuracy percent: 81.46614920224235
Most Informative Features
                Horrible = True           Negati : Positi =     88.7 : 1.0
                   Waste = True           Negati : Positi =     74.6 : 1.0
            Disappointed = True           Negati : Positi =     66.4 : 1.0
                   Worst = True           Negati : Positi =     65.2 : 1.0
               Excelente = True           Positi : Negati =     57.7 : 1.0
               Returning = True           Negati : Positi =     55.8 : 1.0
                Terrible = True           Negati : Positi =     51.1 : 1.0
                  Return = True           Negati : Positi =     45.2 : 1.0
                  LOCKED = True           Negati : Positi =     42.9 : 1.0
                 Perfect = True           Positi : Negati =     35.9 : 1.0
                   Avoid = True           Negati : Positi =     35.8 : 1.0
                  BEWARE = True           Negati : Positi =     32.5 : 1.0
            



LogisticRegression_classifier accuracy percent: 87.79646399310047
SGDClassifier_classifier accuracy percent: 87.96895213454074




LinearSVC_classifier accuracy percent: 86.3130659767141


### Crear mi propio Clasificador que combina los anteriores

In [30]:
from nltk.classify import ClassifierI
from statistics import mode
from collections import Counter

class UnoceroClassifier(ClassifierI):
    
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for a_classifier in self._classifiers:
            a_vote = a_classifier.classify(features)
            votes.append(a_vote)
        m_counter = Counter(votes)
        return m_counter.most_common(1)[0][0]
    
    def confidence(self, features):
        votes = []
        for a_classifier in self._classifiers:
            a_vote = a_classifier.classify(features)
            votes.append(a_vote)
        choice_votes = votes.count(mode(votes))
        return choice_votes / len(votes)
    

In [31]:
unocero_classifier = UnoceroClassifier(classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier)

print("unocero_classifier accuracy percent:", (nltk.classify.accuracy(unocero_classifier, features_test))*100)

unocero_classifier accuracy percent: 88.17593790426909


In [34]:
print(f'Clasificar la reseña:{features_test[0][0]}')
print("Classification:", unocero_classifier.classify(features_test[0][0]), "Confidence %:",unocero_classifier.confidence(features_test[0][0])*100)

Clasificar la reseña:{'Love': True, 'phone': True, 'Everything': True, 'worked': True, 'great': True, 'far': True, 'good': True, 'anyways': True}
Classification: Positive Confidence %: 100.0


In [40]:
predicted = [unocero_classifier.classify(a_feature[0]) for a_feature in features_test]
real = [a_feature[1] for a_feature in features_test]

In [55]:
#print(f'{np.shape(predicted)}  {np.shape(real)}')
pd.crosstab(np.array(real), np.array(predicted), rownames=['Actual Rate'], colnames=['Predicted Rate'])
#from sklearn.metrics import classification_report,confusion_matrix
#print(confusion_matrix(real,predicted))


Predicted Rate,Negative,Positive
Actual Rate,Unnamed: 1_level_1,Unnamed: 2_level_1
Negative,3609,589
Positive,782,6615


In [77]:
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from scipy.stats import norm

fpr, tpr, thresholds = roc_curve(np.array(real), np.array(predicted), pos_label = "Positive")
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='NB')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()


UFuncTypeError: ufunc 'subtract' did not contain a loop with signature matching types (dtype('<U8'), dtype('<U8')) -> dtype('<U8')

In [83]:
dict(zip(('a','b','c','d','e'),(1,2,3,4,5)))

help(zip)

Help on class zip in module builtins:

class zip(object)
 |  zip(*iterables) --> zip object
 |  
 |  Return a zip object whose .__next__() method returns a tuple where
 |  the i-th element comes from the i-th iterable argument.  The .__next__()
 |  method continues until the shortest iterable in the argument sequence
 |  is exhausted and then it raises StopIteration.
 |  
 |  Methods defined here:
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __next__(self, /)
 |      Implement next(self).
 |  
 |  __reduce__(...)
 |      Return state information for pickling.
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.

