In [1]:
import re
import numpy as np
import pandas as pd
import nltk

In [2]:
df = pd.read_csv("Amazon_Unlocked_Mobile.csv")

In [3]:
from sklearn import naive_bayes

In [4]:
clf = naive_bayes.MultinomialNB()

In [5]:
#sample data to speed up computation
df = df.sample(frac=0.1, random_state=10)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [6]:
print df.shape

(41384, 6)


In [7]:
#drop missing values
df.dropna(inplace=True)

In [8]:
#remove neutral ratings equal to 3
df = df[df["Rating"] != 3]

In [9]:
print df.shape

(30737, 6)


In [10]:
#Encode score of 4 and 5 as 1 (positive), 1 and 2 as negative (poor)
df["Positively Rated"] = np.where(df["Rating"] > 3, 1, 0)
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0,1
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0,1
279878,OtterBox 77-29864 Defender Series Hybrid Case ...,OtterBox,9.99,5,I've bought 3 no problems. Fast delivery.,0.0,1
406017,Verizon HTC Rezound 4G Android Smarphone - 8MP...,HTC,74.99,4,Great phone for the price...,0.0,1
302567,"RCA M1 Unlocked Cell Phone, Dual Sim, 5Mp Came...",RCA,159.99,5,My mom is not good with new technoloy but this...,4.0,1


In [11]:
#get average rating
df["Positively Rated"].mean()

0.7471776686078667

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
xtrain, xtest, ytrain, ytest = train_test_split(df["Reviews"], df["Positively Rated"], random_state=0)

In [14]:
print "Xtrain first entry:", xtrain.iloc[0]

Xtrain first entry: Everything about it is awesome!


In [15]:
print xtrain.shape

(23052L,)


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
vect = CountVectorizer().fit(xtrain)

In [20]:
vect.get_feature_names()[::2000]

[u'00',
 u'arroja',
 u'comapa\xf1ias',
 u'dvds',
 u'golden',
 u'lands',
 u'oil',
 u'razonable',
 u'smallsliver',
 u'tweak']

In [21]:
print len(vect.get_feature_names())

19601


In [22]:
#transform documents in training data to document-term matrix
xtrain_vector = vect.transform(xtrain)
xtrain_vector

<23052x19601 sparse matrix of type '<type 'numpy.int64'>'
	with 613289 stored elements in Compressed Sparse Row format>

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
model = LogisticRegression()
model.fit(xtrain_vector, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
from sklearn.metrics import roc_auc_score

In [26]:
preds = model.predict(vect.transform(xtest))
print "AUC:", roc_auc_score(ytest, preds)

AUC: 0.897433277667


In [27]:
feature_names = np.array(vect.get_feature_names())
#sort model coefficients
sorted_coef_index = model.coef_[0].argsort()

In [29]:
#find 10 smallest/largest coefficients
print "Smallest coefficients:", feature_names[sorted_coef_index[:10]]
print "Largest coefficients:", feature_names[sorted_coef_index[:-11:-1]]

Smallest coefficients: [u'worst' u'terrible' u'slow' u'junk' u'poor' u'sucks' u'horrible'
 u'useless' u'waste' u'disappointed']
Largest coefficients: [u'excelent' u'excelente' u'excellent' u'perfectly' u'love' u'perfect'
 u'exactly' u'great' u'best' u'awesome']


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
vect = TfidfVectorizer(min_df=5).fit(xtrain) #minimum of 5 documents for token to be part of vocab
len(vect.get_feature_names())

5442

In [32]:
xtrain_vector = vect.transform(xtrain)
model = LogisticRegression()
model.fit(xtrain_vector, ytrain)
preds = model.predict(vect.transform(xtest))
print "AUC:", roc_auc_score(ytest, preds)

AUC: 0.889951006492


In [33]:
feature_names = np.array(vect.get_feature_names())
#sort model coefficients
sorted_tfidf_index = xtrain_vector.max(0).toarray()[0].argsort()

In [34]:
#find 10 smallest/largest tfidf
print "Smallest tfidf:", feature_names[sorted_tfidf_index[:10]]
print "Largest tfidf:", feature_names[sorted_tfidf_index[:-11:-1]]

Smallest tfidf: [u'61' u'printer' u'approach' u'adjustment' u'consequences' u'length'
 u'emailing' u'degrees' u'handsfree' u'chipset']
Largest tfidf: [u'unlocked' u'handy' u'useless' u'cheat' u'up' u'original' u'exelent'
 u'exelente' u'exellent' u'satisfied']


In [36]:
#use ngram features, min document still 5 and extract 1-gram and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1, 2)).fit(xtrain)
xtrain_vector = vect.transform(xtrain)
print len(vect.get_feature_names())

 29072


In [37]:
model = LogisticRegression()
model.fit(xtrain_vector, ytrain)
preds = model.predict(vect.transform(xtest))
print "AUC:", roc_auc_score(ytest, preds)

AUC: 0.91106617946


In [38]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
[u'no good' u'junk' u'poor' u'slow' u'worst' u'broken' u'not good'
 u'terrible' u'defective' u'horrible']

Largest Coefs: 
[u'excellent' u'excelente' u'excelent' u'perfect' u'great' u'love'
 u'awesome' u'no problems' u'good' u'best']


In [39]:
# These reviews are now correctly identified
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]
