In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
# Read the review dataset from a CSV expected in the working directory and
# preview its first rows.
data = pd.read_csv("Amazon_reviews.csv")
print(data.head())

                                        Product Name Brand Name   Price  \
0  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
1  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
2  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
3  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   
4  "CLEAR CLEAN ESN" Sprint EPIC 4G Galaxy SPH-D7...    Samsung  199.99   

   Rating                                            Reviews  Review Votes  
0       5  I feel so LUCKY to have found this used (phone...           1.0  
1       4  nice phone, nice up grade from my pantach revu...           0.0  
2       5                                       Very pleased           0.0  
3       4  It works good but it goes slow sometimes but i...           0.0  
4       4  Great phone to replace my lost phone. The only...           0.0  


In [3]:
# Drop rows with any missing value and remove neutral (3-star) reviews.
# Rebinding instead of `inplace=True`, plus an explicit `.copy()`, means the
# new column below is written to an independent frame rather than to a slice
# of the loaded one (which triggers pandas' SettingWithCopyWarning).
data = data.dropna()
data = data[data['Rating'] != 3].copy()

# Binary label: 1 for ratings above 3, 0 otherwise (only 1, 2, 4, 5 remain).
data['Sentiment'] = np.where(data['Rating'] > 3, 1, 0)

# Remaining row/column count and the share of positive labels.
print(data.shape)
print(data['Sentiment'].mean())

(308277, 7)
0.7482686025879323


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Two-stage split with a fixed seed (random_state=1): first hold out 15% of the
# reviews as the test set, then split 23.5% of the remainder off as validation.
X_train, X_test, y_train, y_test = train_test_split(data['Reviews'], data['Sentiment'], test_size=0.15, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.235, random_state=1)
# NOTE(review): the second line rebinds X_train/y_train, so re-running this cell
# alone is still safe only because the first line recreates them from `data`.
print("X_train:", X_train.shape)
print("X_val:", X_val.shape)
print("X_test:", X_test.shape)


X_train: (200456,)
X_val: (61579,)
X_test: (46242,)


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Learn the token vocabulary from the training reviews only, then show every
# 5000th vocabulary entry and the total vocabulary size.
countVect = CountVectorizer().fit(X_train)
print(countVect.get_feature_names()[::5000])
print("Feature count:", len(countVect.get_feature_names()))

['00', 'anounced', 'chartered', 'displaythe', 'functionsmeasurementslength', 'itits', 'nand', 'primera', 'selectboard', 'thid', 'wouuuuuu']
Feature count: 50863


In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# Encode the training reviews as a document-term count matrix using the fitted vocabulary.
X_train_vectorized = countVect.transform(X_train)

In [10]:
# Logistic regression on the raw count features. In scikit-learn C is the
# inverse regularisation strength, so C=200 is a lightly regularised fit.
model = LogisticRegression(C = 200)
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=200, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Hard class predictions for the validation reviews (encoded with the same vocabulary).
predictions = model.predict(countVect.transform(X_val))

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [13]:
# Validation metrics for the count-vector logistic regression.
# NOTE(review): average_precision_score is documented to take continuous scores
# (probabilities / decision values); here it receives hard class predictions —
# confirm this is intended.
print("Accuracy score", accuracy_score(y_val, predictions))
print("f1_score:", f1_score(y_val, predictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, predictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, predictions, average = 'weighted'))

Accuracy score 0.9521590152487049
f1_score: 0.9517986245433183
average_precision_score(weighted): 0.9560651601937877
recall_score(weighted): 0.9521590152487049


In [14]:
# Hard class predictions for the held-out test reviews from the same model.
testPredictions = model.predict(countVect.transform(X_test))

In [15]:
# Test-set metrics for the count-vector logistic regression.
# NOTE(review): average_precision_score is fed hard labels rather than scores — confirm.
print("Accuracy score", accuracy_score(y_test, testPredictions))
print("f1_score(weighted):", f1_score(y_test, testPredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, testPredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, testPredictions, average = 'weighted'))

Accuracy score 0.9523809523809523
f1_score(weighted): 0.9520365125338477
average_precision_score(weighted): 0.9561505149836593
recall_score(weighted): 0.9523809523809523


In [16]:
# Vocabulary terms as an array so they can be indexed with an index array.
feature_names = np.array(countVect.get_feature_names())

# Indices of the coefficients in ascending order of weight.
sorted_coef_index = model.coef_[0].argsort()

# First ten indices = most negative weights; the reversed tail slice = ten most positive.
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['mony' 'false' 'worst' 'horribly' 'unsatisfied' 'worthless' 'nope'
 'messing' 'nit' 'lemon']

Largest Coefs: 
['excelent' 'excelente' '4eeeks' 'superb' 'efficient' 'exelente'
 'pleasantly' 'lovely' 'matching' 'satisfy']


In [17]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Punctuation that text_prepare turns into a single space.
# Raw strings: `\[`, `\]` and `\|` are not valid Python string escapes, so the
# non-raw form emits an invalid-escape warning even though the regex is the same.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,#;]')
# Any character that is not a digit, a lowercase ASCII letter, a space or an
# underscore; text_prepare deletes these.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z _]')
# English stopword list from NLTK, as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """Normalise one review string for the hand-built bag-of-words model.

    The text is lowercased, characters matched by REPLACE_BY_SPACE_RE become
    spaces, characters matched by BAD_SYMBOLS_RE are removed, the result is
    tokenised with word_tokenize, and tokens found in STOPWORDS are dropped.

    Returns the surviving tokens joined by single spaces (an empty string
    when nothing survives).
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(" ", lowered)
    cleaned = BAD_SYMBOLS_RE.sub("", spaced)
    kept_tokens = [token for token in word_tokenize(cleaned) if token not in STOPWORDS]
    return " ".join(kept_tokens)

In [18]:
# Run text_prepare over every review in each split; the results are plain
# Python lists of cleaned strings, in the same order as the source splits.
X2_train = [text_prepare(x) for x in X_train]
X2_val = [text_prepare(x) for x in X_val]
X2_test = [text_prepare(x) for x in X_test]

In [19]:
# Tally how often each token occurs across the cleaned training reviews.
words_counts = {}
for review in X2_train:
    for word in review.split():
        words_counts[word] = words_counts.get(word, 0) + 1

# Ten most frequent tokens as (token, count) pairs, highest count first.
by_frequency = sorted(words_counts.items(), key=lambda item: item[1], reverse=True)
most_common_words = by_frequency[:10]

print(most_common_words)

[('phone', 209054), ('great', 51217), ('good', 47339), ('one', 31106), ('like', 28646), ('screen', 27201), ('use', 26345), ('battery', 25525), ('works', 24194), ('love', 23325)]


In [20]:
# Number of vocabulary slots in the hand-built bag-of-words representation.
DICT_SIZE = 5000

# The DICT_SIZE most frequent training tokens as (token, count) pairs.
most_common_words = sorted(words_counts.items(), key=lambda item: item[1], reverse=True)[:DICT_SIZE]

# Token -> column index, numbered in descending-frequency order starting at 0.
WORDS_TO_INDEX = {
    word: position
    for position, (word, _count) in enumerate(most_common_words)
}

def my_bag_of_words(text, words_to_index, dict_size):
    """Count occurrences of known words in a whitespace-separated string.

    text: string whose whitespace-separated pieces are treated as words.
    words_to_index: mapping from word to its position in the output vector.
    dict_size: length of the output vector.

    Returns a float NumPy vector of length dict_size where position
    words_to_index[w] holds how many times w appears in text; words missing
    from the mapping are ignored.
    """
    counts = np.zeros(dict_size)
    known_slots = (words_to_index[token] for token in text.split() if token in words_to_index)
    for slot in known_slots:
        counts[slot] += 1
    return counts

In [21]:
from scipy import sparse as sp_sparse

In [22]:
# Encode each cleaned review as a single sparse row of width DICT_SIZE and
# stack the rows into one sparse matrix per split.
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X2_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X2_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X2_test])
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (200456, 5000)
X_val shape  (61579, 5000)
X_test shape  (46242, 5000)


In [23]:
# Logistic regression (C=200) trained on the hand-built bag-of-words matrix.
classifier = LogisticRegression(C = 200).fit(X_train_mybag, y_train)

In [24]:
# Hard class predictions for the validation split of the hand-built features.
predictions2 = classifier.predict(X_val_mybag)

In [25]:
# Validation metrics for the hand-built bag-of-words model.
# NOTE(review): average_precision_score is fed hard labels rather than scores — confirm.
print("Accuracy score", accuracy_score(y_val, predictions2))
print("f1_score(weighted):", f1_score(y_val, predictions2, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, predictions2, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, predictions2, average = 'weighted'))

Accuracy score 0.9364556098670002
f1_score(weighted): 0.935654715781444
average_precision_score(weighted): 0.9408343638316503
recall_score(weighted): 0.9364556098670002


In [26]:
# Hard class predictions for the test split of the hand-built features.
testPredictions2 = classifier.predict(X_test_mybag)

In [27]:
# Test-set metrics for the hand-built bag-of-words model.
# NOTE(review): average_precision_score is fed hard labels rather than scores — confirm.
print("Accuracy score", accuracy_score(y_test, testPredictions2))
print("f1_score(weighted):", f1_score(y_test, testPredictions2, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, testPredictions2, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, testPredictions2, average = 'weighted'))

Accuracy score 0.9354699191211453
f1_score(weighted): 0.9345606685708138
average_precision_score(weighted): 0.9388241660868866
recall_score(weighted): 0.9354699191211453


In [26]:
# Disabled scratch cell: the code below sits inside a string literal, so none
# of it runs; the cell only echoes the string.
# NOTE(review): the last line passes countVect features to `classifier`, which
# was fitted on the my_bag_of_words matrix — looks mismatched; confirm before
# re-enabling.
'''
Trialset = ['Bad phone']
X2_trial = [text_prepare(x) for x in Trialset]
X_trial_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X2_trial])
print(classifier.predict(countVect.transform(['phone is good', 'not good'])))
'''

"\nTrialset = ['Bad phone']\nX2_trial = [text_prepare(x) for x in Trialset]\nX_trial_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X2_trial])\nprint(classifier.predict(countVect.transform(['phone is good', 'not good'])))\n"

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# TF-IDF vocabulary learned from the training reviews; min_df=5 drops terms
# that appear in fewer than five training documents. Prints the vocabulary size.
tfidfVect = TfidfVectorizer(min_df=5).fit(X_train)
print(len(tfidfVect.get_feature_names()))

16725


In [30]:
# TF-IDF matrix for the training reviews.
tfX_train_vectorized = tfidfVect.transform(X_train)

In [31]:
# Logistic regression on the TF-IDF features, with C=50.
tfmodel = LogisticRegression(C = 50)
tfmodel.fit(tfX_train_vectorized, y_train)

LogisticRegression(C=50, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
# Hard class predictions for the validation reviews from the TF-IDF model.
tfpredictions = tfmodel.predict(tfidfVect.transform(X_val))

In [33]:
# Validation metrics for the TF-IDF logistic regression.
# NOTE(review): average_precision_score is fed hard labels rather than scores — confirm.
print("Accuracy score", accuracy_score(y_val, tfpredictions))
print("f1_score(weighted):", f1_score(y_val, tfpredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, tfpredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, tfpredictions, average = 'weighted'))

Accuracy score 0.955861576186687
f1_score(weighted): 0.9557326389086596
average_precision_score(weighted): 0.9619931937214505
recall_score(weighted): 0.955861576186687


In [34]:
# Hard class predictions for the test reviews from the TF-IDF model.
testTfpredictions = tfmodel.predict(tfidfVect.transform(X_test))

In [35]:
# Test-set metrics for the TF-IDF logistic regression.
# NOTE(review): average_precision_score is fed hard labels rather than scores — confirm.
print("Accuracy score", accuracy_score(y_test, testTfpredictions))
print("f1_score(weighted):", f1_score(y_test, testTfpredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, testTfpredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, testTfpredictions, average = 'weighted'))

Accuracy score 0.9554085030924268
f1_score(weighted): 0.9552966629601434
average_precision_score(weighted): 0.9616930026787053
recall_score(weighted): 0.9554085030924268


In [36]:
# TF-IDF vocabulary terms as an array so they can be indexed with an index array.
tffeature_names = np.array(tfidfVect.get_feature_names())

# For every term take its maximum TF-IDF weight over the training reviews,
# then order the term indices by that maximum (ascending).
sorted_tfidf_index = tfX_train_vectorized.max(0).toarray()[0].argsort()

# Ten terms with the lowest maximum weight, then ten with the highest.
print('Smallest tfidf:\n{}\n'.format(tffeature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(tffeature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['1300' 'messiah' 'v7' 'keynote' 'ionized' 'bigtime' 'hosts' 'brawns'
 'bridging' '1b']

Largest tfidf: 
['buen' 'top' 'fire' 'fits' 'brilliant' 'five' 'flimsy' 'too' 'a1' 'bravo']


In [37]:
# Indices of the TF-IDF model's coefficients in ascending order of weight.
tfsorted_coef_index = tfmodel.coef_[0].argsort()

# Ten most negative and ten most positive terms for the TF-IDF model.
print('Smallest Coefs:\n{}\n'.format(tffeature_names[tfsorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(tffeature_names[tfsorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worst' 'mony' 'jun' 'prediction' 'theory' 'false' 'remembering' 'pos'
 'nope' 'expiration']

Largest Coefs: 
['wacky' 'love' 'aviv' '4eeeks' 'great' 'tact' 'pleasantly' 'hesitate'
 'excellent' 'amazing']


In [38]:
# Spot-check the TF-IDF model on two single-word inputs.
print(tfmodel.predict(tfidfVect.transform(['love',
                                    'working'])))

[1 1]


In [39]:
# Count features over unigrams and bigrams (ngram_range=(1,2)), dropping
# n-grams seen in fewer than five training reviews. Prints the vocabulary size.
ngramVect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
print(len(ngramVect.get_feature_names()))

179049


In [40]:
# Unigram+bigram count matrix for the training reviews.
ngramX_train_vectorized = ngramVect.transform(X_train)

In [41]:
# Logistic regression on the unigram+bigram counts, with C=15.
ngramModel = LogisticRegression(C = 15)
ngramModel.fit(ngramX_train_vectorized, y_train)

LogisticRegression(C=15, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [42]:
# Hard class predictions for the validation reviews from the n-gram model.
ngramPredictions = ngramModel.predict(ngramVect.transform(X_val))

In [43]:
# Validation metrics for the n-gram logistic regression.
# NOTE(review): average_precision_score is fed hard labels rather than scores — confirm.
print("Accuracy score", accuracy_score(y_val, ngramPredictions))
print("f1_score(weighted):", f1_score(y_val, ngramPredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, ngramPredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, ngramPredictions, average = 'weighted'))

Accuracy score 0.975527371344127
f1_score(weighted): 0.9754771704157668
average_precision_score(weighted): 0.9783579085479265
recall_score(weighted): 0.975527371344127


In [44]:
# Hard class predictions for the test reviews from the n-gram model.
testNgramPredictions = ngramModel.predict(ngramVect.transform(X_test))

In [45]:
# Test-set metrics for the n-gram logistic regression.
# NOTE(review): average_precision_score is fed hard labels rather than scores — confirm.
print("Accuracy score", accuracy_score(y_test, testNgramPredictions))
print("f1_score(weighted):", f1_score(y_test, testNgramPredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, testNgramPredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, testNgramPredictions, average = 'weighted'))

Accuracy score 0.9761688508282513
f1_score(weighted): 0.9761118295595008
average_precision_score(weighted): 0.9785059251371223
recall_score(weighted): 0.9761688508282513


In [46]:
# N-gram vocabulary as an array so it can be indexed with an index array.
ngramfeature_names = np.array(ngramVect.get_feature_names())

# Indices of the n-gram model's coefficients in ascending order of weight.
ngramsorted_coef_index = ngramModel.coef_[0].argsort()

# Ten most negative and ten most positive n-grams.
print('Smallest Coefs:\n{}\n'.format(ngramfeature_names[ngramsorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(ngramfeature_names[ngramsorted_coef_index[:-11:-1]]))

Smallest Coefs:
['good get' 'perfect no' 'knot' 'junk' 'middling' 'to happy' 'one star'
 'holster' 'garbage' 'worst']

Largest Coefs: 
['best not' 'not bad' 'no problems' 'all like' 'pictures quality' 'no bad'
 'no issues' 'not too' 'no problem' 'bad it']


In [47]:
# Spot-check the n-gram model on three hand-written phrases.
print(ngramModel.predict(ngramVect.transform(['phone is good',
                                    'an issue, phone is not working', 'horrible phone'])))

[1 0 0]


In [None]:
from sklearn import svm

In [None]:
# Separate count vocabulary for the SVM experiments, fitted on the training
# reviews with min_df=5.
clfVect = CountVectorizer(min_df=5).fit(X_train)


In [None]:
# Encode the training reviews with the SVM-specific vocabulary.
clfX_train_vectorized = clfVect.transform(X_train)
# Report the size of the vocabulary used in this section. The count comes from
# clfVect (min_df=5), not from the unrelated countVect fitted earlier.
print("Feature count:", len(clfVect.get_feature_names()))

In [12]:
# Linear SVM with default hyper-parameters, trained on the min_df=5 count features.
clf = svm.LinearSVC()
clf.fit(clfX_train_vectorized, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [None]:
# Nu-SVM with a degree-2 polynomial kernel on the same count features.
# The kernel name needs its closing quote so that `degree` is parsed as a
# separate keyword argument; without it the cell is a SyntaxError.
# NOTE(review): this cell has no recorded execution and clf2 is not used in any
# later visible cell — confirm it is still wanted.
clf2 = svm.NuSVC(kernel='poly', degree=2)
clf2.fit(clfX_train_vectorized, y_train)

In [13]:
# Hard class predictions for the validation reviews from the linear SVM.
clfpredictions = clf.predict(clfVect.transform(X_val))

In [14]:
# Validation metrics for the linear SVM.
# NOTE(review): average_precision_score is fed hard labels rather than scores — confirm.
print("Accuracy score", accuracy_score(y_val, clfpredictions))
print("f1_score(weighted):", f1_score(y_val, clfpredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_val, clfpredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_val, clfpredictions, average = 'weighted'))

Accuracy score 0.9521102973416262
f1_score(weighted): 0.9518790080001326
average_precision_score(weighted): 0.9577655231535342
recall_score(weighted): 0.9521102973416262


In [15]:
# Hard class predictions for the test reviews from the linear SVM.
TestClfpredictions = clf.predict(clfVect.transform(X_test))

In [16]:
# Test-set metrics for the linear SVM.
# NOTE(review): average_precision_score is fed hard labels rather than scores — confirm.
print("Accuracy score", accuracy_score(y_test, TestClfpredictions))
print("f1_score(weighted):", f1_score(y_test, TestClfpredictions, average = 'weighted'))
print("average_precision_score(weighted):", average_precision_score(y_test, TestClfpredictions, average = 'weighted'))
print("recall_score(weighted):", recall_score(y_test, TestClfpredictions, average = 'weighted'))

Accuracy score 0.9533108429566195
f1_score(weighted): 0.9530618905281582
average_precision_score(weighted): 0.9581256805105789
recall_score(weighted): 0.9533108429566195


In [None]:
# SVM vocabulary as an array so it can be indexed with an index array.
clffeature_names = np.array(clfVect.get_feature_names())

# Indices of the linear SVM's coefficients in ascending order of weight.
clfsorted_coef_index = clf.coef_[0].argsort()

# Ten most negative and ten most positive terms for the linear SVM.
print('Smallest Coefs:\n{}\n'.format(clffeature_names[clfsorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(clffeature_names[clfsorted_coef_index[:-11:-1]]))

In [None]:
# Interactive loop: classify each line the user types with the n-gram model,
# stopping (without classifying) once the sentinel '0' is entered.
repeat = True
while repeat:
    string = input("Enter custom string:(0 to exit)")
    repeat = string != '0'
    if repeat:
        print(ngramModel.predict(ngramVect.transform([string])))