DATA : Customer review data of mobile phone devices purchased from Amazon's ecommerce platform.

MODEL: Logistic Regression, SVM,Multi-layer perceptron and one-dimensional CNN

METHODOLOGY: To classify the reviews as either positive or negative, a classifier is built. The text data is converted to vector form using bag-ofwords and TFIDF(Term Frequency Inverse Document Frequency) approach.

In [0]:
#import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#upload the dataset
from google.colab import files
uploaded = files.upload()

Saving Modified_Amazon_Unlocked_Mobile.csv to Modified_Amazon_Unlocked_Mobile.csv


In [3]:
#converting dictionary form of uploaded to a dataframe
import io
df = pd.read_csv(io.BytesIO(uploaded['Modified_Amazon_Unlocked_Mobile.csv']))
df.head()

Unnamed: 0.1,Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,0,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
1,1,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
2,2,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
3,3,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
4,4,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [4]:
# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3
df = df[df['Rating'] != 3]

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head()

Unnamed: 0.1,Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
1,1,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
2,2,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
3,3,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
4,4,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
5,5,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1


In [5]:
# Most ratings are positive
df['Positively Rated'].mean()

0.7471776686078667

In [0]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

In [7]:
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 Everything about it is awesome!


X_train shape:  (23052,)


In [0]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

In [9]:
vect.get_feature_names()[::2000]

['00',
 'arroja',
 'comapañias',
 'dvds',
 'golden',
 'lands',
 'oil',
 'razonable',
 'smallsliver',
 'tweak']

In [10]:
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<23052x19601 sparse matrix of type '<class 'numpy.int64'>'
	with 613289 stored elements in Compressed Sparse Row format>

In [11]:
#TFIDF model will be used to vectorize text data
#TF(w) = (Number of times term w appears in a document) / (Total number of terms in the document)
#IDF(w) = log_e(Total number of documents / Number of documents with term w in it)
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_tfidf = TfidfVectorizer()
vectorizer_tfidf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [12]:
len(vectorizer_tfidf.get_feature_names())

19601

In [13]:
X_train_vectorized_tfidf = vectorizer_tfidf.transform(X_train)
X_train_vectorized_tfidf

<23052x19601 sparse matrix of type '<class 'numpy.float64'>'
	with 613289 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.linear_model import LogisticRegression

# Train the model
clfr_logR = LogisticRegression(max_iter=500)
clfr_logR.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
from sklearn.metrics import roc_auc_score,accuracy_score

# Predict the transformed test documents
predictions = clfr_logR.predict(vect.transform(X_test))
bow_logR = roc_auc_score(y_test, predictions)
print('Accuracy for logistic regession BoW approach:',accuracy_score(y_test,predictions))
print('AUC: ', bow_logR)

Accuracy for logistic regession BoW approach: 0.9130774235523748
AUC:  0.9015745754687453


In [21]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = clfr_logR.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'slow' 'disappointed' 'worst' 'terrible' 'return' 'never' 'doesn'
 'horrible' 'waste']

Largest Coefs: 
['great' 'love' 'excellent' 'good' 'best' 'perfect' 'price' 'awesome'
 'far' 'perfectly']


In [22]:
#TFIDF vectorized dataset is given as input to logistic regression classifier
clfr_logR = LogisticRegression()
clfr_logR.fit(X_train_vectorized_tfidf, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# Predict the transformed test documents
predictions = clfr_logR.predict(vectorizer_tfidf.transform(X_test))
tfidf_logR = roc_auc_score(y_test, predictions)
print('Accuracy for logistic regession TFIDF approach:',accuracy_score(y_test,predictions))
print('AUC: ', tfidf_logR)

Accuracy for logistic regession TFIDF approach: 0.9301236174365647
AUC:  0.8890901979167192


In [0]:
#create a dictionary and store the AUC score
auc_dictionary = dict()
auc_dictionary['logR'] = [bow_logR,tfidf_logR]

SVM classifier with linear kernel is used to classify the review

In [25]:
from sklearn.svm import SVC

clfr_svc = SVC(kernel='linear')
clfr_svc.fit(X_train_vectorized,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [26]:
predictions = clfr_svc.predict(vect.transform(X_test))
bow_svc = roc_auc_score(y_test, predictions)
print('Accuracy for SVM classifier BoW approach:', accuracy_score(y_test, predictions))
print('AUC: ',bow_svc)

Accuracy for SVM classifier BoW approach: 0.92765126870527
AUC:  0.8975711995090037


In [29]:
clfr_svc = SVC(kernel='linear')
clfr_svc.fit(X_train_vectorized_tfidf,y_train)

predictions = clfr_svc.predict(vectorizer_tfidf.transform(X_test))
tfidf_svc = roc_auc_score(y_test, predictions)
print('Accuracy for SVM classifier TFIDF approach:', accuracy_score(y_test, predictions))
print('AUC: ',tfidf_svc)

Accuracy for SVM classifier TFIDF approach: 0.936629798308393
AUC:  0.910610412303114


In [0]:
auc_dictionary['svc'] = [bow_svc,tfidf_svc]

Deep learning models are used to predict the sentiment of the customer reviews

In [31]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import one_hot

Using TensorFlow backend.


In [0]:
reviews = []
for i in X_train:
  reviews.append(i)

In [33]:
len(reviews) #list of docs

23052

In [34]:
# integer encode the documents
vocab_size = 200
encoded_docs = [one_hot(d, vocab_size) for d in reviews]
print(encoded_docs[0])

[4, 92, 123, 158, 151]


In [35]:
# pad documents to a max length of 4 words
max_length = 4
padded_docs = sequence.pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs[:2])

[[ 92 123 158 151]
 [ 16 168  37  49]]


In [36]:
clfr_MLP = Sequential()
clfr_MLP.add(Embedding(vocab_size, 32, input_length=max_length))
clfr_MLP.add(Flatten())
clfr_MLP.add(Dense(250, activation='relu'))
clfr_MLP.add(Dense(1, activation='sigmoid'))
clfr_MLP.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
clfr_MLP.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 32)             6400      
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               32250     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
Total params: 38,901
Trainable params: 38,901
Non-trainable params: 0
_________________________________________________________________


In [37]:
#fit the model
clfr_MLP.fit(padded_docs, y_train, epochs=10, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fa7d7f34f28>

In [38]:
reviews_test = []
for i in X_test:
  reviews_test.append(i)

vocab_size = 200
encoded_docs_test = [one_hot(d, vocab_size) for d in reviews_test]
print(encoded_docs_test[:2])

[[16], [109, 21, 62, 63, 111]]


In [39]:
max_length = 4
test_docs = sequence.pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')
print(test_docs[:2])

[[ 16   0   0   0]
 [ 21  62  63 111]]


In [40]:
# evaluate the model
loss, accuracy = clfr_MLP.evaluate(test_docs, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 78.985035


One-dimensional CNN model

In [0]:
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [42]:
# create the model
clfr_CNN = Sequential()
clfr_CNN.add(Embedding(vocab_size, 32, input_length=max_length))
clfr_CNN.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
clfr_CNN.add(MaxPooling1D(pool_size=2))
clfr_CNN.add(Flatten())
clfr_CNN.add(Dense(250, activation='relu'))
clfr_CNN.add(Dense(1, activation='sigmoid'))
clfr_CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
clfr_CNN.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 32)             6400      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4, 32)             3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2, 32)             0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               16250     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 251       
Total params: 26,005
Trainable params: 26,005
Non-trainable params: 0
__________________________________________________

In [43]:
clfr_MLP.fit(padded_docs, y_train, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7fa7d6fc1b70>

In [44]:
# evaluate the CNN model on test dataset
loss, accuracy = clfr_CNN.evaluate(test_docs, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 50.943398
