---

_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-text-mining/resources/d9pwm) course resource._

---

*Note: Some of the cells in this notebook are computationally expensive. To reduce runtime, this notebook is using a subset of the data.*

# Case Study: Sentiment Analysis

### Data Prep

In [1]:
import pandas as pd
import numpy as np


# Read in the data
df = pd.read_csv('Amazon_Unlocked_Mobile_SPA.csv')

# Sample the data to speed up computation
# Comment out this line to match with lecture
#df = df.sample(frac=0.1, random_state=10)

#
#
# This dataset contains reviews from users on productos sold by Amazon, particularly MOBILE PHONES ...

df.head(100)

The minimum supported version is 2.4.6



Unnamed: 0.1,Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Me siento tan AFORTUNADO de haber encontrado e...,1.0
1,1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"buen teléfono, buen grado de mi revista de pan...",0.0
2,2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Muy complacido,0.0
3,3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"Funciona bien, pero va lento a veces, pero es ...",0.0
4,4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Gran teléfono para reemplazar mi teléfono perd...,0.0
5,5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,Ya tenía un teléfono con problemas ... Sé que ...,1.0
6,6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,El puerto de carga estaba suelto. Lo solté. Lu...,0.0
7,7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,El teléfono se ve bien pero no se mantendrá ca...,0.0
8,8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Originalmente estaba usando el Samsung S2 Gala...,0.0
9,9,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,3,La duración de la batería es excelente. Es muy...,0.0


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 7 columns):
Unnamed: 0      413840 non-null int64
Product Name    413840 non-null object
Brand Name      348669 non-null object
Price           407907 non-null float64
Rating          413840 non-null int64
Reviews         413840 non-null object
Review Votes    401544 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 22.1+ MB


In [3]:
# 25% of the elements in the datase have a price under $ 79,990 
# 50%                                                  $ 144,71, that is 79.990 <= price < 144.71
# 75%                                                  $ 269,99, that is 144,71 <= price < 299,99
df.describe()[['Price','Rating']]

Unnamed: 0,Price,Rating
count,407907.0,413840.0
mean,226.867155,3.819578
std,273.006259,1.548216
min,1.73,1.0
25%,79.99,3.0
50%,144.71,5.0
75%,269.99,5.0
max,2598.0,5.0


In [4]:
# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3
df = df[df['Rating'] != 3]



In [5]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Me siento tan AFORTUNADO de haber encontrado e...,1.0
1,1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"buen teléfono, buen grado de mi revista de pan...",0.0
2,2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Muy complacido,0.0
3,3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"Funciona bien, pero va lento a veces, pero es ...",0.0
4,4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Gran teléfono para reemplazar mi teléfono perd...,0.0
5,5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,Ya tenía un teléfono con problemas ... Sé que ...,1.0
6,6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,El puerto de carga estaba suelto. Lo solté. Lu...,0.0
7,7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,El teléfono se ve bien pero no se mantendrá ca...,0.0
8,8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Originalmente estaba usando el Samsung S2 Gala...,0.0
11,11,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Este es un gran producto que vino después de d...,0.0


In [6]:
# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)

#
# np.where is going to set the value of 'Positively Rated' this way:  1 if 'Rating' > 3, otherwise 0
# this could also have been done like this:
#
# df['Positively Rated'] = df['Rating'].apply(lambda x: 1 if x > 3 else 0)
# 
# This new column will be our class or label ... a binary one by the way
#
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(50)

Unnamed: 0.1,Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
0,0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Me siento tan AFORTUNADO de haber encontrado e...,1.0,1
1,1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"buen teléfono, buen grado de mi revista de pan...",0.0,1
2,2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Muy complacido,0.0,1
3,3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"Funciona bien, pero va lento a veces, pero es ...",0.0,1
4,4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Gran teléfono para reemplazar mi teléfono perd...,0.0,1
5,5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,Ya tenía un teléfono con problemas ... Sé que ...,1.0,0
6,6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,El puerto de carga estaba suelto. Lo solté. Lu...,0.0,0
7,7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,El teléfono se ve bien pero no se mantendrá ca...,0.0,0
8,8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Originalmente estaba usando el Samsung S2 Gala...,0.0,1
11,11,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Este es un gran producto que vino después de d...,0.0,1


In [7]:
# Most ratings are positive ("Positively Rated" mean is around 0.75)
df.describe()[['Price','Rating', 'Positively Rated']]

Unnamed: 0,Price,Rating,Positively Rated
count,308321.0,308321.0,308321.0
mean,226.061859,3.894542,0.748253
std,285.051614,1.585494,0.434018
min,1.73,1.0,0.0
25%,77.99,2.0,0.0
50%,139.04,5.0,1.0
75%,269.1,5.0,1.0
max,2598.0,5.0,1.0


In [8]:
#
# Now, to carry out the analisys we want to do, is enough to work with just two columns: 'Reviews' and 'Positively Rated'
# which correspondigly contains the TEXTs (from what we will extract features) and the labels
# 

from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

In [9]:
X_train[0:10]

43929     Todas las funciones del teléfono funcionan bie...
360785                           ¡Gran vendedor y producto!
113459    En general, este teléfono está bien. Es muy ba...
314213           Excelente experiencia de compra y producto
318025    El teléfono es horrible. Recibí el teléfono el...
50501     Muy buen producto, Nieto está muy orgulloso de...
67392     ¡¡Debes comprar!! ¡Parece absolutamente nuevo!...
49936                                       A Son le gusta.
369161    El móvil viene con un enchufe y un software as...
152307                                            Excelente
Name: Reviews, dtype: object

In [10]:
y_train[0:10]

43929     0
360785    1
113459    1
314213    1
318025    0
50501     1
67392     1
49936     1
369161    0
152307    1
Name: Positively Rated, dtype: int64

In [11]:
print("X_train first entry:\n\n", X_train.iloc[0])
print("\n\nX_train shape: ", X_train.shape)
print("----------------------------------")
print("Y_train first entry:\n\n", y_train.iloc[0])
print("\n\nX_train shape: ", y_train.shape)


X_train first entry:

 Todas las funciones del teléfono funcionan bien, sin embargo, el Bluetooth y el inalámbrico están rotos. Intenté varios reinicios con el software así que debe ser un problema de hardware. Muy decepcionante. No vale la pena los $ 300 + pagué por ello.


X_train shape:  (231240,)
----------------------------------
Y_train first entry:

 0


X_train shape:  (231240,)


In [None]:
"""
   For the models we have studied so far, they work with numerical features, so we must transform the textual data.
   
   the "bag of words" model ignores the structure of the text and simply counts how many times each word appears
   in the text...
""" 


# CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

#
# From the text column of X_train, this tool (CountVectorizer) creates/produces  a vocabulary (unique tokens) 
# using the lowercased word of the text, and have two or more letters or numbers,

# Take a look at http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
#
#
# AT THE END WHAT WE ARE DOING HERE IS CREATING A VOCABULARY from all documents of X_train.
# EACH WORD WILL BE A COLUMN OR FEATURE for the model
# 
# Suppose we have a document a="THIS IS A  DOCUMENT VERY well written"
#                       and  b="BUT THIS SEEM TO BE A DOCUMENT not so well written"
#
# the vector will contain the following features:
#
# ['this', 'is', 'document', 'very', 'well', 'written', 'but', 'seem', 'to', 'be', 'not', 'so', 'well'] 
# 
# these words will be the colums or features of the model. Some documents will contain values in some
# of these columns (when the words are present in the document).
#
# This is the base of the BAG-OF-WORDS approach for Machine Learning with TEXT
#
# CountVectorizer is kind of TOKENIZATION of the input data (all rows of text) and building a vocabulary of
# unique tockens or words
vect = CountVectorizer().fit(X_train)

In [14]:
type(vect)

sklearn.feature_extraction.text.CountVectorizer

In [15]:
# Returns a list of features. the [::2000] at the end of the list
# produces the elements positioned in 0,2000,4000,6000,8000,10000 indexes. etc
#For example:
#
#a=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
#a[::2]
# will produce: [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]
#
#
# vect.get_feature_names()[40000::2000] will produce a list with elements at positions
# 40000 42000 44000 46000 48000 50000 52000
# that is:  ['responseive', 'send', 'sos', 'synch', 'trace', 'utiles', 'withstanding']
#
vect.get_feature_names()[::2000]

['00',
 '6ft',
 'advierte',
 'apex',
 'año',
 'básicamente',
 'clicadas',
 'contenían',
 'decodificadores',
 'detallees',
 'eficazmente',
 'escenario',
 'falsificaciones',
 'geocaché',
 'hound',
 'installcons',
 'latido',
 'masticar',
 'mura',
 'ooma',
 'phonedog',
 'prometer',
 'reconmedable',
 'reunir',
 'serviceable',
 'subcontratan',
 'tirara',
 'valdrá',
 'y2']

In [16]:
vect.get_feature_names()[40000::1000]

['phonedog',
 'practicidad',
 'prometer',
 'quisieras',
 'reconmedable',
 'renací',
 'reunir',
 'sacude',
 'serviceable',
 'sobreprecios',
 'subcontratan',
 'tambien',
 'tirara',
 'trouth',
 'valdrá',
 'volcado',
 'y2']

In [17]:
#
# There seems to be more that 55000 unique tokens ...
#
len(vect.get_feature_names())

56538

In [18]:
%%time
# Transform the documents in the training data to a document-term matrix
# this is the so-called bag_of_words representation of the training data
#
# what we have here is a matrix with 231207 instances or rows (all the documents)
# and 53216 columns (all the distinct words). The entries or cells in this matrix
# are the number of times a word (column j) appears in a document (column i)
#
# This is going to be a sparse matrix, given that the number of features (over 53000)
# is much larger than the words in each review, so many features in each document
# will have zero as their values.
#

X_train_vectorized = vect.transform(X_train)

X_train_vectorized

CPU times: user 14.8 s, sys: 84 ms, total: 14.9 s
Wall time: 14.8 s


In [19]:
%%time
#Now, we will train our model, in this case a Logistic Regressor, with the sparse matrix X_train vectorized
#(all documents with the number of times each word of the vocabulary appears in them), and 
#y_train (positively rated or not)

from sklearn.linear_model import LogisticRegression

# Train the model. Our model LEARNS from each document, and will be able to predict
# for a new document, if it's going to be rated POSITIVE OR NEGATIVE
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

CPU times: user 1min 27s, sys: 100 ms, total: 1min 27s
Wall time: 1min 27s


In [20]:
def print_roc_auc_score(clf,X_test,y_test):
    from sklearn.metrics import  auc,roc_curve
    y_proba = clf.predict_proba(X_test)
    
    #print(y_proba)
    fpr, tpr, _ = roc_curve(y_test, y_proba[:, 1])
    roc_auc = auc(fpr, tpr)
    print('AUC: {:.5f}'.format(roc_auc))

In [21]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents (the sparse matrix form of X_test)

predictions = model.predict(vect.transform(X_test))
print(predictions[0:20])
# A very good performance of about 93%. Remember we can use roc_auc_score as this is a BINARY CLASSIFIER
print('AUC: ', roc_auc_score(y_test, predictions))

[1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1]
AUC:  0.9327841162982874


In [22]:
X_test_vectorized=vect.transform(X_test)
print_roc_auc_score(model,X_test_vectorized,y_test)

AUC: 0.98038


In [23]:
# NICE BEHAVIOUR or FEATURE: We transform all these comments into its sparse matrix form, then see
# how the model performs
comments_transformed=vect.transform(['Estoy decepcionado con la compra',
                                    'El teléfono no está funcionando y apenas lo compre hace dos meses',
                                    'Excelente teléfono, vale mucho mas que su precio',
                                    'Maravilloso teléfono',
                                    'Absolutamente satisfecho con la compra',
                                    'realmente satisfecho',
                                    'Algo insatisfecho con el funcionamiento del teléfono',
                                    'Realmente feliz con la compra realizada'])


print("Predicted labels [Positive or Negative RATED]")
print(model.predict(comments_transformed))


Predicted labels [Positive or Negative RATED]
[0 0 1 1 1 1 0 1]


In [24]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()


#print(sorted_coef_index)

print("These are the ten lowest coeficients")
print("------------------------------------")
print(model.coef_[0][sorted_coef_index[0:10]])


print("These are the ten largest coeficients")
print("------------------------------------")
print(model.coef_[0][sorted_coef_index[:-11:-1]])
# Find the words with the 10 largest coefficients and the 10 smallest
# coefficients. 


print("These are the corresponding words")
print("---------------------------------")
#Here, sorted_coef_index[0:10] is a list with the indexes of the ten smallest
#coeficients, and sorted_coef_index[:-11:-1]] is the list with the ten largest

#So, feature_names[sorted_coef_index[:10]] will return a list of ten words
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))

#In reverse orden, starting at the end, thru the end-11 position, step -1
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

#As we can see, the model has associated words like horrible, messy, worthless with
#NEGATIVE reviews, and exellent, excelente, efficient, superthin with POSITIVE reviews


These are the ten lowest coeficients
------------------------------------
[-3.65070397 -3.52318355 -3.40779907 -3.3822025  -3.34613561 -3.18880818
 -3.15659871 -3.08752691 -3.06260543 -3.05148123]
These are the ten largest coeficients
------------------------------------
[5.14891419 3.91860868 3.89636295 3.71349409 3.71120508 3.61342099
 3.49934855 3.46234574 3.38545193 3.27800424]
These are the corresponding words
---------------------------------
Smallest Coefs:
['moriría' 'insatisfecho' 'dárselo' 'mony' 'peor' 'injusto' 'inútil' 'bad'
 'califica' 'horrible']

Largest Coefs: 
['quiéralo' 'asombroso' 'amo' 'estupendo' 'excelente' 'grandioso'
 'exelente' '4eeeks' 'ama' 'dejarse']


In [25]:
# For saving a model for future use, without having to train it again. We must
# save both the model and the trained CountVectorizer vector
import pickle
model_filename='LR-with-CountVectorizer-for-SentimentAnalisis.pkl'
pickle.dump(model, open(model_filename, 'wb'))
vector_filename = 'CountVectorizer-vector.pkl'
pickle.dump(vect, open(vector_filename, 'wb'))

In [26]:
"""Restoring the MODEL and the VECTOR"""
loaded_model = pickle.load(open('LR-with-CountVectorizer-for-SentimentAnalisis.pkl', 'rb'))
loaded_vector = pickle.load(open('CountVectorizer-vector.pkl', 'rb'))
print(type(loaded_model))
print(type(loaded_vector))

<class 'sklearn.linear_model.logistic.LogisticRegression'>
<class 'sklearn.feature_extraction.text.CountVectorizer'>


In [27]:
comments_transformed=loaded_vector.transform(['Estoy decepcionado con la compra',
                                    'El teléfono no está funcionando y apenas lo compre hace dos meses',
                                    'Excelente teléfono, vale mucho mas que su precio',
                                    'Maravilloso teléfono',
                                    'Absolutamente satisfecho con la compra',
                                    'realmente satisfecho',
                                    'Algo insatisfecho con el funcionamiento del teléfono',
                                    'Realmente feliz con la compra realizada'])


print("Predicted labels [Positive or Negative RATED]")
print(loaded_model.predict(comments_transformed))

Predicted labels [Positive or Negative RATED]
[0 0 1 1 1 1 0 1]


# Tfidf

In [None]:
#This approach is based on how important are the terms or word for the document

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifying a minimum document frequency of 5.
# That is, for a word to be part of the vocabulary, IT HAS TO APPEAR AT LEAST IN 5 DOCUMENTS.
# This param can also be specified for CountVectorizer. This means that words which appears
# in 4 documents or less, are not useful  predictors

vect = TfidfVectorizer(min_df=5).fit(X_train)
print(len(vect.get_feature_names()))

#As far as we have set min_df=5, the vector is almost three times shorter

In [None]:
print(vect.get_feature_names()[1000:1020])

In [None]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

#We didn't see any improvement in the AUC SCORE, BUT we used three times less features

In [None]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

In [None]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

In [None]:
# These reviews are treated the same by our current model
print(model.predict(vect.transform(['Estoy decepcionado con la compra',
                                    'El teléfono no está funcionando y apenas lo compre hace dos meses',
                                    'Excelente teléfono, vale mucho mas que su precio',
                                    'Maravilloso teléfono',
                                    'Absolutamente satisfecho con la compra',
                                    'realmente satisfecho',
                                    'Algo insatisfecho con el funcionamiento del teléfono',
                                    'Realmente feliz con la compra realizada'])))

# n-grams

In [None]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams up to 3-grams

#By adding bi-grams, CountVectorizer can extract features like "is working" and "not working", which
#have meanings completely different. We should be very carefull using this feature because the
#number of features can increase dramatically. (features are one-word, two-words amd three-words)

vect = CountVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

In [None]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

#OUR ROC_AUC has increased by 0.4

In [None]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

In [None]:
# These reviews are now correctly identified
#print(model.predict(vect.transform(['not an issue, phone is working',
#                                   'an issue, phone is not working'])))
print(model.predict(vect.transform(['no tiene problemas esta funcionando bien',
                                    'tengo problemas con el teléfono, no está funcionando',
                                    'buen teléfono',
                                    'maravilloso teléfono',
                                    'realmente satisfecho',
                                    'algo insatisfecho con el rendimiento del teléfono',
                                    'Feliz con la compra'])))
