## Import Libraries

In [1]:
# importing required libraries
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

## Read in the Dataset and Preview It

In [2]:
# Remote (S3) location of the reviews CSV that the rest of the notebook works from
REVIEWS_CSV_URL = 'https://data-bootcamp-x399.s3.us-east-2.amazonaws.com/Reviews.csv'
data = pd.read_csv(REVIEWS_CSV_URL)

In [3]:

# Preview the first five rows to see which columns were loaded
data.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Extract the Scores and Texts, Rename the Columns, and View Properties of the Data

In [4]:
# Keep only the star score and the review text; the other columns are not used below
wanted_columns = ["Score", "Text"]
df = data[wanted_columns]
df.head()

Unnamed: 0,Score,Text
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [5]:
# Give the two columns the names used for the remainder of the notebook
column_renames = {"Score": "Rating", "Text": "Review"}
reviews_df = df.rename(columns=column_renames)
reviews_df.head()

Unnamed: 0,Rating,Review
0,5,I have bought several of the Vitality canned d...
1,1,Product arrived labeled as Jumbo Salted Peanut...
2,4,This is a confection that has been around a fe...
3,2,If you are looking for the secret ingredient i...
4,5,Great taffy at a great price. There was a wid...


In [6]:
# (rows, columns) of the two-column reviews frame
reviews_df.shape

(568454, 2)

In [7]:
# Number of reviews per star rating (the classes are heavily imbalanced towards 5)
reviews_df["Rating"].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Rating, dtype: int64

## Split the Data into Training and Test Sets

In [8]:
# Hold out 20% of the rows for testing. Stratifying on Rating keeps the share of each
# star rating the same in the training and test sets; the fixed seed makes the split repeatable.
train, test = train_test_split(
    reviews_df,
    test_size=0.2,
    stratify=reviews_df["Rating"],
    random_state=0,
)

In [9]:
# (rows, columns) of the training split and of the test split
train.shape, test.shape

((454763, 2), (113691, 2))

In [10]:
# Share of each star rating in the training split
train["Rating"].value_counts(normalize=True)

5    0.638790
4    0.141885
1    0.091947
3    0.075010
2    0.052368
Name: Rating, dtype: float64

In [11]:
# Share of each star rating in the test split (should mirror the training split)
test["Rating"].value_counts(normalize=True)

5    0.638784
4    0.141885
1    0.091951
3    0.075010
2    0.052370
Name: Rating, dtype: float64

## Create a Term Frequency Inverse Document Frequency (TF-IDF) Vectorizer Object

In [12]:
# Build the TF-IDF vectorizer: lowercase the text, drop English stop words and keep
# at most 1000 terms (max_features).
# stop_words="english" selects scikit-learn's built-in English list (the same words as
# ENGLISH_STOP_WORDS). It is passed as a string because newer scikit-learn releases
# validate this parameter and reject a frozenset: only 'english', a list or None are accepted.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=1000, stop_words="english")

## Fit Object with Training Dataset of Reviews

In [13]:
# Fit the vectorizer on the training reviews only, so nothing is learned from the test split
tfidf_vectorizer.fit(train["Review"])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, sublinear_tf=False,
                token_pa

In [14]:
# Vectorize both splits with the vectorizer that was fitted on the training reviews
train_idf = tfidf_vectorizer.transform(train["Review"])
test_idf = tfidf_vectorizer.transform(test["Review"])

## Fitting and Prediction of the Logistic Regression Model

In [15]:
# Create the logistic regression model; max_iter=400 caps the number of solver iterations
classifier = LogisticRegression(max_iter=400)

In [16]:
# Train the classifier on the TF-IDF features of the training split and their star ratings
classifier.fit(X=train_idf, y=train["Rating"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=400,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# predict the label on the training dataset
predict_train = classifier.predict(train_idf)

In [18]:
# predict the label on the held-out test dataset
predict_test = classifier.predict(test_idf)

In [19]:
# Weighted F1 on the training split.
# Per class, F1 = 2 * (precision * recall) / (precision + recall); average="weighted"
# combines the per-class scores weighted by how many true samples each class has.
f1_score(train["Rating"], predict_train, average="weighted")

0.6609515386825814

In [20]:
# f1 score on test data
f1_score(y_true= test.Rating, y_pred= predict_test, average='weighted')

0.6574684352432125

## Set up Pipeline

In [21]:
# Define the stages of the pipeline: a TF-IDF vectorizer (lowercased text, English stop
# words removed, at most 1000 terms) feeding a logistic regression capped at 400 iterations.
# stop_words="english" selects scikit-learn's built-in English list (the same words as
# ENGLISH_STOP_WORDS). It is passed as a string because newer scikit-learn releases
# validate this parameter and reject a frozenset: only 'english', a list or None are accepted.
pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(lowercase=True, max_features=1000, stop_words="english")),
        ("model", LogisticRegression(max_iter=400)),
    ]
)

In [22]:

# Fit both pipeline stages on the raw training reviews and their star ratings
pipeline.fit(train["Review"], train["Rating"])

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=1000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterward...
                                 strip_accents=None, sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),


In [23]:
# Sanity check: the fitted pipeline accepts raw review text and returns star ratings
pipeline.predict(train["Review"])

array([5, 5, 4, ..., 5, 1, 1])

## Test the Pipeline with Actual Sample Reviews from Amazon

In [24]:
# First sample text, wrapped in a list because the pipeline expects an iterable of documents
review1 = [
    "Great taffy at a great price. There was a wide assortment of yummy taffy. Delivery was very quick. If your a taffy lover, this is a deal."
]

In [25]:
# Predicted star rating for the first sample text
pipeline.predict(review1)

array([5])

In [26]:
# Second sample text, wrapped in a list because the pipeline expects an iterable of documents
review2 = [
    "They don't. They're sent in those tear off plastic bags you find in supermarket produce sections. No padding, so they often arrived bruised. We stopped ordering them months ago for this reason :("
]

In [27]:
# Predicted star rating for the second sample text
pipeline.predict(review2)

array([1])

In [28]:
# Third sample text, wrapped in a list because the pipeline expects an iterable of documents
review3 = [
    "I'm not sure what you mean by 'what label' but the avocados I received yesterday, the label says grown in the USA.."
]

In [29]:
# Predicted star rating for the third sample text
pipeline.predict(review3)

array([1])

In [30]:
# Fourth sample text, wrapped in a list because the pipeline expects an iterable of documents
review4 = [
    "No, not until you receive the product. We've always received fresh products."
]

In [31]:
# Predicted star rating for the fourth sample text
pipeline.predict(review4)

array([5])

In [32]:
# Fifth sample text, wrapped in a list because the pipeline expects an iterable of documents
review5 = [
    "I could not understand why I was consuming vast quantities of water ( 3 liters) within one hour during the winter and without any recent exercise, until I recalled I consumed 2 oz. of this Applegate dry salami. Beware, especially if you are medically salt restricted. This Applegate item is adequate to send you to the ER for treatment of uncontrolled hypertention."
]

In [33]:
# Predicted star rating for the fifth sample text
pipeline.predict(review5)

array([1])

## Save the Pipeline Object Using the Dump Function

In [34]:
from joblib import dump

In [35]:
# Write the fitted pipeline (vectorizer + model) to amazon_reviews.joblib so it can be reloaded later
dump(pipeline, "amazon_reviews.joblib")

['amazon_reviews.joblib']

In [36]:
# Browse the reviews that were rated 5 stars
is_five_star = reviews_df["Rating"] == 5
reviews_df[is_five_star]

Unnamed: 0,Rating,Review
0,5,I have bought several of the Vitality canned d...
4,5,Great taffy at a great price. There was a wid...
6,5,This saltwater taffy had great flavors and was...
7,5,This taffy is so good. It is very soft and ch...
8,5,Right now I'm mostly just sprouting this so my...
...,...,...
568448,5,My only complaint is that there's so much of i...
568449,5,Great for sesame chicken..this is a good if no...
568451,5,"These stars are small, so you can give 10-15 o..."
568452,5,These are the BEST treats for training and rew...


In [37]:
# Browse the reviews that were rated 4 stars
is_four_star = reviews_df["Rating"] == 4
reviews_df[is_four_star]

Unnamed: 0,Rating,Review
2,4,This is a confection that has been around a fe...
5,4,I got a wild hair for taffy and ordered this f...
13,4,good flavor! these came securely packed... the...
27,4,I was so glad Amazon carried these batteries. ...
32,4,McCann's Instant Oatmeal is great if you must ...
...,...,...
568400,4,These are GREAT for carrying in my purse for e...
568406,4,This apple butter has a great taste but the pr...
568417,4,This is the best brand of Mango Chutney that I...
568420,4,Love the coconut flavor of this tea. Two thin...


In [38]:
# Browse the reviews that were rated 3 stars
is_three_star = reviews_df["Rating"] == 3
reviews_df[is_three_star]

Unnamed: 0,Rating,Review
45,3,This seems a little more wholesome than some o...
47,3,"The flavors are good. However, I do not see a..."
49,3,This is the same stuff you can buy at the big ...
53,3,we're used to spicy foods down here in south t...
60,3,Watch your prices with this. While the assort...
...,...,...
568369,3,"An admitted fan of Stash Earl Grey, neverthele..."
568379,3,I hadn't tried this taco seasoning prior to pu...
568394,3,I picked up these honey sticks because I'm in ...
568401,3,It's great to have agave in a portable format....


In [39]:
# Browse the reviews that were rated 2 stars
is_two_star = reviews_df["Rating"] == 2
reviews_df[is_two_star]

Unnamed: 0,Rating,Review
3,2,If you are looking for the secret ingredient i...
16,2,I love eating them and they are good for watch...
67,2,"I purchased the Mango flavor, and to me it doe..."
74,2,It is okay. I would not go out of my way to b...
110,2,I was diappointed in the flavor and texture of...
...,...,...
568416,2,"When I ordered this chutney, it was supposed t..."
568434,2,This soup is mostly broth. Although it has a k...
568435,2,"It is mostly broth, with the advertised 3/4 cu..."
568446,2,I had ordered some of these a few months back ...


In [40]:
# Browse the reviews that were rated 1 star
is_one_star = reviews_df["Rating"] == 1
reviews_df[is_one_star]

Unnamed: 0,Rating,Review
1,1,Product arrived labeled as Jumbo Salted Peanut...
12,1,My cats have been happily eating Felidae Plati...
26,1,"The candy is just red , No flavor . Just plan..."
50,1,"This oatmeal is not good. Its mushy, soft, I d..."
62,1,Arrived in 6 days and were so stale i could no...
...,...,...
568402,1,"I was disappointed in this product, as I had r..."
568426,1,"The candy is tasty, but they totally scam you ..."
568431,1,Definitely not worth buying flavored water wit...
568432,1,I thought this soup would be more like a chill...
