### Import Requirements

In [40]:
import pandas as pd
import numpy as np
import gzip
import json
import re
import string
import random

import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aiju2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




### Data Exploration

In [2]:
# Load Office Product Amazon Reviews as pandasDF
def parse(path):
    g = gzip.open(path, "rb")
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient="index")

data_path = "../Office_Products_5.json.gz"
df = getDF(data_path)

In [3]:
df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,True,"11 7, 2017",A2NIJTYWADLK57,0140503528,{'Format:': ' Board book'},cotton clay,kids like story BUT while i really wanted a bo...,"good story, small size book though",1510012800,,
1,4.0,True,"03 7, 2017",A2827D8EEURMP4,0140503528,{'Format:': ' Hardcover'},emankcin,Bought this used and it came in great conditio...,Good,1488844800,,
2,5.0,True,"06 25, 2016",APB6087F4J09J,0140503528,{'Format:': ' Board book'},Starbucks Fan,Every story and book about Corduroy is Fantast...,Best Books for All Children,1466812800,,
3,5.0,True,"02 21, 2016",A2DHERRZIPFU7X,0140503528,{'Format:': ' Paperback'},Caitlyn Jacobson,I purchased this book for my first grade class...,Great for Math!,1456012800,,
4,5.0,False,"08 2, 2015",A2XCLJRGFANRC,0140503528,{'Format:': ' Hardcover'},E. Ervin,Having spent numerous years in an elementary s...,Love Corduroy,1438473600,,
...,...,...,...,...,...,...,...,...,...,...,...,...
800352,5.0,True,"09 9, 2017",ACHG5QICJCS1F,B01HHLLLIO,{'Size:': ' 6 Rolls'},Anky,Delivered on time and is as expected.,5 stars,1504915200,,
800353,5.0,True,"06 2, 2017",A1YB5J8SF6QX3L,B01HHLLLIO,{'Size:': ' 6 Rolls'},DM,worked great.,good stuff,1496361600,,
800354,5.0,True,"04 29, 2017",A3N4O7L4HBYJX4,B01HHLLLIO,{'Size:': ' 6 Rolls'},Verdant Treasures,I used to score free UPS 4x6 labels. The blac...,Couldn't be happier with these labels,1493424000,,
800355,5.0,True,"02 21, 2017",A2DR3CXSWWLYXY,B01HHLLLIO,{'Size:': ' 6 Rolls'},C. F.,great,Five Stars,1487635200,,


This Dataset is an updated version of the Amazon review dataset released in 2014. As in the previous version, this dataset includes reviews (ratings, text, helpfulness votes), product metadata (descriptions, category information, price, brand, and image features), and links (also viewed/also bought graphs). In addition, this version provides the following features:

More reviews:
The total number of reviews is 233.1 million (142.8 million in 2014).
Newer reviews:
Current data includes reviews in the range May 1996 - Oct 2018.
Metadata:
We have added transaction metadata for each review shown on the review page. Such information includes:
Product information, e.g. color (white or black), size (large or small), package type (hardcover or electronics), etc.
Product images that are taken after the user received the product.
Added more detailed metadata of the product landing page. Such detailed information includes:
Bullet-point descriptions under product title.
Technical details table (attribute-value pairs).
Similar products table.
More categories:
Includes 5 new product categories.

Source: https://nijianmo.github.io/amazon/index.html#code

### Pre-processing

In [4]:
# a. Randomly select 500-1000 reviews from your dataset and perform steps b through d.
n_samples = random.randint(500, 1000)
review_samples = df.sample(n=n_samples)
review_samples

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
746104,5.0,True,"01 11, 2018",A2O5190IGXSN6C,B00006ICG0,{'Size:': ' 1 PACK'},Penny,Works Great! Sturdy. Nice to get something sub...,Five Stars,1515628800,,
432160,5.0,True,"02 25, 2015",A25IGRP8M03U1B,B0058M2YLS,{'Size:': ' 1 1/2&quot; x 1 1/2&quot;'},Francisco Coello,great!,Five Stars,1424822400,,
242727,5.0,True,"04 28, 2017",AN4TWQS0LRAH3,B001603YXI,"{'Color:': ' ss-cc-black', 'Package Type:': ' ...",Mags,"I've always loved these pens, they write well ...",I've always loved these pens,1493337600,,
12930,5.0,True,"10 20, 2008",A3ARLFYMW4QNNX,B00002NDRT,,Amazon Customer,I HAVE USED THIS PRODUCT FOR YEARS AND IT WAS ...,GREAT PRODUCT FOR ORGANIZING YOUR LIFE,1224460800,,
578692,1.0,False,"03 8, 2017",A6WILXTKP77L0,B00JXLGESY,{'Style:': ' Printer'},J E T III,"I am never, ever, buying anything with the nam...",it burns through ink like an arsonist through ...,1488931200,2,
...,...,...,...,...,...,...,...,...,...,...,...,...
290261,3.0,True,"03 15, 2013",A1TGX98768Z3Y2,B0016004PE,{'Size:': ' Each'},SouthernJill,"These are nice, large sized and professional l...",Nice product,1363305600,,
575671,5.0,True,"06 15, 2016",A393PHX1YCW0B,B00JKMVI50,{'Color:': ' Red'},Alexander P.,I was looking for some inexpensive gifts for a...,Exquisite for such an inexpensive pen.,1465948800,3,
221840,5.0,True,"07 8, 2016",ANLKE2XTH3A8E,B000ZLYNH0,,Jen O.,Ordered this set of Sharpies after purchasing ...,Sharpie fun with a click..,1467936000,,
448269,5.0,True,"10 2, 2013",A20NIKEGI7H7G6,B005VC8CG6,{'Style Name:': ' Canon CL-241XL Color Ink Car...,Guy,These cartridges print nice and seem to have a...,Canon Ink,1380672000,,


In [5]:
#b. Label your data based on the value of “rating of the product” i.e. as follows:
    #i. Ratings 4,5: Positive
    #ii. Rating 3: Neutral
    #iii. Ratings 1,2: Negative

# Create new column
review_samples.insert(1, "sentiment", np.nan)
review_samples

Unnamed: 0,overall,sentiment,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
746104,5.0,,True,"01 11, 2018",A2O5190IGXSN6C,B00006ICG0,{'Size:': ' 1 PACK'},Penny,Works Great! Sturdy. Nice to get something sub...,Five Stars,1515628800,,
432160,5.0,,True,"02 25, 2015",A25IGRP8M03U1B,B0058M2YLS,{'Size:': ' 1 1/2&quot; x 1 1/2&quot;'},Francisco Coello,great!,Five Stars,1424822400,,
242727,5.0,,True,"04 28, 2017",AN4TWQS0LRAH3,B001603YXI,"{'Color:': ' ss-cc-black', 'Package Type:': ' ...",Mags,"I've always loved these pens, they write well ...",I've always loved these pens,1493337600,,
12930,5.0,,True,"10 20, 2008",A3ARLFYMW4QNNX,B00002NDRT,,Amazon Customer,I HAVE USED THIS PRODUCT FOR YEARS AND IT WAS ...,GREAT PRODUCT FOR ORGANIZING YOUR LIFE,1224460800,,
578692,1.0,,False,"03 8, 2017",A6WILXTKP77L0,B00JXLGESY,{'Style:': ' Printer'},J E T III,"I am never, ever, buying anything with the nam...",it burns through ink like an arsonist through ...,1488931200,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
290261,3.0,,True,"03 15, 2013",A1TGX98768Z3Y2,B0016004PE,{'Size:': ' Each'},SouthernJill,"These are nice, large sized and professional l...",Nice product,1363305600,,
575671,5.0,,True,"06 15, 2016",A393PHX1YCW0B,B00JKMVI50,{'Color:': ' Red'},Alexander P.,I was looking for some inexpensive gifts for a...,Exquisite for such an inexpensive pen.,1465948800,3,
221840,5.0,,True,"07 8, 2016",ANLKE2XTH3A8E,B000ZLYNH0,,Jen O.,Ordered this set of Sharpies after purchasing ...,Sharpie fun with a click..,1467936000,,
448269,5.0,,True,"10 2, 2013",A20NIKEGI7H7G6,B005VC8CG6,{'Style Name:': ' Canon CL-241XL Color Ink Car...,Guy,These cartridges print nice and seem to have a...,Canon Ink,1380672000,,


In [6]:
# Label the sentimental values
for index, row in review_samples.iterrows():
    #i. Ratings 4,5: Positive
    if (row.overall >= 4.):
        print(index, "positive")
        review_samples.loc[index, ["sentiment"]] = "positive"
    #ii. Rating 3: Neutral
    elif (row.overall < 4. or row.overall > 2.):
        print(index, "neutral")
        review_samples.loc[index, ["sentiment"]] = "neutral"
    #iii. Ratings 1,2: Negative
    elif (row.overall <= 2.):
        print(index, "negative")
        review_samples.loc[index, ["sentiment"]] = "negative"

746104 positive
432160 positive
242727 positive
12930 positive
578692 neutral
519892 positive
182444 positive
145236 positive
465863 positive
140470 positive
220362 positive
219974 positive
197482 positive
526602 neutral
638165 positive
6322 positive
241343 positive
268137 positive
213302 positive
766348 positive
305261 positive
187242 positive
734783 positive
296188 neutral
443546 positive
401983 positive
674472 positive
279640 positive
568888 neutral
234535 positive
382778 positive
459791 positive
752038 positive
512143 positive
391101 positive
418713 positive
512178 positive
718649 positive
432263 positive
81602 positive
324572 positive
632236 positive
362453 positive
714356 positive
132602 positive
718798 positive
49447 positive
143205 positive
252033 positive
425845 positive
424982 positive
107894 positive
268090 positive
421289 positive
324330 positive
203341 positive
605925 positive
173005 positive
29286 positive
773535 positive
52645 positive
137786 positive
654631 positive
684

302846 positive
130474 positive
87948 positive
441978 positive
285309 positive
252354 positive
227961 positive
566316 positive
218942 positive
537280 positive
312186 positive
437034 positive
36936 positive
365075 positive
500886 positive
253742 positive
288348 positive
770328 positive
142308 neutral
420160 positive
341640 positive
700039 positive
363104 positive
295308 positive
569993 positive
389066 positive
568108 positive
787349 positive
417358 positive
430173 neutral
352788 positive
247702 positive
102385 positive
184010 positive
270231 positive
206710 positive
156685 positive
116781 positive
418009 positive
389758 neutral
687738 positive
621461 positive
522419 neutral
354082 positive
500544 positive
47022 positive
337074 positive
540343 neutral
132008 positive
738543 positive
346550 positive
588242 positive
602147 positive
236838 neutral
30965 positive
157183 positive
628167 positive
742222 positive
745763 neutral
603669 positive
641344 positive
726475 positive
695966 positive
511

In [7]:
review_samples["sentiment"].isnull().any()

False

In [8]:
review_samples["reviewText"].isnull().any()

False

In [9]:
#c. Chose the appropriate columns for your sentiment analyzer. (Give this some thought)
sent_col = "sentiment"
review_samples[sent_col]

746104    positive
432160    positive
242727    positive
12930     positive
578692     neutral
            ...   
290261     neutral
575671    positive
221840    positive
448269    positive
479055    positive
Name: sentiment, Length: 812, dtype: object

#### Pre-process Text

In [38]:
stopwords = nltk.corpus.stopwords.words("english")

for i, text in enumerate(review_samples["reviewText"]):
    # Lowercasing
    text = text.lower()
    # Remove Digits
    text = re.sub(r'\d+','', text)
    # Remove Punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    text = " ".join([token for token in text.split() if (token not in stopwords)])
    # Remove trailing whitespaces
    text = ' '.join([token for token in text.split()])
    # Tokenization
    text = word_tokenize(text)    
    
    # Save to DataFrame
    review_samples["reviewText"].iloc[i] = text
    print(review_samples["reviewText"].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_samples["reviewText"].iloc[i] = text


['works', 'great', 'sturdy', 'nice', 'get', 'something', 'substantial', 'money']
['great']
['ive', 'always', 'loved', 'pens', 'write', 'well', 'feel', 'great', 'hand', 'good', 'quality', 'stainless', 'steel', 'pens', 'else', 'could', 'ask']
['used', 'product', 'years', 'great', 'getting', 'discount', 'vendor', 'service', 'prompt']
['never', 'ever', 'buying', 'anything', 'name', 'epson', 'attached', 'know', 'finally', 'get', 'thing', 'print', 'even', 'somewhat', 'correctly', 'burns', 'ink', 'like', 'arsonist', 'gasoline', 'also', 'reason', 'uses', 'color', 'print', 'black', 'white', 'printer', 'decides', 'one', 'colors', 'low', 'can', 'not', 'print', 'dreadful', 'customer', 'service', 'support', 'page', 'clearly', 'designed', 'convenience', 'customers', 'never', 'buy', 'epson']
['rings', 'dont', 'stay', 'well', 'closed', 'wish', 'better', 'binders']
['love', 'pens', 'use', 'mostly', 'planner']
['labels', 'great', 'ship', 'second', 'time', 'purchase', 'regret', 'one', 'bit', 'go', 'great

['works', 'could', 'louder']
['awesome', 'ring', 'hard', 'find', 'stores', 'getting']
['nice', 'little', 'box', 'use', 'store', 'medicine', 'pill', 'bottles', 'box', 'better', 'quality', 'expecting', 'based', 'stock', 'photos', 'got', 'black', 'black', 'color', 'looks', 'great', 'even', 'though', 'came', 'scuffs', 'combination', 'easy', 'set', 'problems', 'box', 'pretty', 'small', 'dont', 'expect', 'fit', 'anything', 'big', 'inside', 'big', 'enough', 'fit', 'medium', 'sized', 'hand']
['dont', 'know', 'paper', 'printer', 'epson', 'photo', 'stylus', 'seems', 'like', 'pics', 'easily', 'scratched', 'compared', 'premium', 'presentation', 'paper', 'matte', 'ive', 'used', 'past']
['great', 'product']
['love']
['removes', 'walls', 'without', 'damaging', 'paint', 'strong', 'well']
['nice', 'way', 'keep', 'bulletin', 'borders', 'organized', 'wrinklefree']
['grandkids', 'loveem']
['nice', 'envelopes', 'easiertouse', 'sticky', 'tab', 'versus', 'complicated', 'clasp', 'design', 'item', 'good', 'pri

['works']
['mouse', 'pad', 'functions', 'utilize', 'adhesive']
['works', 'supposed', 'using', 'many', 'years', 'issues', 'far', 'epson', 'ribbon']
['product', 'described', 'arrived', 'good', 'condition', 'works', 'designed']
['expected', 'good', 'magnet']
['carpal', 'tunnel', 'really', 'bad', 'using', 'mouse', 'really', 'make', 'fingers', 'go', 'numb', 'something', 'angled', 'seems', 'help', 'micky', 'mouse', 'wrist', 'pad', 'though', 'theres', 'nothing', 'put', 'wrist', 'onexcept', 'hard', 'edge', 'love', 'concept', 'need', 'add', 'soft', 'wrist', 'pad']
['applicable']
['really', 'snap', 'set', 'easier', 'ever', 'login', 'using', 'google', 'username', 'password', 'make', 'google', 'voice', 'default', 'added', 'service', 'dollars', 'year', 'nothing', 'else', 'pay', 'taxes', 'service', 'fees', 'long', 'call', 'united', 'states', 'canada', 'wont', 'costs', 'one', 'penny']
['wife', 'landscape', 'design', 'worked', 'well']
['visited', 'many', 'office', 'supply', 'stores', 'checked', 'type'

['work', 'blame', 'seller', 'tried', 'past', 'use', 'remanufactured', 'cartridges', 'old', 'printer', 'never', 'work', 'error', 'problem']
['great']
['well', 'made', 'comfortable']
['worked', 'great', 'needed']
['shredder', 'couple', 'months', 'issues', 'happy']
['works', 'great']
['purchased', 'without', 'problems', 'time', 'package', 'paper', 'torn', 'open', 'ends', 'sealed', 'amazon', 'tape', 'would', 'okay', 'except', 'third', 'papers', 'damaged', 'since', 'obviously', 'presentation', 'paper', 'cant', 'use', 'damaged', 'worth', 'trouble', 'return', 'need', 'print', 'items', 'today', 'disappointing']
['horrible', 'wont', 'stay', 'wall', 'returned']
['love', 'pens', 'im', 'glad', 'package', 'use', 'alot', 'recommend', 'adult', 'coloring']
['exactly', 'wanted']
['surprised', 'good', 'quality']
['great', 'items', 'expected']
['perfect', 'organizer']
['like', 'dont', 'love', 'yes', 'hold', 'prisma', 'colored', 'pencils', 'varying', 'size', 'okay', 'mine', 'slips', 'around', 'could', 'de

['rd', 'set', 'casters', 'office', 'chair', 'time', 'casters', 'fail', 'choose', 'another', 'brand', 'seems', 'get', 'months', 'set', 'let', 'know', 'set', 'lasts']
['pocket', 'charts', 'lot', 'fun', 'kids', 'since', 'get', 'take', 'handson', 'role', 'education', 'sits', 'nicely', 'tabletop', 'easily', 'folds', 'storage', 'thin', 'enough', 'put', 'behind', 'bookcase', 'wall', 'cant', 'see', 'stock', 'picture', 'theres', 'large', 'storage', 'pocket', 'back', 'holds', 'sentence', 'cards', 'use', 'itll', 'add', 'little', 'bulk', 'chart', 'youre', 'given', 'individual', 'word', 'cards', 'sentence', 'cards', 'fill', 'blank', 'spaces', 'gives', 'lot', 'freedom', 'modify', 'sentences', 'like', 'easy', 'sentences', 'preschooler', 'sound', 'harder', 'sentences', 'geared', 'older', 'child', 'work', 'reading', 'improves', 'also', 'fun', 'try', 'words', 'make', 'silly', 'sentences', 'since', 'theres', 'great', 'variety', 'interchangeable', 'cards', 'tabletop', 'version', 'would', 'work', 'best', '

['needed', 'certain', 'size', 'fit', 'like', 'keep', 'desk', 'shreds', 'fine', 'nice', 'quiet', 'wish', 'held', 'sheets', 'time', 'otherwise', 'great']
['business', 'card', 'size', 'blanks', 'inexpensively', 'great', 'business', 'person', 'making', 'one', 'hundred', 'grand', 'per', 'year', 'high', 'school', 'college', 'student', 'wants', 'introduce', 'sorority', 'freshmen', 'woman', 'new', 'house', 'cleaning', 'business', 'friends', 'neighbors', 'ink', 'jet', 'print', 'easily', 'sides', 'split', 'apart', 'without', 'tearing', 'print', 'ten', 'twenty', 'time', 'run', 'full', 'thousand', 'computer', 'printer', 'bw', 'color', 'true', 'get', 'pay', 'also', 'true', 'product', 'one', 'things', 'worth', 'price']
['keeps', 'breakables', 'safe', 'moves']
['thanks', 'worked', 'great']
['looked', 'great', 'holds', 'papers', 'desk']
['hate', 'cheap', 'plastic', 'dispensers', 'often', 'get', 'packing', 'tape', 'well', 'made', 'heavy', 'dispenser', 'makes', 'packaging', 'sooo', 'much', 'easier']
['p

['exceeded', 'expectations', 'colors', 'great', 'shipping', 'fast', 'bright', 'colors', 'help', 'make', 'newsletters', 'stand', 'crowd', 'papers', 'much', 'better', 'response', 'using', 'bright', 'color']
['best', 'generic', 'xl', 'opinion', 'cheapest', 'option', 'however', 'best', 'value', 'bought', 'every', 'different', 'variation', 'office', 'smart', 'best', 'nondymo', 'brand', 'rolls', 'major', 'issues', 'recurring', 'issue', 'brand', 'sometimes', 'eye', 'hole', 'tell', 'paper', 'stop', 'isnt', 'covered', 'accident', 'piece', 'label', 'causing', 'extra', 'labels', 'feed', 'often', 'still', 'saves', 'tons', 'compared', 'dymo', 'brand', 'issue']
['worked', 'well']
['already', 'complained', 'epson', 'wifi', 'setup', 'easy', 'claim', 'two', 'side', 'printing', 'manual', 'number', 'pad', 'nightmare', 'use', 'user', 'manual', 'skinny', 'useless', 'help', 'user', 'manual', 'use', 'claimed', 'features', 'big', 'disappointment']
['third', 'time', 'byer']
['exactly', 'needed', 'arrived', 'qu

['nice', 'price', 'shipping']
['expensive', 'plus', 'printer', 'great']
['pilot', 'g', 'pens', 'perfect', 'buying', 'years', 'red', 'blue', 'black', 'colors', 'pens', 'seem', 'flow', 'freely', 'paper', 'hate', 'lending', 'people', 'tend', 'want', 'keep', 'buck', 'costly', 'replace', 'definite', 'great', 'buy', 'something', 'keep', 'getting', 'needed', 'highly', 'recommend', 'pens', 'theyre', 'like', 'perfect', 'woman', 'shes', 'really', 'perfect', 'ones', 'perfect']
['really', 'like', 'using', 'pens', 'hadnt', 'tried', 'purchasing', 'amazon', 'serve', 'various', 'functions', 'daily', 'life', 'home', 'work', 'setting', 'reliable', 'pens', 'bleed', 'pages', 'planner', 'sticky', 'notes', 'important', 'papers', 'work', 'etc', 'im', 'pleased', 'repurchase', 'however', 'probably', 'wont', 'use', 'taking', 'notes', 'class', 'prefer', 'ballpoint', 'pen', 'writing', 'quickly', 'circumstance', 'pens', 'look', 'professional', 'ballpoint', 'pens', 'different', 'equally', 'valuable', 'use', 'lifest

['great', 'deal', 'awesome']
['work', 'great', 'issues', 'nice', 'quality', 'lamination', 'sheets']
['would', 'get', 'five', 'stars', 'brother', 'didnt', 'start', 'telling', 'cartridge', 'empty', 'third', 'way', 'comments', 'explain', 'go', 'ahead', 'reset', 'thing', 'work', 'fine', 'long', 'long', 'time', 'much', 'longer', 'printer', 'tell', 'pain', 'every', 'otherwise', 'works', 'pretty', 'flawlessly']
['love', 'quiet', 'got', 'job', 'done']
['notebook', 'works', 'write', 'stuff']
['cant', 'enough', 'colors', 'like', 'ultra', 'fine', 'tips', 'would', 'like', 'sharpie', 'made', 'tips', 'thinner', 'fine', 'thicker', 'ultra', 'fine']
['floral', 'design', 'cover', 'card', 'actually', 'lattice', 'glued', 'card', 'came', 'clear', 'protective', 'sleeve', 'partially', 'removed', 'feel', 'lattice', 'unable', 'slide', 'sleeve', 'back', 'beware', 'careful', 'protective', 'sleeve', 'front', 'card', 'easy', 'bend', 'part', 'card', 'trying', 'slide', 'back', 'designwise', 'nice', 'card', 'special'

['parker', 'great', 'ink']
['got', 'purple', 'collection', 'blue', 'different', 'hue', 'needed', 'highlighters', 'color', 'differently', 'regular', 'sharpies', 'excuse', 'good', 'enough', 'reason', 'add', 'abundance', 'coloring', 'stuff', 'tried', 'laying', 'regular', 'colored', 'sharpie', 'letting', 'fully', 'dry', 'laying', 'highlighter', 'color']
['described', 'prompt', 'delivery', 'would', 'buy', 'aaa']
['ive', 'purchased', 'several', 'different', 'types', 'sakura', 'pens', 'daughter', 'enjoys', 'different', 'characteristics', 'flat', 'textures', 'others', 'sparkle', 'others', 'metallic', 'etc', 'uses', 'craft', 'projects', 'drawings', 'letters', 'etc', 'theyve', 'lasted', 'long', 'time', 'none', 'dried', 'skipped', 'great', 'pens']
['good', 'price']
['ive', 'tried', 'carpet', 'rollers', 'medical', 'stool', 'office', 'chair', 'workroll', 'easily', 'original', 'casters', 'came', 'chairs', 'large', 'enough', 'make', 'difference', 'rolling', 'carpet', 'waste']
['works', 'promised']
['

['really', 'love', 'pencils', 'use', 'everyday', 'writing', 'purchased', 'box', 'b', 'hardness', 'right', 'sharpen', 'nicely', 'also', 'nicely', 'boxed', 'believe', 'would', 'make', 'nice', 'gift', 'pencil', 'nut', 'like', 'lightweight', 'hand', 'assume', 'eraser', 'give', 'weight', 'favorite', 'pencils', 'blackwing', 'line', 'course', 'hope', 'review', 'helpful', 'pencil', 'lovers']
['comes', 'different', 'sized', 'laminating', 'pouches', 'used', 'laminating', 'x', 'pictures', 'charity', 'volunteer', 'everything', 'came', 'beautifully', 'laminating', 'pouches', 'mm', 'size', 'accommodate', 'mm', 'size', 'try', 'future', 'thank', 'purple', 'cows', 'great', 'machine', 'thank', 'amazon', 'offering', 'product']
['avery', 'good', 'product', 'item', 'buy', 'works', 'great', 'especially', 'coordinate', 'project', 'avery', 'online', 'print', 'index']
['needed']
['ink', 'essentially', 'supposed', 'print', 'awesome', 'images', 'start', 'awesome', 'digital', 'files', 'never', 'issue', 'particula

['great']
['small']
['amazing', 'great', 'artists', 'decorate', 'almost', 'surface', 'think', 'embellishments', 'leather', 'bag', 'canvas', 'leather', 'shoes', 'possibilities', 'endless', 'tips', 'great', 'control', 'cant', 'get', 'easy', 'get', 'nice', 'consistent', 'line', 'oh', 'specifically', 'looking', 'burgandy', 'color', 'got', 'lucky', 'set', 'great', 'colorful', 'zentangles']
['always', 'love', 'replacement', 'cartridges', 'price', 'amazing', 'compared', 'local', 'stores', 'dont', 'last', 'quite', 'long', 'brand', 'names', 'problem', 'price', 'using', 'years']
['great', 'quality', 'price']
['three', 'printers', 'use', 'regular', 'print', 'jobs', 'postage', 'print', 'envelopes', 'works', 'fine', 'replacement', 'toners', 'used', 'pretty', 'expensive', 'printer', 'released', 'replacement', 'toner', 'overall', 'quality', 'printer']
['works', 'great', 'looks', 'good', 'advertised']
['always', 'happy', 'avery']
['im', 'high', 'maintenance', 'comes', 'pens', 'theyre', 'easy', 'hold',

['excellent']
['expected']
['works', 'larger', 'rubberband', 'guns', 'rubberband', 'gun', 'ammo', 'works', 'better']
['perfect']
['great', 'sharpie', 'markers', 'happy', 'colors']
['works', 'expected', 'good', 'quality']
['things', 'could', 'little', 'longer', 'opinion', 'look', 'longer', 'picture', 'looking', 'tie', 'together', 'two', 'ends', 'zipper', 'suitcase', 'things', 'barely', 'made', 'gap', 'seems', 'solidly', 'made', 'snaps', 'together', 'really', 'well']
['affordable', 'prices', 'fast', 'shipping', 'works', 'great', 'bringing', 'groceries', 'house']
['perfect']
['great', 'lifesaver', 'youre', 'like', 'constantly', 'trying', 'remember', 'put', 'ruler', 'last', 'project', 'limited', 'omnigrid', 'fit', 'standard', 'rulers']
['great']
['actually', 'bought', 'send', 'school', 'kindergartener', 'list', 'best', 'price', 'could', 'find', 'complaints', 'brand', 'teachers', 'always', 'request', 'reliable']
['worked', 'better', 'thought', 'sturdy', 'seem', 'ruin', 'pages']
['little', '

['exactly', 'advertised', 'shipped', 'rapidly']
['ive', 'used', 'fujitsu', 'scansnap', 'months', 'scanner', 'compare', 'kodak', 'small', 'home', 'office', 'use', 'scansnap', 'tested', 'kodak', 'business', 'personal', 'use', 'kodak', 'scanner', 'speed', 'really', 'churns', 'images', 'scan', 'lot', 'every', 'day', 'may', 'good', 'buy', 'dont', 'mind', 'lack', 'good', 'userfriendly', 'software', 'comes', 'software', 'plus', 'thirdparty', 'software', 'paperport', 'omnipage', 'none', 'measure', 'intuitive', 'userfriendly', 'software', 'fujitsu', 'scansnap', 'tweaked', 'kodak', 'settings', 'hours', 'still', 'performing', 'well', 'scansnap', 'box', 'example', 'scansnap', 'automatically', 'adjusts', 'size', 'receipt', 'scans', 'saves', 'whats', 'needed', 'even', 'tweaking', 'kodak', 'doesnt', 'scansnap', 'organizer', 'software', 'much', 'intuitive', 'kodak', 'software', 'making', 'saving', 'filing', 'significantly', 'easier', 'scanning', 'half', 'battle', 'able', 'find', 'later', 'important', 

['use', 'daily', 'homework', 'worth', 'price']
['looking', 'lot', 'products', 'tilt', 'hp', 'laptop', 'finally', 'settled', 'one', 'wasnt', 'disappointed', 'near', 'perfect', 'light', 'weight', 'keeps', 'laptop', 'cool', 'day', 'lots', 'angle', 'adjustments', 'fits', 'particular', 'laptop', 'beautifully', 'although', 'follow', 'advice', 'previous', 'buyer', 'put', 'cardboard', 'strips', 'come', 'packaging', 'front', 'stand', 'slightly', 'raise', 'laptop', 'level', 'stands', 'two', 'front', 'tabs', 'make', 'much', 'comfortable', 'rest', 'hands', 'laptop']
['pictured', 'cute']
['always', 'expo', 'never', 'wrong', 'love', 'product', 'great', 'quality']
['husband', 'moved', 'new', 'home', 'last', 'year', 'extra', 'living', 'room', 'able', 'use', 'office', 'bought', 'new', 'office', 'furniture', 'mission', 'get', 'everything', 'neat', 'organized', 'could', 'desk', 'bought', 'nice', 'file', 'cabinet', 'built', 'needed', 'file', 'holders', 'put', 'similar', 'file', 'dividers', 'office', 'work

['im', 'artist', 'lot', 'pencil', 'work', 'little', 'eraser', 'lets', 'precision', 'erasing', 'without', 'disturbing', 'rest', 'drawing', 'great']
['nothing', 'takes', 'back', 'elementary', 'school', 'smelly', 'markers', 'classic', 'scents', 'remember', 'watermelon', 'cherry', 'nasty', 'black', 'licorice']
['love', 'love', 'theyre', 'made', 'america', 'soft', 'cover', 'perfect', 'notetaking', 'writing', 'etc', 'highly', 'recommended']
['datahookproductlinklinked', 'classalinknormal', 'hrefeasyreadregistercheckbooktransactionregisterscalendarspackofdpbyqcgkrefcmcrarpdrvwtxtieutfeasy', 'read', 'register', 'checkbook', 'transaction', 'registers', 'calendars', 'pack', 'write', 'checks', 'month', 'using', 'debit', 'card', 'still', 'need', 'place', 'write', 'transactions', 'keep', 'much', 'money', 'spending', 'constantly', 'running', 'checkbook', 'registers', 'annoying', 'decided', 'purchase', 'get', 'one', 'register', 'order', 'checks', 'received', 'quickly', 'fit', 'check', 'book', 'perfec

['regular', 'size', 'ones']
['good', 'ink']
['product', 'arrived', 'wrapped', 'thin', 'clear', 'plastic', 'wrapping', 'colorful', 'packaging', 'boxes', 'labels', 'appreciate', 'cost', 'cutting', 'efforts', 'pass', 'savings', 'consumers', 'edges', 'papers', 'wrinkledbentcrushed', 'due', 'lack', 'solid', 'box', 'thick', 'plastic', 'hold', 'edges', 'luckily', 'minor', 'problem', 'impact', 'performance', 'product', 'labels', 'printed', 'clearly', 'easy', 'apply', 'havent', 'found', 'surface', 'didnt', 'stick', 'however', 'seem', 'ever', 'slightly', 'less', 'sticky', 'major', 'name', 'brand', 'applying', 'plastic', 'bags', 'strongly', 'recommended']
['quick', 'frugal', 'printer', 'easy', 'setup', 'works', 'without', 'issues', 'surprised', 'fast', 'first', 'page', 'spits', 'good', 'quality', 'print']
['great', 'paper', 'used', 'kind', 'printer', 'time', 'paper', 'dont', 'get', 'hung', 'machine', 'printer', 'fussy', 'printer', 'looks', 'good', 'printed', 'buy', 'needed', 'amazon']
['nice', 'l

In [41]:
# Stemming
stemmer = PorterStemmer()
stemmed_samples = review_samples.copy()

In [None]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_samples = review_samples.copy()

In [3]:
# Plot the vector distribution (bar graph)

In [None]:
# more preprocessing
## distribution graph
## limitization, stemming (week2)
## LaMDA ()?

### Text Representation

In [83]:
# Define methods to remove stopwords and punctuations from reviewText
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split("\W+", text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aiju2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [84]:
# Apply TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(review_samples["reviewText"])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names())

(542, 2618)
['', '03mm', '05', '05mm', '062pg', '07', '1', '10', '100', '1000', '1012', '10hour', '11', '114', '1162a', '11x14', '12', '127', '13', '14', '15', '150', '16', '18', '1982', '1year', '2', '20', '200', '2000', '2014', '2015', '23', '24', '25', '26', '2b', '3', '30', '300', '301', '307', '30odd', '30sec', '314', '32016', '3349a', '34', '35mm', '38', '388', '3m', '3ring', '4', '40', '48', '4999', '4pli', '4x12', '4x6', '4yo', '5', '50', '500', '53', '57', '5th', '6', '60', '61', '61xl', '62', '64bit', '6520', '682017', '6pack', '7', '701', '71', '74x', '75', '7inch', '7mm', '8', '80', '835', '845', '85x11', '8x10', '9', '90', '92', '93', '96', '9913', '9mm', 'abil', 'abl', 'absolut', 'absorbt', 'abysm', 'accept', 'acceptablebett', 'access', 'accid', 'accident', 'accommod', 'accordingli', 'acct', 'accur', 'ach', 'achiev', 'across', 'action', 'actual', 'ad', 'add', 'addit', 'address', 'adequ', 'adher', 'adhes', 'adivertis', 'adjac', 'adjust', 'adquat', 'adult', 'advanc', 'adver

In [None]:
# Stemming/Lemmatization?

### Modeling (Lexicon-based approach)

In [122]:
# Valence Aware Dictionary and Sentiment Reasoner (VADR)
VADR_analyzer = SentimentIntensityAnalyzer()
# Pass the analyzer for head 5 rows
for index, row in review_samples.head().iterrows():
    vs = VADR_analyzer.polarity_scores(row["reviewText"])
    print(
        f"Index: {index}\n"+
        f"Sentimental Analysis Result: {vs}\n"+
        f"Overall rating category: {row.sentiment}\n"+
        f"Full Text:\n{row.reviewText}\n"+
        "-"*50)

Index: 778312
Sentimental Analysis Result: {'neg': 0.0, 'neu': 0.857, 'pos': 0.143, 'compound': 0.3612}
Overall rating category: positive
Full Text:
They work like they are supposed to... I use them in my work as a teacher...
--------------------------------------------------
Index: 528188
Sentimental Analysis Result: {'neg': 0.0, 'neu': 0.185, 'pos': 0.815, 'compound': 0.6588}
Overall rating category: positive
Full Text:
Great product!
--------------------------------------------------
Index: 88978
Sentimental Analysis Result: {'neg': 0.028, 'neu': 0.781, 'pos': 0.191, 'compound': 0.7814}
Overall rating category: positive
Full Text:
Hard to get excited over writing implements, but these are my personal favorite pens.  Can't speak for the pricing vs. local stores, but they showed up in one day and the convenience is worth a small upcharge.
--------------------------------------------------
Index: 602569
Sentimental Analysis Result: {'neg': 0.039, 'neu': 0.828, 'pos': 0.133, 'compound':

### Validation