In [137]:
# !pip install boto3
# !ls ./dataset/bed_pillow_reviews/1-Beckham/

# import nltk
# nltk.download('punkt') # for sent_tokenize
# nltk.download('stopwords') 
# nltk.download('wordnet') # for WordNetLemmatizer

In [190]:
from credentials import ACCESS_KEY, SECRET_KEY
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Text preprocessing/analysis
import re
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from itertools import groupby 


# import seaborn as sns
import boto3
from datetime import datetime, timedelta

# import io
import json
import os
import glob
# import base64

# sns.set()
%matplotlib inline
plt.style.use('fivethirtyeight')

In [3]:
# Folder holding the exported review files for this product.
CURRENT_DIR = './dataset/bed_pillow_reviews/1-Beckham/'

# Keep only regular, non-hidden files: os.listdir also returns sub-directories and
# dot-files (e.g. .ipynb_checkpoints, .DS_Store) which would break the CSV load
# that consumes this list. Sorting keeps the load order stable between runs.
all_files = sorted(
    path
    for path in (
        os.path.join(CURRENT_DIR, name)
        for name in os.listdir(CURRENT_DIR)
        if not name.startswith('.')
    )
    if os.path.isfile(path)
)

In [4]:
# Read every file listed in all_files (first row is the header) and stack the
# frames into a single table with a fresh 0..n-1 index.
pillow_reviews = pd.concat(
    [pd.read_csv(path, index_col=None, header=0) for path in all_files],
    ignore_index=True,
)

In [5]:
# 'Date' holds "<country text> on <date text>"; split once on ' on ' into two columns.
# expand=True returns a DataFrame, which avoids iterating the .str accessor
# (deprecated, and an error on recent pandas), and n is passed by keyword because
# positional use is no longer accepted.
date_parts = pillow_reviews['Date'].str.split(' on ', n=1, expand=True)

pillow_reviews['ReviewCountry'] = date_parts[0]
pillow_reviews['ReviewDate'] = pd.to_datetime(date_parts[1])

  """Entry point for launching an IPython kernel.


In [6]:
# Label each review with "<ISO year>-<ISO week>".
# The week number is an ISO week, so it must be paired with the ISO year rather than
# the calendar year: e.g. 2019-12-30 belongs to ISO week 1 of 2020 and would otherwise
# be labelled "2019-1", colliding with the first week of January 2019.
pillow_reviews['Period'] = pillow_reviews['ReviewDate'].apply(
    lambda x: "%d-%d" % tuple(x.isocalendar()[:2])
)

In [9]:
# pillow_reviews['ReviewCountry'].unique()

# Reviews with an image or a video attached ('-' is the placeholder for "none")
pillow_reviews[pillow_reviews['Images'].ne('-') | pillow_reviews['Videos'].ne('-')].count()

# Reviews with neither an image nor a video (only this last expression is displayed)
pillow_reviews[pillow_reviews['Images'].eq('-') & pillow_reviews['Videos'].eq('-')].count()

Date             10686
Author           10686
Verified         10686
Helpful          10686
Title            10685
Body             10686
Rating           10686
Images           10686
Videos           10686
URL              10686
Variation        10686
Style            10686
ReviewCountry    10686
ReviewDate       10686
Period           10686
dtype: int64

In [55]:
# For TABLEAU USE
# pillow_reviews.to_csv('dataset/pillow_reviews_{}.csv'.format(re.sub(r'(-|:| )', '', str(datetime.now())[:-7])), encoding='utf_8_sig')

In [13]:
# Number of reviews for every (Variation, Rating) pair
(
    pillow_reviews
    .groupby(['Variation', 'Rating'])
    .agg({'Rating': ["count"]})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Rating
Unnamed: 0_level_1,Unnamed: 1_level_1,count
Variation,Rating,Unnamed: 2_level_2
-,1,13
-,2,10
-,3,12
-,4,25
-,5,24
B01LYC1XSM,1,582
B01LYC1XSM,2,262
B01LYC1XSM,3,331
B01LYC1XSM,4,392
B01LYC1XSM,5,1118


In [14]:
# Keep only the reviews whose Variation is 'B01LYNW421'
prod_1 = pillow_reviews.loc[pillow_reviews['Variation'].eq('B01LYNW421')]

In [98]:
# Bucket the product's reviews by star rating: 4-5 good, 3 ok, 1-2 bad
prod_1_good = prod_1.loc[prod_1['Rating'].ge(4)]
prod_1_ok = prod_1.loc[prod_1['Rating'].eq(3)]
prod_1_bad = prod_1.loc[prod_1['Rating'].le(2)]

In [99]:
# Review text of each bucket as a Series re-indexed from 0
prod_1_good_reviews = prod_1_good['Body'].reset_index(drop=True)
prod_1_ok_reviews = prod_1_ok['Body'].reset_index(drop=True)
prod_1_bad_reviews = prod_1_bad['Body'].reset_index(drop=True)

In [250]:
# strings_split = combined_strings.split()
# freq_splits = FreqDist(strings_split)
# print(freq_splits.most_common(20))

In [336]:
# short = set(s for s in strings_split if len(s)<4)
# short = [(s, freq_splits[s]) for s in short]
# short.sort(key=lambda x:x[1], reverse=True)
# short

In [249]:
# long = set(s for s in strings_split if len(s)>10)
# long = [(s, freq_splits[s]) for s in long]
# long.sort(key=lambda x:x[1], reverse=True)
# long

In [334]:
def summarise(pattern, strings, freq):
    """Summarise the strings that match a regular expression.

    Prints how many entries of ``strings`` match ``pattern`` and what share of
    all entries that is, then returns the distinct matches with their frequency.

    Parameters
    ----------
    pattern : str
        Regular expression; a string matches if ``re.search`` finds it anywhere.
    strings : sequence of str
        Strings to scan (duplicates allowed and counted in the printed volume).
    freq : mapping
        Maps a string to its frequency; looked up as ``freq[s]`` for each match.

    Returns
    -------
    list of (str, int)
        Distinct matching strings with their frequency, most frequent first;
        ties are ordered alphabetically so the result is deterministic.
    """
    compiled_pattern = re.compile(pattern)
    matches = [s for s in strings if compiled_pattern.search(s)]

    # Guard the proportion against an empty input instead of dividing by zero.
    total = len(strings)
    share = len(matches) / total if total else 0.0
    print("{} strings, that is {:.2%} of total".format(len(matches), share))

    # Distinct matches paired with their frequency.
    output = [(s, freq[s]) for s in set(matches)]
    # Sort by descending frequency; the string itself breaks ties because set
    # iteration order is otherwise arbitrary from run to run.
    output.sort(key=lambda pair: (-pair[1], pair[0]))

    return output

In [335]:
# summarise(r"\d", strings_split, freq_splits)

In [248]:
#hyphenated words
# summarise(r"\w+-+\w+", strings_split, freq_splits)

In [251]:
def find_outlaw(word):
    """Tell whether a word repeats one alphabetic character 3+ times in a row.

    Returns True for strings such as "sooo"; runs of digits or punctuation
    (e.g. "111", "...") do not count.
    """
    # Slide a three-character window over the word.
    triples = zip(word, word[1:], word[2:])
    return any(
        first == second == third and third.isalpha()
        for first, second, third in triples
    )
# NOTE(review): in the visible part of this notebook `strings_split` and
# `freq_splits` are only assigned inside commented-out cells, so this cell appears
# to rely on leftover kernel state and would raise NameError after
# Restart & Run All -- confirm and restore their definitions before running.

# Every string (duplicates included) flagged by find_outlaw.
outlaws = [s for s in strings_split if find_outlaw(s)]
# Report how many strings were flagged and their share of all strings.
print("{} strings, that is {:.2%} of total".format(len(outlaws), len(outlaws)/ len(strings_split)))
# Pair each distinct flagged string with its frequency, most frequent first.
outlaw_freq = [(s, freq_splits[s]) for s in set(outlaws)]
outlaw_freq.sort(key=lambda x:x[1], reverse=True)
# outlaw_freq

79 strings, that is 0.04% of total


In [291]:
def generate_token(reviews):
    """Turn a collection of review texts into one flat list of normalised tokens.

    Parameters
    ----------
    reviews : iterable
        Review texts. Entries that are not strings (for example NaN placeholders
        for empty review bodies) are skipped instead of breaking the join.

    Returns
    -------
    list of str
        Lower-cased tokens lemmatised as verbs. Words (letters and apostrophes)
        and full stops are the only tokens kept; runs of identical consecutive
        tokens are collapsed to one before normalisation.
    """
    # Join all reviews into one text, ignoring non-string entries.
    combined_strings = " ".join(text for text in reviews if isinstance(text, str))

    # Raw string so that `\.` is a regex escape rather than an invalid string escape.
    tokeniser = RegexpTokenizer(r"[A-Za-z']+|\.")
    tokens = tokeniser.tokenize(combined_strings)

    # Collapse consecutive duplicates (e.g. "..." becomes a single ".").
    deduped_tokens = [token for token, _ in groupby(tokens)]

    lemmatiser = WordNetLemmatizer()
    return [lemmatiser.lemmatize(token.lower(), "v") for token in deduped_tokens]

In [324]:
def get_word_tree(tokens):
    """Build a phrase explorer bound to a token sequence.

    Parameters
    ----------
    tokens : sequence of str
        Tokens to scan. The sequence is re-read on every lookup, so it must
        support repeated iteration (e.g. a list).

    Returns
    -------
    callable
        ``_word_tree(head=None, show_count=20, trailing=2)``, which prints the
        most frequent n-grams and returns None.
    """
    def _word_tree(head=None, show_count=20, trailing=2):
        """Print the most common n-grams, optionally restricted to a leading phrase.

        head       -- phrase (str, split on whitespace and lower-cased) or sequence
                      of tokens that every n-gram must start with; None for no filter.
        show_count -- how many of the most frequent n-grams to print.
        trailing   -- number of tokens shown after ``head`` (the n-gram size when
                      ``head`` is None).
        """
        if isinstance(head, str):
            head = head.lower().split()
        prefix = tuple(head) if head is not None else ()
        gram_size = trailing + len(prefix)

        # Count the n-grams that start with the requested prefix. N-grams that begin
        # with a full stop are dropped here; this filter is done inline because no
        # module-level clean_grams helper exists (calling one raised NameError on a
        # fresh kernel).
        ngram_counter = Counter(
            gram
            for gram in ngrams(tokens, gram_size)
            if gram[:1] != ('.',) and gram[:len(prefix)] == prefix
        )

        for gram, count in ngram_counter.most_common(show_count):
            print(f"{count} - {' '.join(gram)}")
    return _word_tree

In [325]:
# prod_1_bad_reviews
# prod_1_good_reviews

# Reviews whose wording will be explored (swap in another review Series to change it)
TEXT_TO_ANALYZE = prod_1_bad_reviews

# Tokenise the chosen reviews and bind the tokens to a phrase-lookup callable
word_tree = get_word_tree(tokens=generate_token(reviews=TEXT_TO_ANALYZE))

In [329]:
# 25 most frequent 4-token phrases across the selected reviews
word_tree(head=None, show_count=25, trailing=4)

43 - no support at all
35 - i wake up with
31 - at all . i
29 - support at all .
27 - you lay your head
25 - wake up with neck
24 - be not the same
24 - pillow . they be
23 - not comfortable at all
23 - i be look for
22 - as soon as you
22 - your head on it
22 - you put your head
22 - put your head on
22 - wake up with a
22 - lay your head on
22 - waste of money .
22 - pillow i have ever
21 - a side sleeper and
21 - these pillow . i
21 - i buy these pillow
20 - i be very disappoint
20 - these pillow be not
19 - up with neck pain
19 - these pillow . they


In [337]:
# Up to 25 continuations (15 tokens each) of the phrase "as soon as you"
word_tree(head='as soon as you', show_count=25, trailing=15)

1 - as soon as you rest your head on it sink down . there be not much support and i
1 - as soon as you put your head on it the pillow flatten out . it be quick to come
1 - as soon as you rest your head on it . it do not matter with any sleep position i
1 - as soon as you rest your head on them it sink to nearly flat . way too soft .
1 - as soon as you lay your head down you sink all the way through . zero support . they
1 - as soon as you lay your head on these you can feel it sink to the bottom of the
1 - as soon as you lay your head on them . i suppose if you like the feel of sleep
1 - as soon as you lay on them . as i state the st set be amaze . soft but
1 - as soon as you put your head on it . i go back to use my old pillow immediately
1 - as soon as you lay down there s no support . you lay down and your head hit the
1 - as soon as you put your head on it you sink down to the mattress . return these and
1 - as soon as you receive the product after sleep on it for one week it s jus

In [187]:
# stop_words = stopwords.words("english")
# print(f"There are {len(stop_words)} stopwords.\n")
# print(stop_words)
# my_stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 
#                 'ourselves', 'you', "you're", "you've", "you'll", 
#                 "you'd", 'your', 'yours', 'yourself', 'yourselves', 
#                 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 
#                 'hers', 'herself', 'it', "it's", 'its', 'itself', 
#                 'they', 'them', 'their', 'theirs', 'themselves', 
#                 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 's', 't', 'can', 'will', 'just', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [None]:
# single_review = pillow_reviews.Body[4]

In [150]:
# comprehend = boto3.client(
#     service_name='comprehend', 
#     region_name='us-west-2',
#     aws_access_key_id=ACCESS_KEY,
#     aws_secret_access_key=SECRET_KEY)

In [152]:
# text = single_review

# print('Calling DetectKeyPhrases')
# print(json.dumps(comprehend.detect_key_phrases(Text=text, LanguageCode='en'), sort_keys=True, indent=4))
# print('End of DetectKeyPhrases\n')

Calling DetectKeyPhrases
{
    "KeyPhrases": [
        {
            "BeginOffset": 19,
            "EndOffset": 32,
            "Score": 0.9999992251396179,
            "Text": "these pillows"
        },
        {
            "BeginOffset": 43,
            "EndOffset": 60,
            "Score": 1.0,
            "Text": "the perfect combo"
        },
        {
            "BeginOffset": 124,
            "EndOffset": 137,
            "Score": 1.0,
            "Text": "less soreness"
        },
        {
            "BeginOffset": 141,
            "EndOffset": 148,
            "Score": 0.9999998211860657,
            "Text": "my neck"
        },
        {
            "BeginOffset": 182,
            "EndOffset": 202,
            "Score": 0.9999993443489075,
            "Text": "a memory foam pillow"
        },
        {
            "BeginOffset": 274,
            "EndOffset": 287,
            "Score": 0.9999448657035828,
            "Text": "these pillows"
        },
        {
            