In [1]:
# !pip install boto3
# !ls ./dataset/bed_pillow_reviews/1-Beckham/

# import nltk
# nltk.download('punkt') # for sent_tokenize
# nltk.download('stopwords') 
# nltk.download('wordnet') # for WordNetLemmatizer

In [2]:
import pandas as pd
import numpy as np

# Text preprocessing/analysis
import re
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from itertools import groupby 


from datetime import datetime, timedelta

# import io
import json
import os
import glob
# import base64

In [80]:
def get_paths(pathname):
    """
    List the contents of a directory as sorted paths.

    Every entry returned by ``os.listdir(pathname)`` (files and
    sub-directories alike) is prefixed with ``pathname`` and the resulting
    paths are returned in ascending order.
    """
    entries = os.listdir(pathname)
    full_paths = [os.path.join(pathname, entry) for entry in entries]
    full_paths.sort()
    return full_paths

In [81]:
# Directory whose contents are loaded as the review data set.
# Swap the active line to analyse the other product's folder.
# DATA_DIR = './data/bed_pillow_reviews/1-Beckham/'
DATA_DIR = './data/bed_pillow_reviews/2-down alt/'
# Sorted paths of every entry found in DATA_DIR.
all_files = get_paths(DATA_DIR)

In [86]:
def read_files(files, separator=','):
    """
    Read every path in ``files`` and concatenate the results into one DataFrame.

    The reader is picked per file from its (case-insensitive) extension:

    * ``.xlsx`` / ``.xls`` -> ``pd.read_excel``
    * ``.csv``             -> ``pd.read_csv`` with the default comma delimiter
    * anything else        -> ``pd.read_csv`` with ``sep=separator``

    The first row of each file is used as the header and no column is used as
    the index. The combined frame gets a fresh 0..n-1 index.

    Parameters
    ----------
    files : iterable of str
        Paths of the files to read.
    separator : str, default ','
        Delimiter applied only to files that are neither Excel nor ``.csv``.

    Returns
    -------
    pandas.DataFrame
        All files stacked row-wise.

    Raises
    ------
    ValueError
        If ``files`` is empty (there is nothing to concatenate).
    """
    processed_files = []
    for file in files:
        lowered = file.lower()
        if lowered.endswith(('.xlsx', '.xls')):
            processed_files.append(pd.read_excel(file, index_col=None, header=0))
        elif lowered.endswith('.csv'):
            processed_files.append(pd.read_csv(file, index_col=None, header=0))
        else:
            processed_files.append(pd.read_csv(file, sep=separator, index_col=None, header=0))

    # pd.concat([]) fails with an opaque "No objects to concatenate"; report
    # the actual problem (e.g. an empty data directory) instead.
    if not processed_files:
        raise ValueError("No files to read: `files` is empty.")

    return pd.concat(processed_files, ignore_index=True)

In [87]:
# Load every file listed in all_files into a single DataFrame of reviews.
pillow_reviews = read_files(all_files)

In [88]:
# Split 'Date' once on ' on ' into the text before it (ReviewCountry) and the
# text after it (ReviewDate).
# NOTE(review): presumably values look like "Reviewed in <country> on <date>" — confirm against the data.
# expand=True returns the two parts as columns; unpacking the `.str` accessor
# and passing `n` positionally no longer work on pandas 2.x.
pillow_reviews[['ReviewCountry', 'ReviewDate']] = pillow_reviews['Date'].str.split(' on ', n=1, expand=True)

# Parse the date text into real timestamps.
pillow_reviews['ReviewDate'] = pd.to_datetime(pillow_reviews['ReviewDate'])

  """Entry point for launching an IPython kernel.


In [7]:
# A value of '-' in Images / Videos is treated as "nothing attached".
has_media = (pillow_reviews['Images'] != '-') | (pillow_reviews['Videos'] != '-')
no_media = (pillow_reviews['Images'] == '-') & (pillow_reviews['Videos'] == '-')

# Reviews carrying at least one image or video
print(f"Reviews with EITHER Image or Video: {has_media.sum()}")

# Reviews carrying neither
print(f"Reviews with NO Image or Video: {no_media.sum()}")

Reviews with EITHER Image or Video: 310
Reviews with NO Image or Video: 10686


In [8]:
# EXPORT FOR TABLEAU USE
# pillow_reviews.to_csv('dataset/pillow_reviews_{}.csv'.format(re.sub(r'(-|:| )', '', str(datetime.now())[:-7])), encoding='utf_8_sig')

In [9]:
# Count the reviews for each (Variation, Rating) pair.
pillow_reviews.groupby(['Variation', 'Rating']).agg({'Rating': ["count"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,Rating
Unnamed: 0_level_1,Unnamed: 1_level_1,count
Variation,Rating,Unnamed: 2_level_2
-,1,13
-,2,10
-,3,12
-,4,25
-,5,24
B01LYC1XSM,1,582
B01LYC1XSM,2,262
B01LYC1XSM,3,331
B01LYC1XSM,4,392
B01LYC1XSM,5,1118


In [10]:
# Keep only the reviews of one product variation.
# NOTE(review): 'B01LYNW421' is hard-coded and does not appear in the visible
# part of the Variation summary above — confirm it exists in the loaded DATA_DIR.
prod_1 = pillow_reviews[pillow_reviews['Variation'] == 'B01LYNW421']

In [89]:
# Two buckets: 4-5 stars count as "good", 1-3 stars as "bad"
# (3-star reviews are deliberately folded into the bad bucket).
is_good_rating = prod_1['Rating'] >= 4
is_bad_rating = prod_1['Rating'] <= 3

prod_1_good = prod_1.loc[is_good_rating]
# prod_1_ok = prod_1[prod_1['Rating'] == 3]
prod_1_bad = prod_1.loc[is_bad_rating]

In [90]:
# Review text only, re-indexed from 0 within each bucket.
prod_1_good_reviews = prod_1_good['Body'].reset_index(drop=True)
# prod_1_ok_reviews = prod_1_ok.reset_index()['Body']
prod_1_bad_reviews = prod_1_bad['Body'].reset_index(drop=True)

In [250]:
# strings_split = combined_strings.split()
# freq_splits = FreqDist(strings_split)
# print(freq_splits.most_common(20))

In [336]:
# short = set(s for s in strings_split if len(s)<4)
# short = [(s, freq_splits[s]) for s in short]
# short.sort(key=lambda x:x[1], reverse=True)
# short

In [249]:
# long = set(s for s in strings_split if len(s)>10)
# long = [(s, freq_splits[s]) for s in long]
# long.sort(key=lambda x:x[1], reverse=True)
# long

In [13]:
def summarise(pattern, strings, freq):
    """
    Summarise the strings that match a regular expression.

    Prints how many entries of ``strings`` match ``pattern`` (using
    ``re.search``) and what share of ``strings`` that is, then returns the
    distinct matches with their frequencies.

    Parameters
    ----------
    pattern : str
        Regular expression searched for inside each string.
    strings : sequence of str
        Strings to scan; duplicates count towards the printed total.
    freq : mapping
        Maps each string to its frequency (e.g. a ``FreqDist``/``Counter``).

    Returns
    -------
    list of (str, int)
        Distinct matching strings paired with ``freq[string]``, most frequent
        first; ties are ordered alphabetically so the result is deterministic.
    """
    compiled_pattern = re.compile(pattern)
    matches = [s for s in strings if compiled_pattern.search(s)]

    # Guard the proportion against an empty input (would divide by zero).
    total = len(strings)
    proportion = len(matches) / total if total else 0.0
    print("{} strings, that is {:.2%} of total".format(len(matches), proportion))

    # Sort by descending frequency, then by the string itself: set iteration
    # order is arbitrary, so ties need an explicit secondary key.
    return sorted(
        ((s, freq[s]) for s in set(matches)),
        key=lambda pair: (-pair[1], pair[0]),
    )

In [14]:
# summarise(r"\d", strings_split, freq_splits)

In [15]:
#hyphenated words
# summarise(r"\w+-+\w+", strings_split, freq_splits)

In [28]:
def find_outlaw(word):
    """
    Tell whether ``word`` contains the same letter three or more times in a row.

    Only alphabetic characters count, and the comparison is case-sensitive.
    Returns True on the first such run, otherwise False.
    """
    # Slide a three-character window over the word.
    windows = zip(word, word[1:], word[2:])
    return any(
        first == second == third and third.isalpha()
        for first, second, third in windows
    )

In [29]:
# Whitespace-split every bad review into raw strings and count each one.
strings_split = " ".join(prod_1_bad_reviews).split()
freq_splits = FreqDist(strings_split)

# Strings containing a letter repeated 3+ times in a row.
outlaws = [s for s in strings_split if find_outlaw(s)]
print("{} strings, that is {:.2%} of total".format(len(outlaws), len(outlaws)/ len(strings_split)))

# Distinct outlaws with their frequency, most frequent first.
outlaw_freq = sorted(
    [(s, freq_splits[s]) for s in set(outlaws)],
    key=lambda pair: pair[1],
    reverse=True,
)
outlaw_freq

23 strings, that is 0.03% of total


[('sooo', 3),
 ('Sooooo', 2),
 ('soooo', 2),
 ('Waaaay', 2),
 ('ahhhh', 1),
 ('ARRRGH,', 1),
 ('beautifullll', 1),
 ('theee', 1),
 ('suppportive', 1),
 ('PAAAIIINN!!!"But', 1),
 ('louddd.', 1),
 ('ugghhh', 1),
 ('waaaaaaay', 1),
 ('soooooo', 1),
 ('head...sooo', 1),
 ('waaaay', 1),
 ('teeeeeny', 1),
 ('goood', 1)]

In [150]:
def generate_token(reviews):
    """
    Turn a collection of review texts into one flat list of normalised tokens.

    Steps:
      1. Join all reviews into a single string separated by spaces.
      2. Tokenise: a token is either a run of ASCII letters/apostrophes or a
         single period (periods are kept as sentence-boundary markers).
      3. Collapse consecutive identical tokens into one (compared before
         lower-casing, so the comparison is case-sensitive).
      4. Lower-case each token and lemmatise it as a verb (WordNet pos "v").

    Parameters
    ----------
    reviews : iterable of str
        The documents to tokenise.

    Returns
    -------
    list of str
        Lemmatised, lower-cased tokens in document order.
    """
    combined_strings = " ".join(list(reviews))

    # Raw string: in a normal literal "\." is an invalid escape sequence,
    # which Python 3.12+ reports as a SyntaxWarning. The pattern is unchanged.
    tokeniser = RegexpTokenizer(r"[A-Za-z']+|\.")
    tokens = tokeniser.tokenize(combined_strings)

    # Keep one representative of every run of identical adjacent tokens.
    deduped_tokens = [token for token, _run in groupby(tokens)]

    lemmatiser = WordNetLemmatizer()
    return [lemmatiser.lemmatize(token.lower(), "v") for token in deduped_tokens]

In [140]:
def get_word_tree(tokens):
    """
    Build an n-gram explorer bound to ``tokens``.

    Parameters
    ----------
    tokens : list of str
        Token sequence to search; it is re-scanned on every call of the
        returned function.

    Returns
    -------
    callable
        ``word_tree(head=None, show_count=20, trailing=2,
        direction='forward', levels=0, indent=0)``, which prints the most
        common n-grams as ``"<count> - <words>"`` lines and returns None.

        * ``head``: phrase to anchor on, as a string (lower-cased and split on
          whitespace) or a sequence of tokens. ``None`` or an empty phrase
          means "no anchor": all n-grams of length ``trailing`` are counted.
        * ``show_count``: how many of the most common n-grams to print.
        * ``trailing``: number of extra tokens shown besides the head.
        * ``direction``: ``'backward'`` keeps n-grams that END with the head;
          any other value keeps n-grams that START with it.
        * ``levels``: how many times to expand each printed n-gram whose count
          is above 3, using it as the new head (top 3, 2 extra tokens).
        * ``indent``: indentation depth of the printed lines (two spaces each).

        N-grams whose first token is a period are never reported.
    """

    def _clean_grams(ngram_counter):
        """Remove, in place, every n-gram that begins with a period."""
        begins_with_period = [gram for gram in ngram_counter if gram[0] == '.']
        for gram in begins_with_period:
            del ngram_counter[gram]

    def _word_tree(head=None, show_count=20, trailing=2, direction='forward', levels=0, indent=0):
        if isinstance(head, str):
            head = head.lower().split()
        # Normalise to a tuple; None and empty phrases both mean "no anchor".
        # (An empty head used to match nothing when searching backward,
        # because gram[-0:] is the whole gram rather than an empty slice.)
        head = tuple(head) if head else ()
        head_len = len(head)

        grams = ngrams(tokens, trailing + head_len)
        if head:
            if direction == 'backward':
                grams = (gram for gram in grams if gram[-head_len:] == head)
            else:
                # 'forward' and any unrecognised direction anchor at the start.
                grams = (gram for gram in grams if gram[:head_len] == head)
        ngram_counter = Counter(grams)

        _clean_grams(ngram_counter)

        for gram, count in ngram_counter.most_common(show_count):
            print(f"{'  '*indent}{count} - {' '.join(gram)}")
            # Only expand n-grams that occur more than 3 times.
            if levels > 0 and count > 3:
                _word_tree(gram, show_count=3, trailing=2, direction=direction, levels=levels-1, indent=indent+1)

    return _word_tree

In [141]:
# Choose which review bucket to explore; the two candidates are:
# prod_1_bad_reviews
# prod_1_good_reviews

TEXT_TO_ANALYZE = prod_1_bad_reviews

# Tokenise the chosen reviews once and bind the n-gram explorer to those tokens.
word_tree = get_word_tree(generate_token(TEXT_TO_ANALYZE))

In [142]:
# No head phrase: print the 15 most common 4-grams and expand each one that
# occurs more than 3 times, up to 4 levels deep.
word_tree(None, show_count=15, trailing=4, direction='forward', levels=4)

53 - no support at all
  9 - no support at all . i
    1 - no support at all . i like the
    1 - no support at all . i m a
    1 - no support at all . i be so
  2 - no support at all . flat
  2 - no support at all . one
46 - i wake up with
  11 - i wake up with neck pain
    4 - i wake up with neck pain . i
      1 - i wake up with neck pain . i figure it
      1 - i wake up with neck pain . i buy these
      1 - i wake up with neck pain . i gotta return
    1 - i wake up with neck pain . the
    1 - i wake up with neck pain the next
  4 - i wake up with a sore
    1 - i wake up with a sore neck because
    1 - i wake up with a sore back good
    1 - i wake up with a sore neck and
  4 - i wake up with a stiff
    2 - i wake up with a stiff neck .
    1 - i wake up with a stiff neck when
    1 - i wake up with a stiff neck what
40 - i be look for
  4 - i be look for . i
    1 - i be look for . i feel like
    1 - i be look for . i feel the
    1 - i be look for . i purchase this
  3 - 

In [149]:
# What follows the (lemmatised) phrase "i be look for": the top 10
# continuations of two tokens, each expanded one level when it occurs more than 3 times.
word_tree('i be look for', show_count=10, trailing=2, direction='forward', levels=1)

4 - i be look for . i
  1 - i be look for . i feel like
  1 - i be look for . i feel the
  1 - i be look for . i purchase this
3 - i be look for a pillow
3 - i be look for at all
2 - i be look for pillow that
1 - i be look for that hotel
1 - i be look for a proper
1 - i be look for at half
1 - i be look for cool and
1 - i be look for a fluffy
1 - i be look for a posture


In [144]:
# What precedes the (lemmatised) phrase "i be look for": the top 10 lead-ins of three tokens.
word_tree('i be look for', show_count=10, trailing=3, direction='backward')

3 - be not what i be look for
1 - and fluffy but i be look for
1 - side sleeper so i be look for
1 - choice hotel pillow i be look for
1 - extra support . i be look for
1 - it be but i be look for
1 - for sleep . i be look for
1 - primary pillow . i be look for
1 - they be heavy i be look for
1 - not the support i be look for


In [187]:
# stop_words = stopwords.words("english")
# print(f"There are {len(stop_words)} stopwords.\n")
# print(stop_words)
# my_stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 
#                 'ourselves', 'you', "you're", "you've", "you'll", 
#                 "you'd", 'your', 'yours', 'yourself', 'yourselves', 
#                 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 
#                 'hers', 'herself', 'it', "it's", 'its', 'itself', 
#                 'they', 'them', 'their', 'theirs', 'themselves', 
#                 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 's', 't', 'can', 'will', 'just', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

In [None]:
import boto3
from credentials import ACCESS_KEY, SECRET_KEY

In [None]:
# single_review = pillow_reviews.Body[4]

In [150]:
# comprehend = boto3.client(
#     service_name='comprehend', 
#     region_name='us-west-2',
#     aws_access_key_id=ACCESS_KEY,
#     aws_secret_access_key=SECRET_KEY)

In [152]:
# text = single_review

# print('Calling DetectKeyPhrases')
# print(json.dumps(comprehend.detect_key_phrases(Text=text, LanguageCode='en'), sort_keys=True, indent=4))
# print('End of DetectKeyPhrases\n')

Calling DetectKeyPhrases
{
    "KeyPhrases": [
        {
            "BeginOffset": 19,
            "EndOffset": 32,
            "Score": 0.9999992251396179,
            "Text": "these pillows"
        },
        {
            "BeginOffset": 43,
            "EndOffset": 60,
            "Score": 1.0,
            "Text": "the perfect combo"
        },
        {
            "BeginOffset": 124,
            "EndOffset": 137,
            "Score": 1.0,
            "Text": "less soreness"
        },
        {
            "BeginOffset": 141,
            "EndOffset": 148,
            "Score": 0.9999998211860657,
            "Text": "my neck"
        },
        {
            "BeginOffset": 182,
            "EndOffset": 202,
            "Score": 0.9999993443489075,
            "Text": "a memory foam pillow"
        },
        {
            "BeginOffset": 274,
            "EndOffset": 287,
            "Score": 0.9999448657035828,
            "Text": "these pillows"
        },
        {
            