In [None]:
#Before we can commence imports, we need to ensure we have the following modules downloaded:
!pip install fake-useragent   #allows a fake user agent to be specified for each call to Amazon

In [2]:
#Imports:
import time

import numpy as np
import pandas as pd
import requests
from fake_useragent import UserAgent
from lxml import html
ua = UserAgent()

# Amazon Product Reviews

I intend to use natural language processing to build a model that predicts the star rating an Amazon customer will give, based on the text of the review they leave for a product.

The star rating here is unique to each review, and is the score as defined by the customer. This is not the overall star rating that a product has on Amazon. The overall rating is defined by Amazon: 

"Amazon calculates a product’s star ratings using a machine learned model instead of a raw data average. The machine learned model takes into account factors including: the age of a review, helpfulness votes by customers and whether the reviews are from verified purchases."

As an extension, I intend to establish a model to predict whether another Amazon customer would deem a review 'helpful' based on its content.

# What Information Do I Need To Scrape?

I will need to do two rounds of scraping: first to get all of the product links, and then, once I have the products, to scrape all of their customer reviews.

I will create one script to run on pages that look like the below image, where I will take:
1. ASIN - The unique product identifier for Amazon Products
2. Product Name
3. Price
4. Number of Reviews

![Product List](images/product_list.png)

Once I have run this scrape on a number of different product types, I will then scrape, for each product, the reviews left by customers.

The features that I will take from the reviews are:

1. Star rating
2. Review title
3. Author
4. Date review was left
5. Whether purchase was verified or not
6. Review
7. Helpful vote

Additional Features

8. Whether the review has any images
9. Whether the review has any videos

![Review Text](images/review_text.png)

# Scraping

## Product Scrapes

Here I will define a function to take all the products from a product grid and save their features to a dictionary. The function will take all the products on a page, then navigate to the next page and take those products, and continue in this way until there are no further pages to visit for this scrape.

In [None]:
def _first_or(values, default):
    """Return the first element of ``values``, or ``default`` when it is empty."""
    return values[0] if values else default


def _parse_review_count(texts):
    """Return the review count found in the first two link texts, or 0.

    The first text can be a 'more products' link rather than the count, so the
    first text that parses as an integer wins. Thousands separators are
    removed first, so a count such as "1,234" is read as 1234.
    """
    for text in texts[:2]:
        try:
            return int(text.replace(",", ""))
        except ValueError:
            continue
    return 0


def _parse_price(texts):
    """Return the price text without '£' and ',' characters, or NaN if absent.

    The price can appear in one of two places: when the first text is empty
    the second one is used instead.
    """
    try:
        raw_price = texts[1] if len(texts[0]) == 0 else texts[0]
    except IndexError:
        return np.nan
    return raw_price.replace("£", "").replace(",", "")


def get_products(page, prod_dict):
    """Scrape the products on a product-grid page and on every following page.

    Starting at ``page``, each product list item contributes one entry to each
    of the lists in ``prod_dict``; the 'next page' arrow link is then followed
    until a page has no such link.

    Parameters
    ----------
    page : str
        URL of the first product-grid page to scrape.
    prod_dict : dict
        Must hold lists under the keys 'asins', 'titles', 'review_counts' and
        'prices'. It is extended in place.

    Returns
    -------
    dict
        The same ``prod_dict`` object that was passed in. Missing values are
        recorded as "none" (asin, title), 0 (review count) or NaN (price).

    Notes
    -----
    NOTE(review): in XPath 1.0 a predicate such as ``[3 or 4]`` is a boolean
    expression that is always true, so the paths below match *every* child
    div rather than only the 3rd/4th one. They are left unchanged because the
    scrape was developed against that behaviour - confirm before tightening.
    """
    #XPATHS
    #xpath for amazon product list item
    xpath_product = "//li[starts-with(@id, 'result')]"
    #xpath for next product arrow link
    xpath_next_page = "//a[@id='pagnNextLink']/@href"
    #xpaths for the individual features, relative to a product list item
    xpath_title = "div/div[3 or 4]/div[1]/a/h2/text()"
    xpath_review_count = "div/div[6 or 7]/a[starts-with(@class, 'a-size-small')]/text()"
    xpath_price = "div/div[5 or 7]/div[1]/a/span/text()"

    # Walk the pages in a loop instead of recursing once per page, so a long
    # category cannot grow the call stack.
    while True:
        #set up the request
        headers = {'User-Agent': ua.random}
        r = requests.get(page, headers=headers)
        if r.status_code != 200:
            # best effort: report the failure and still parse whatever came back
            print('status error', r.status_code, page)

        #parse the page
        parser = html.fromstring(r.text)

        #iterate through products and get their info
        for product in parser.xpath(xpath_product):
            prod_dict['asins'].append(
                _first_or(product.xpath("@data-asin"), "none"))
            prod_dict['titles'].append(
                _first_or(product.xpath(xpath_title), "none"))
            prod_dict['review_counts'].append(
                _parse_review_count(product.xpath(xpath_review_count)))
            prod_dict['prices'].append(
                _parse_price(product.xpath(xpath_price)))

        # get the next page from the arrow link
        next_page = parser.xpath(xpath_next_page)

        #if no more pages, return the dictionary
        if len(next_page) == 0:
            return prod_dict

        #otherwise go to next page and repeat
        page = "https://www.amazon.co.uk" + next_page[0]

The function can then be called to start running on any page that includes the products in a grid as above

In [None]:
# The function needs a dictionary of lists in which to store the features
prod_dict = {key: [] for key in ('asins', 'titles', 'review_counts', 'prices')}

# Starting page: returns all products in the beers, wines and spirits category
page = "https://www.amazon.co.uk/beer-wine-spirits/b/ref=nav_shopall_wine_spirits?ie=UTF8&node=358583031"

# get_products hands back the dictionary it was given, now filled in; keep it as bws
bws = get_products(page, prod_dict)

In [None]:
# Load the scraped results into a dataframe for inspection,
# with one column per key of the results dictionary
df = pd.DataFrame(data=bws, columns=bws.keys())

In [None]:
# Persist the product information to a .csv file in the working directory
df.to_csv(path_or_buf='amazon_bws.csv')

# Types of Product Searched

I have taken 4 different groups of products to look at the reviews for:

1. Digital SLR cameras
2. Mens sport shoes
3. Beers/wines/spirits
4. Playmobil

I have chosen Playmobil here as a product type for which the review most likely isn't being written by the end user, and I expect that there may be some interesting results from this.

I am expecting the NLP models to establish different feature weights for each category, but I would also like to look at the features that are shared between categories. I expect that features shared across categories may give an indication of service as opposed to product. This could be a useful indicator for Amazon customer service to monitor.

In [None]:
# Load the results of the four product scrapes, one dataframe per category
df_slr, df_bws, df_mens_shoes, df_playmobil = [
    pd.read_csv(csv_path)
    for csv_path in (
        'amazon_slr.csv',
        'amazon_bws.csv',
        'amazon_mens_shoes.csv',
        'amazon_playmobil.csv',
    )
]


In [16]:
# Load Playmobil as an example, discarding the "Unnamed: 0" column
# (presumably the index that was written out when the file was saved)
df_playmobil = pd.read_csv('amazon_playmobil.csv').drop(columns="Unnamed: 0")

In [17]:
# Number of Playmobil products returned (one row per product)
df_playmobil.shape[0]

3200

In [18]:
# Example of the product listing: the first five rows
df_playmobil.head(n=5)

Unnamed: 0,asins,titles,review_counts,prices
0,B00VGQKBGK,PLAYMOBIL Take Along Pet Store Playset,2,24.99
1,B01608M23I,Playmobil 6888 Summer Fun Camp Site with LED Fire,26,14.99
2,B00IF1VVFO,Playmobil 5568 City Life Preschool Children's ...,162,17.60 - 73.84
3,B01LTHZP16,Playmobil 6921 City Action Police Helicopter w...,51,18.49
4,B00VLUZ31O,Playmobil 6657 City Life Furnished Children's ...,91,59.99


## Scraping the Reviews

Because I know from the first scrape how many reviews a product has, and that each product review page holds 10 reviews, I can use parallel requests to scrape the reviews in parallel rather than sequentially.

In [None]:
#Import modules for multi threading
import multiprocessing as mp
import threading

In [7]:
# helper functions for picking out the salient details from a review block


def get_asin(review):
    """Return the last 10 characters of the review-title link's href.

    Raises IndexError when the review block has no review-title link.
    """
    title_hrefs = review.xpath(".//a[@data-hook='review-title']/@href")
    first_href = title_hrefs[0]
    return first_href[-10:]

def get_review_id(review):
    """Return the ``id`` attribute of the review block itself.

    Raises IndexError when the block carries no ``id`` attribute.
    """
    id_values = review.xpath("@id")
    return id_values[0]


def get_stars(review):
    """Return the first character of the star-rating text, as a string.

    Raises IndexError when the review has no star-rating element.
    """
    rating_texts = review.xpath(".//i[@data-hook='review-star-rating']//text()")
    rating_text = rating_texts[0]
    # only the leading character is kept, e.g. "5" from "5.0 out of 5 stars"
    return rating_text[0]


def get_title(review):
    """Return the first text node of the review-title link.

    Raises IndexError when the review has no title text.
    """
    title_texts = review.xpath(".//a[@data-hook='review-title']//text()")
    return title_texts[0]


def get_comment(review):
    """Return the full text of the review body.

    The body can be split into several text nodes (for example around line
    breaks), so all of them are joined with a single space rather than
    keeping only the first one. A body with a single text node is returned
    unchanged.

    Returns the placeholder "QQQQQQQQQ" when the review has no body text.
    """
    xpath_comment = ".//span[@data-hook='review-body']//text()"
    text_nodes = review.xpath(xpath_comment)
    if not text_nodes:
        return "QQQQQQQQQ"
    return " ".join(text_nodes)


def get_author(review):
    """Return the review-author link's href with its first 26 characters removed.

    Returns 0 when there is no author link or the href is 26 characters or
    shorter.
    """
    author_hrefs = review.xpath(".//a[@data-hook='review-author']/@href")
    if author_hrefs == [] or len(author_hrefs[0]) <= 26:
        return 0
    profile_href = author_hrefs[0]
    return profile_href[26:]


def get_date(review):
    """Return the review-date text with its first three characters dropped.

    Raises IndexError when the review has no date element.
    """
    date_texts = review.xpath(".//span[@data-hook='review-date']//text()")
    raw_date = date_texts[0]
    # the leading three characters are discarded, e.g. "on " in "on 31 October 2017"
    return raw_date[3:]


def get_verified(review):
    """Return the text of the verified-purchase badge, or 0 when it is absent."""
    badge_texts = review.xpath(".//span[@data-hook='avp-badge']//text()")
    return badge_texts[0] if badge_texts != [] else 0


def get_helpful_count(review):
    """Return how many people found the review helpful, as an int.

    The count is the first word of the helpful-vote statement. "One" maps to
    1 and numeric words are converted to int (thousands separators removed),
    so the result has one consistent type instead of a mix of int and str.

    Returns 0 when the review has no helpful-vote statement (or it is blank).
    If the first word is neither "One" nor a number it is returned unchanged
    as a string, so no information is lost.
    """
    xpath_helpful = ".//span[@data-hook='helpful-vote-statement']//text()"
    statements = review.xpath(xpath_helpful)
    if not statements:
        return 0

    words = statements[0].split()
    if not words:
        return 0

    score = words[0]
    if score == "One":
        return 1
    try:
        return int(score.replace(",", ""))
    except ValueError:
        return score


def get_image_count(review):
    """Return the number of review-image tiles in the review (0 when none)."""
    image_tiles = review.xpath(".//img[@data-hook='review-image-tile']")
    return len(image_tiles)


def get_author_status(review):
    """Return the text of the author's badge, or "none" when there is no badge."""
    xpath_status = ".//span[@data-hook='review-author']/following-sibling::span[@class='a-size-mini a-color-link c7yBadgeAUI c7yTopDownDashedStrike c7y-badge-text a-text-bold']/text()"
    badge_texts = review.xpath(xpath_status)
    if badge_texts == []:
        return "none"
    return badge_texts[0]
    
def get_video_block(review):
    """Return 1 when the review contains a video block, otherwise 0."""
    video_blocks = review.xpath("div/div/span/div[starts-with(@id,'video-block')]")
    return 1 if video_blocks != [] else 0
    

Next I will define the function to go through each review on a review page, and extract the features to a dictionary

In [None]:
def get_reviews_2(page):
    """Scrape every review on one review page into a dictionary of lists.

    Parameters
    ----------
    page : str
        URL of the review page to fetch.

    Returns
    -------
    dict
        Maps each feature name ('asin', 'page', 'stars', ...) to a list with
        one entry per review that was extracted. All lists have the same
        length. The lists are empty when the request fails or the page holds
        no reviews.

    Notes
    -----
    A review for which a mandatory element is missing (the helper raises
    IndexError) is reported and skipped as a whole, so one malformed review
    neither aborts the page nor leaves the lists with unequal lengths.
    """
    review_dict = {
        'asin': [],
        'page': [],
        'stars': [],
        'author': [],
        'date': [],
        'title': [],
        'comment': [],
        'verified': [],
        'helpful': [],
        'pics': [],
        'video': [],
        'comment_id': [],
        'author_status': []
    }

    #set up the request
    headers = {'User-Agent': ua.safari}
    try:
        # the timeout stops a single stalled connection from blocking a worker forever
        r = requests.get(page, headers=headers, timeout=30)
    except requests.RequestException as err:
        print('request error', err, page)
        return review_dict
    if r.status_code != 200:
        # best effort: report the failure and still parse whatever came back
        print('status error', r.status_code, page)

    #parse the page
    parser = html.fromstring(r.text)

    # get the individual reviews
    xpath_review = "//div[@data-hook='review']"
    reviews = parser.xpath(xpath_review)

    for review in reviews:
        # Extract the whole row first and only then append it, so a failure
        # part-way through cannot leave the lists misaligned.
        try:
            row = {
                'asin': get_asin(review),
                'page': page,
                'stars': get_stars(review),
                'title': get_title(review),
                'comment': get_comment(review),
                'author': get_author(review),
                'date': get_date(review),
                'comment_id': get_review_id(review),
                'verified': get_verified(review),
                'helpful': get_helpful_count(review),
                'author_status': get_author_status(review),
                'pics': get_image_count(review),
                'video': get_video_block(review),
            }
        except IndexError:
            print('skipping malformed review', page)
            continue

        #add returned values to the list within the dictionary
        for field, value in row.items():
            review_dict[field].append(value)

    return review_dict

In order to use multithreading, I will loop through my products dataframe and, for each product, generate the requests for the appropriate number of review pages, given that each page holds 10 reviews.

In [None]:
from multiprocessing.pool import ThreadPool

In [None]:
def async_get(urls, *, pool_size=18):
    """Scrape many review pages concurrently with a pool of threads.

    Parameters
    ----------
    urls : iterable of str
        Review-page URLs; each one is passed to ``get_reviews_2``.
    pool_size : int, keyword-only, default 18
        Number of worker threads in the pool.

    Returns
    -------
    list
        A one-element list whose single item is the list of
        ``get_reviews_2`` results, in the same order as ``urls``. The outer
        list is kept so existing callers can go on indexing ``result[0]``.

    Raises
    ------
    Exception
        Whatever a worker raised is re-raised here; the pool is closed and
        joined first so its threads are not leaked.
    """
    pool = ThreadPool(pool_size)
    try:
        # map() blocks until every page has been processed
        page_results = pool.map(get_reviews_2, urls)
    finally:
        pool.close()
        pool.join()
    return [page_results]

## Processing the Reviews

For each of the product dataframes, I will run the above process, and then convert the resulting list of per-page dictionaries into a dataframe.

In [None]:
# Example run of the above process, timed from start to finish
# NOTE(review): `urls` (the review-page URLs to fetch) is not defined in any
# visible cell - it must be built from the products dataframe before this runs.
start = time.time()

# async_get takes only the list of URLs
playmobil_reviews = async_get(urls)

end = time.time()
print(end - start)

In [None]:
#Processing the per-page review dictionaries into a single dataframe
review_dict = {
    'asin': [],
    'page': [],
    'stars' : [],
    'author': [],
    'date': [],
    'title':[],
    'comment': [],
    'verified': [],
    'helpful': [],
    'pics': [],
    'video': [],
    'comment_id': [],
    'author_status':[]
    }

# async_get wraps its list of per-page dictionaries in an outer list, hence [0]
page_frames = [
    pd.DataFrame(page_reviews, columns=review_dict.keys())
    for page_reviews in playmobil_reviews[0]
]

# Concatenate once instead of appending inside a loop: DataFrame.append copied
# the whole frame on every call and no longer exists in pandas 2.x.
if page_frames:
    df = pd.concat(page_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=review_dict.keys())

In [11]:
# Load some sample reviews, discarding the "Unnamed: 0" column
# (presumably the index that was written out when the file was saved)
example_reviews = pd.read_csv('example_reviews.csv').drop(columns="Unnamed: 0")

In [12]:
# Look at a sample of the reviews: the first five rows
example_reviews.head(n=5)

Unnamed: 0,asin,page,stars,author,date,title,comment,verified,helpful,pics,video,comment_id,author_status
0,B01608M23I,3,5,AHYW7TVACEMYQS2NQ2D3CISJQLYQ,31 October 2017,Five Stars,Happy with product he loves play mobile.,Verified Purchase,0,0,0,RTTUM0QU7HZWQ,none
1,B01608M23I,3,5,AGYJQ2G2AUWNUJRB7SVFRV3PSJ6Q,17 January 2018,Five Stars,excellent,Verified Purchase,0,0,0,R3FNSJ45BRF0X8,none
2,B01608M23I,3,5,AFA7TYS6KM6QLO5USYDOIDKUAMTA,16 March 2018,Five Stars,Grand daughter loves it,Verified Purchase,0,0,0,RE2CU061L9FBT,none
3,B01608M23I,3,5,AHKXO7XI2FSQYJEUID7H5TRMY4ZQ,27 March 2018,Five Stars,Godson loves it,Verified Purchase,0,0,0,R1G8XD7M3NPF9M,none
4,B01608M23I,3,5,AHA2E7FB7Y6QQQPZNM62R4CNACSA,21 March 2018,Good,Good,Verified Purchase,0,0,0,R2YW9S0RZDA23M,none
