In [None]:
!pip install fake-useragent

In [None]:
import pandas as pd
import numpy as np
import requests
import dateparser
from fake_useragent import UserAgent
ua = UserAgent()
from lxml import html
import time
import itertools

In [None]:
# helper functions for picking out the salient details from a review block


def get_asin(review):
    """Return the ASIN taken from the review-title link.

    The value is the last ten characters of the first ``href`` found on
    the block's review-title anchor. Raises ``IndexError`` when the block
    has no such anchor.
    """
    title_hrefs = review.xpath(".//a[@data-hook='review-title']/@href")
    first_href = title_hrefs[0]
    return first_href[-10:]

def get_review_id(review):
    """Return the first value of the review block's own ``id`` attribute."""
    id_values = review.xpath("@id")
    return id_values[0]


def get_stars(review):
    """Return the first character of the star-rating text.

    Only the leading character of the first text node under the
    star-rating icon is kept, so the result is a one-character string.
    Raises ``IndexError`` when the block has no rating text.
    """
    rating_texts = review.xpath(".//i[@data-hook='review-star-rating']//text()")
    rating_text = rating_texts[0]
    return rating_text[0]


def get_title(review):
    """Return the first text node found under the review-title link.

    Raises ``IndexError`` when the block has no title text.
    """
    title_texts = review.xpath(".//a[@data-hook='review-title']//text()")
    return title_texts[0]


def get_comment(review):
    """Return the full text of the review body.

    The XPath yields one entry per text node under the review-body span.
    Returning only the first entry truncates bodies made of several text
    nodes (or returns pure whitespace when the first node is layout
    whitespace), so every non-blank node is kept and the pieces are joined
    with newlines. A body consisting of a single non-blank text node is
    returned unchanged.

    Returns the placeholder string "QQQQQQQQQ" when the block has no body
    text at all.
    """
    text_nodes = review.xpath(".//span[@data-hook='review-body']//text()")
    # Whitespace-only nodes carry no review content; drop them before joining.
    pieces = [node for node in text_nodes if node.strip()]
    if not pieces:
        return "QQQQQQQQQ"
    return "\n".join(pieces)


def get_author(review):
    """Return the author identifier taken from the review-author link.

    The identifier is whatever follows the first 26 characters of the
    link's ``href``. Returns ``0`` when the block has no author link or
    the ``href`` is 26 characters or shorter.
    """
    author_hrefs = review.xpath(".//a[@data-hook='review-author']/@href")
    if not author_hrefs:
        return 0
    href = author_hrefs[0]
    if len(href) <= 26:
        return 0
    return href[26:]


def get_date(review):
    """Return the review-date text without its first three characters.

    The first text node under the review-date span is used; its leading
    three characters are dropped (presumably a prefix such as "on " --
    confirm against the live page). Raises ``IndexError`` when the block
    has no date text.
    """
    date_texts = review.xpath(".//span[@data-hook='review-date']//text()")
    raw_date = date_texts[0]
    return raw_date[3:]


def get_verified(review):
    """Return the verified-purchase badge text, or ``0`` when there is none.

    The first text node under the ``avp-badge`` span is returned as-is.
    """
    badge_texts = review.xpath(".//span[@data-hook='avp-badge']//text()")
    return badge_texts[0] if badge_texts else 0


def get_helpful_count(review):
    """Return the number of helpful votes shown on a review.

    The count is the first whitespace-separated token of the
    helpful-vote statement. The word "One" maps to ``1`` and numeric
    tokens are converted to ``int`` (thousands separators are ignored), so
    the column holds integers throughout instead of a mix of ints and
    digit strings.

    Returns ``0`` when the block has no statement or the statement is
    blank. A token that is neither "One" nor numeric is returned unchanged
    as a string rather than raising.
    """
    statements = review.xpath(".//span[@data-hook='helpful-vote-statement']//text()")
    if not statements:
        return 0

    tokens = statements[0].split()
    if not tokens:
        # A whitespace-only statement has no leading token to read.
        return 0

    score = tokens[0]
    if score == "One":
        return 1
    try:
        return int(score.replace(",", ""))
    except ValueError:
        # Unknown wording: hand back the raw token so nothing is lost.
        return score


def get_image_count(review):
    """Return how many review-image tiles the block contains (``0`` if none)."""
    image_tiles = review.xpath(".//img[@data-hook='review-image-tile']")
    return len(image_tiles)


def get_author_status(review):
    """Return the text of the author's badge, or ``"none"`` without one.

    The badge is looked up as a sibling span that follows the
    review-author span and carries the exact badge class string below.
    """
    badge_xpath = (
        ".//span[@data-hook='review-author']"
        "/following-sibling::span[@class='a-size-mini a-color-link c7yBadgeAUI "
        "c7yTopDownDashedStrike c7y-badge-text a-text-bold']/text()"
    )
    badge_texts = review.xpath(badge_xpath)
    if not badge_texts:
        return "none"
    return badge_texts[0]
    
def get_video_block(review):
    """Return ``1`` when the block contains a video element, else ``0``.

    A video element is a ``div`` at the fixed path below whose ``id``
    starts with ``video-block``.
    """
    video_nodes = review.xpath("div/div/span/div[starts-with(@id,'video-block')]")
    return 1 if video_nodes else 0
    

In [None]:
def get_reviews_2(page, timeout=30):
    """Download one review page and extract every review found on it.

    Parameters
    ----------
    page : str
        URL of the review page to request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block its worker
        thread indefinitely.

    Returns
    -------
    dict
        Maps each column name to a list holding one entry per review on
        the page. Every list is empty when the request fails, the server
        answers with a status other than 200, or the response body is
        empty; in the first two cases a message is printed.
    """
    review_dict = {
        'asin': [],
        'page': [],
        'stars': [],
        'author': [],
        'date': [],
        'title': [],
        'comment': [],
        'verified': [],
        'helpful': [],
        'pics': [],
        'video': [],
        'comment_id': [],
        'author_status': []
    }

    # One extractor per column; 'page' is filled from the argument instead.
    extractors = {
        'asin': get_asin,
        'stars': get_stars,
        'title': get_title,
        'comment': get_comment,
        'author': get_author,
        'date': get_date,
        'comment_id': get_review_id,
        'verified': get_verified,
        'helpful': get_helpful_count,
        'author_status': get_author_status,
        'pics': get_image_count,
        'video': get_video_block,
    }

    # Set up and send the request.
    headers = {'User-Agent': ua.safari}
    try:
        r = requests.get(page, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable page should not abort the whole scrape.
        print('request error', exc, page)
        return review_dict

    if r.status_code != 200:
        # An error page is not a review listing, so there is nothing to parse.
        print('status error', r.status_code, page)
        return review_dict

    reviews_page = r.text
    if not reviews_page.strip():
        # An empty document cannot be parsed into a tree.
        return review_dict

    # Parse the page and select the individual review blocks.
    parser = html.fromstring(reviews_page)
    reviews = parser.xpath("//div[@data-hook='review']")

    for review in reviews:
        # Build the full row first so the column lists always stay the same
        # length, even if one extractor raises part-way through a review.
        row = {column: extract(review) for column, extract in extractors.items()}
        row['page'] = page
        for column, value in row.items():
            review_dict[column].append(value)

    return review_dict

Import the details of the products whose reviews will be scraped

In [None]:
# Load the product table and discard the "Unnamed: 0" column
# (presumably an index column written out when the CSV was saved).
df_play = pd.read_csv('amazon_playmobil.csv').drop(columns="Unnamed: 0")

Turn the product table into a list of urls

In [None]:
def _review_page_url(i, p):
    """Build the URL of review page ``p`` for the product in row ``i`` of ``df_play``."""
    # Spaces in the title become dashes and the result is cut to 42 characters.
    slug = str(df_play.titles[i].replace(" ", "-")[0:42])
    asin = str(df_play.asins[i].replace('/', '-'))
    return (
        'https://www.amazon.co.uk/' + slug
        + '/product-reviews/' + asin
        + '/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews&pageNumber='
        + str(p)
    )


# One inner list of page URLs per product; pages are numbered from 1.
big_list = []
for i in range(len(df_play)):
    # Review count divided by ten, rounded up (assumes ten reviews per page).
    n_pages = int(np.ceil(df_play.review_counts[i] / 10))
    big_list.append([_review_page_url(i, p) for p in range(1, n_pages + 1)])

In [None]:
# Flatten the per-product lists into one flat list of page URLs.
urls = [url for product_urls in big_list for url in product_urls]

Function to generate the calls to the product review pages

In [None]:
from multiprocessing.pool import ThreadPool

In [None]:
def async_get(urls, function=None, n_threads=12):
    """Apply a function to every URL concurrently using a thread pool.

    Parameters
    ----------
    urls : iterable
        Items handed one at a time to ``function``.
    function : callable, optional
        Worker called with a single URL. Defaults to ``get_reviews_2``.
        The original body referenced an undefined name here, which raised
        ``NameError`` on a fresh kernel.
    n_threads : int, optional
        Number of worker threads in the pool (default 12).

    Returns
    -------
    list
        A one-element list whose only item is the list of per-URL results,
        in the same order as ``urls``.
    """
    if function is None:
        # Resolved at call time, so the defining cell only has to have run
        # before this function is called, not before it is defined.
        function = get_reviews_2

    # The context manager shuts the pool down even if a worker raises.
    with ThreadPool(n_threads) as pool:
        results = pool.map(function, urls)  # blocks until every URL is done

    return [results]

Generate the calls

In [None]:
# Run the whole scrape and print how many seconds it took (wall clock).
start = time.time()
t__ = async_get(urls)
end = time.time()

elapsed = end - start
print(elapsed)

Store the results in a master DataFrame

In [None]:
# Column layout of the master frame; the keys fix the column order.
review_dict = {
    'asin': [],
    'page': [],
    'stars': [],
    'author': [],
    'date': [],
    'title': [],
    'comment': [],
    'verified': [],
    'helpful': [],
    'pics': [],
    'video': [],
    'comment_id': [],
    'author_status': []
}

columns = list(review_dict)

# Merge the per-page dictionaries column by column and build the frame once.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies every existing row on each iteration.
# t__[0] is the list of per-page result dictionaries returned by async_get.
df = pd.DataFrame(
    {
        column: [value for page_result in t__[0] for value in page_result[column]]
        for column in columns
    },
    columns=columns,
)