In [14]:
%pip install lxml
%pip install requests

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [16]:
### Trustpilot data scraping module 2.0
### Original Author: Hakim Khalafi
### Updated to incorporate new Trustpilot page URL formats and include date data by Will Vaughan

## Imports

import math
import csv
import time
import json
import requests
import lxml.html as html

In [17]:
## Configurations

# Number of reviews Trustpilot shows on each page by default
resultsPerPage = 20

# Output file the scraped reviews are written to
datafile = 'ocado_reviews.csv'

# The review page URL is the Trustpilot review root followed by the site being reviewed
basePage = 'https://uk.trustpilot.com/review/'
reviewSite = 'www.ocado.com'
reviewPage = f'{basePage}{reviewSite}'

print(f'Scraper set for {reviewPage} - saving result to {datafile}')

Scraper set for https://uk.trustpilot.com/review/www.ocado.com - saving result to ocado_reviews.csv


In [18]:
## Count amount of pages to scrape

# Fetch the first review page. TLS certificate verification is left at the
# requests default (enabled), like the other requests made in this notebook;
# the timeout keeps a stalled connection from hanging the kernel indefinitely.
page = requests.get(reviewPage, timeout=30)
page.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
tree = html.fromstring(page.content)

# Total amount of ratings, read from the headline counter
# (thousands separators are stripped before converting to int)
ratingNodes = tree.xpath('//span[@class="headline__review-count"]')
if not ratingNodes or not ratingNodes[0].text:
    raise RuntimeError(
        'Could not find the review count on ' + reviewPage
        + ' - the page layout may have changed'
    )
ratingCount = int(ratingNodes[0].text.replace(',', ''))

# Amount of chunks to consider for displaying processing output
# For ex. 10 means output progress for every 10th of the data
tot_chunks = 20

# Throttling to avoid spamming page with requests
# With sleepTime seconds between every page request
throttle = True
sleepTime = 1

# Total pages to scrape
pages = math.ceil(ratingCount / resultsPerPage)
print('Found total of ' + str(pages) + ' pages to scrape')

Found total of 668 pages to scrape




In [19]:
# Site root that relative pagination links are appended to
base_url = "https://uk.trustpilot.com"

# Download and parse the first review page
page = requests.get(reviewPage)
tree = html.fromstring(page.content)

# Look up the "next page" anchor; when one exists, build and show its absolute URL
next_page = tree.xpath("//a[contains(@class, 'next-page')]")
if len(next_page) > 0:
    href = next_page[0].get('href')
    next_page_url = "{}{}".format(base_url, href)
    print(next_page_url)



            

https://uk.trustpilot.com/review/www.ocado.com?b=MTYxOTgwMTU4NDAwMHw2MDhjMzVmMGY5ZjQ4NzA1MTAzNGJjZTg


In [None]:
## Main scraping section
#
# Walks the review pages by following each page's "next page" link, and writes
# one tab-separated row (title, body, rating) per review to `datafile`.
# Relies on reviewPage, datafile, resultsPerPage, ratingCount, tot_chunks,
# throttle and sleepTime having been set by the earlier cells.

with open(datafile, 'w', newline='', encoding='utf8') as csvfile:

    # Tab delimited to allow for special characters
    datawriter = csv.writer(csvfile, delimiter='\t')
    print('Processing..')

    # Upper bound on the number of page requests
    pages = math.ceil(ratingCount / resultsPerPage)

    # Progress is printed every `chunk` reviews. Clamped to at least 1 so a
    # site with fewer than tot_chunks reviews cannot cause a modulo by zero.
    chunk = max(1, ratingCount // tot_chunks)

    base_url = "https://uk.trustpilot.com"
    page_url = reviewPage  # URL fetched on the current iteration
    reviewNumber = 0       # running count of reviews written so far

    for _ in range(pages):

        # Sleep if throttle enabled
        if throttle:
            time.sleep(sleepTime)

        page = requests.get(page_url, timeout=30)
        page.raise_for_status()  # stop on 4xx/5xx instead of parsing an error page
        tree = html.fromstring(page.content)

        # Each matching <script> element holds one review as a JSON document
        script_bodies = tree.xpath("//script[starts-with(@data-initial-state, 'review-info')]")
        for elem in script_bodies:
            curr_item = json.loads(elem.text_content())

            title = curr_item["reviewHeader"]
            body = curr_item["reviewBody"]
            rating = curr_item["stars"]

            datawriter.writerow([title, body, rating])

            # Progress counting, outputs for every processed chunk
            reviewNumber += 1
            if reviewNumber % chunk == 0:
                print('Processed ' + str(reviewNumber) + '/' + str(ratingCount) + ' ratings')

        # Follow the pagination link. Without one there is nothing left to
        # fetch, so stop rather than requesting an already-scraped page again.
        next_page = tree.xpath("//a[contains(@class, 'next-page')]")
        if not next_page:
            break
        next_page_url = f"{base_url}{next_page[0].get('href')}"
        page_url = next_page_url

    # Report the number of reviews actually written, which can differ from
    # ratingCount if pagination ended early or pages held fewer reviews.
    print('Processed ' + str(reviewNumber) + '/' + str(ratingCount) + ' ratings.. Finished!')

Processing..
Processed 667/13351 ratings
Processed 1334/13351 ratings
Processed 2001/13351 ratings
Processed 2668/13351 ratings
