In [12]:
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pip"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])

0

In [14]:
subprocess.check_call([sys.executable, "-m", "pip", "install", "beautifulsoup4"])

0

In [None]:
import requests
import time
import random
import json
import re # regular expressions (pattern matching)
from bs4 import BeautifulSoup

In [None]:
# Base URL
URL="https://ie.trustpilot.com/review/brownthomas.com"

In [18]:
HEADERS = {
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.5 Safari/605.1.15",
    "accept-language":"en-GB,en-US;q=0.9,en;q=0.8"
}

In [None]:
# setting constants
MAX_REVIEWS = 200
MAX_PAGES = 50 #safety
SLEEP_RANGE = (2.0,4.0)

In [38]:
# function to get response from URL
def fetch_page(url):
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status() # raises httperror, if one occurred.
    return resp.text

In [21]:
# returns a list of jason blobs

def find_json_blobs(html):
    """Return a list of parsed JSON blobs found in <script> tags."""
    soup = BeautifulSoup(html, "html.parser")
    blobs = []

    # 1) Next.js hydration data (often contains reviews)
    nd = soup.find("script", id="_NEXT_DATA_")
    if nd and nd.string:
        try:
            blobs.append(json.loads(nd.string))
        except Exception:
            pass

    # 2) Any structured ld+json blocks (may include a subset of reviews)
    for tag in soup.find_all("script", {"type": "application/ld+json"}):
        if tag.string:
            try:
                data = json.loads(tag.string)
                blobs.append(data)
            except Exception:
                # some pages include multiple JSON objects concatenated; try to split safely
                for m in re.finditer(r"\{.*?\}", tag.string, flags=re.DOTALL):
                    try:
                        blobs.append(json.loads(m.group(0)))
                    except Exception:
                        pass
    return blobs

In [26]:
# get all the data out of the objects
def flatten(obj):
   
    if isinstance(obj, dict):
        yield obj
        for v in obj.values():
            yield from flatten(v)
    elif isinstance(obj, list):
        for v in obj:
            yield from flatten(v)

In [None]:
# fnd keys that are relevant to reviews
def looks_like_review(d):
   
    if not isinstance(d, dict):
        return False
    keys = set(d.keys())
    # Common fields seen in Trustpilot JSON structures
    review_keys = {"text", "rating", "consumer", "title", "createdDateTime"}
    return len(keys & review_keys)>=2

In [29]:
# extracting the data (normalize review)
def extract_review_fields(d):
    
    # Try multiple possible shapes
    author_id = (
        (d.get("consumer") or {}).get("id")
        or None
    )
    title = d.get("title") or None
    review_text =  d.get("text") or None
    rating = (        
        d.get("rating")
        or None
    )
    created_date = d.get("createdDateTime") or None

    # Clean up basic types
    if isinstance(rating, dict):
        rating = rating.get("value") or rating.get("ratingValue")
    if isinstance(text, str):
        text = text.strip()
    if isinstance(title, str):
        title = title.strip()

    return {
        
        "author_id": id,
        "title": title,
        "review_text": text,
        "rating": rating,
        "created_date": createdDateTime,
    }

In [35]:
# extract review from jason blobs
def parse_reviews_from_html(html):
    reviews = []

    # JSON-first approach
    for blob in find_json_blobs(html):
        for node in flatten(blob):
            if looks_like_review(node):
                reviews.append(extract_review_fields(node))

            # Some pages store reviews under explicit arrays named 'reviews'
            if isinstance(node, dict):
                for k, v in node.items():
                    if k.lower() in ("reviews", "serviceReviews", "recentReviews") and isinstance(v, list):
                        for item in v:
                            if isinstance(item, dict) and looks_like_review(item):
                                reviews.append(extract_review_fields(item))

# Deduplicate by (text, date, author) heuristic
    seen = set()
    unique = []
    for r in reviews:
        key = (r["review_text"], r["created_date"], r["author_id"])
        if key not in seen and (r["text"] or r["title"]):
            unique.append(r)
            seen.add(key)

    return unique
  

In [36]:
# loop for getting pages from the website
def get_reviews(base_url=URL, max_reviews=MAX_REVIEWS):
    out = []
    for page in range(1, MAX_PAGES + 1):
        url = base_url if page == 1 else f"{base_url}?page={page}"
        html = fetch_page(url)
        page_reviews = parse_reviews_from_html(html)

        # Keep only new items by text
        existing_texts = {r["text"] for r in out if r["text"]}
        fresh = [r for r in page_reviews if (r["text"] and r["text"] not in existing_texts)]
        out.extend(fresh)

        print(f"Page {page}: got {len(fresh)} new, total={len(out)}")
        if len(out) >= max_reviews:
            return out[:max_reviews]

        # polite delay
        time.sleep(random.uniform(*SLEEP_RANGE))

    return out[:max_reviews]

In [39]:
reviews=get_reviews()


Page 1: got 0 new, total=0
Page 2: got 0 new, total=0
Page 3: got 0 new, total=0
Page 4: got 0 new, total=0
Page 5: got 0 new, total=0
Page 6: got 0 new, total=0


KeyboardInterrupt: 