In [1]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup
from urllib.parse import unquote

In [2]:
def decode_raw_link(link):
    """
    Turn a raw search-result href into a usable site-relative link.

    Some hrefs are redirect-style: the real target is carried, percent-encoded,
    after a ``url=`` parameter. For those, everything after the first ``url=``
    is percent-decoded and returned. An href that contains no ``url=`` is
    treated as already being a direct link and is returned unchanged.

    example:
        link to decode:
            '/click?ie=UTF8&url=%2FRefusal-Eve-M-Riley%2Fdp%2F1916398219%2Fref%3Dsr_1_1%3Fkeywords%3Dbooks'
        decoded link:
            '/Refusal-Eve-M-Riley/dp/1916398219/ref=sr_1_1?keywords=books'

        link to decode (no ``url=`` wrapper):
            '/Refusal-Eve-M-Riley/dp/1916398219/ref=sr_1_1?keywords=books'
        decoded link:
            '/Refusal-Eve-M-Riley/dp/1916398219/ref=sr_1_1?keywords=books'

    Args:
        link: href string taken from an anchor tag.

    Returns:
        The decoded target as a string, or ``link`` itself when there is
        nothing to unwrap.
    """
    marker = "url="
    start = link.find(marker)

    if start == -1:
        # str.find signals "not found" with -1; slicing from -1 + len(marker)
        # would silently chop the first three characters off a direct link.
        return link

    return unquote(link[start + len(marker):])


def get_soup_from_url(url, header, timeout=30):
    """
    Download a page and parse its body into a BeautifulSoup tree.

    Args:
        url: address of the page to fetch.
        header: dict of HTTP request headers, passed to ``requests.get``.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block forever on a stalled connection,
            which would hang the whole notebook.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        built-in ``"html.parser"``. The HTTP status code is not checked, so
        an error page is parsed like any other page.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, headers=header, timeout=timeout)

    return BeautifulSoup(response.text, "html.parser")



In [9]:

# Base search URL; the page number is appended as an extra query parameter.
url = "https://www.amazon.ca/s?k=books"

# Browser-like headers sent with every request.
custom_header = {'user-agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
                'accept-language':'en-GB,en;q=0.9',}

# Upper bound on the number of search-result title tags to keep.
MAX_BOOK = 250

page_no = 1

book_tags = []

# Walk the paginated search results until MAX_BOOK title tags are collected.
# Strict "<" so that holding exactly MAX_BOOK tags does not trigger one more fetch.
while len(book_tags) < MAX_BOOK:

    url_page = url + "&page=" + str(page_no)

    print(url_page)
    soup = get_soup_from_url(url_page, custom_header)

    page_tags = soup.find_all(name="h2",class_="a-size-mini a-spacing-none a-color-base s-line-clamp-4")

    if not page_tags:
        # A page with no matching tags adds nothing to book_tags, so without
        # this guard the loop condition could never become false.
        print("no results found on page " + str(page_no) + ", stopping")
        break

    book_tags.extend(page_tags)

    print(len(book_tags))

    page_no+=1

    # Pause between requests, as the review-scraping loop does.
    time.sleep(1)

# A page can push the total past the cap; keep only the first MAX_BOOK tags.
del book_tags[MAX_BOOK:]





https://www.amazon.ca/s?k=books&page=1
60
https://www.amazon.ca/s?k=books&page=2
120
https://www.amazon.ca/s?k=books&page=3
180
https://www.amazon.ca/s?k=books&page=4
240
https://www.amazon.ca/s?k=books&page=5
300


In [11]:
homepage = "https://www.amazon.ca/"


def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or "" when the lookup returned None."""
    return tag.getText() if tag is not None else ""


# One dict per review; turned into a DataFrame later.
reviews = []

for book_tag in book_tags:
    name_tag = book_tag.find(name="span",class_="a-size-base-plus a-color-base a-text-normal")
    link_tag = book_tag.find(name="a",class_="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal")
    book_link_raw = link_tag.get("href") if link_tag is not None else None

    if name_tag is None or not book_link_raw:
        # find() returns None when nothing matches; skip this result rather
        # than abort the whole run on one unexpected tag.
        print("skipping a search result without a title or link")
        continue

    book_name = name_tag.getText()
    # Join with exactly one "/" whether or not the decoded path has a leading slash.
    book_link = homepage.rstrip("/") + "/" + decode_raw_link(book_link_raw).lstrip("/")

    print("processing book: <" + book_name + ">")
    try:
        soup_book = get_soup_from_url(book_link,custom_header)
    except requests.RequestException as error:
        # One unreachable product page should not discard the reviews gathered so far.
        print("request failed for <" + book_name + ">: " + str(error))
        time.sleep(1)
        continue

    review_tags = soup_book.find_all(name="div",class_="a-section review aok-relative")

    for review_tag in review_tags:
        spans = review_tag.find_all("span")

        review = {
            "book_name":book_name,
            "book_link":book_link,
            "profile_name":_tag_text(review_tag.find(name="span",class_="a-profile-name")),
            # The title is picked by position (4th <span> in the review block);
            # fall back to "" when the block has fewer spans.
            "review_title":spans[3].getText() if len(spans) > 3 else "",
            # Keeps only the first space-separated token of the rating text.
            "ratings":_tag_text(review_tag.find(name="span",class_="a-icon-alt")).split(" ")[0],
            # select_one returns None when the review has no text body.
            "review_content":_tag_text(review_tag.select_one("div.a-expander-content.reviewText.review-text-content.a-expander-partial-collapse-content span")),
        }

        reviews.append(review)

    # Pause between product-page requests.
    time.sleep(1)




processing book: <Artificial Intelligence and the Human Mind: A Radical New Old Science of the Human Mind>
processing book: <The Big Book of Serial Killers>
processing book: <Man of Honor (The Sisters Sloane Series Book 1)>
processing book: <Savage Little Games: A Dark Mafia, Enemies to Lovers Romance (Sin City Mafia Book 1)>
processing book: <Killers of the Flower Moon: The Osage Murders and the Birth of the FBI>
processing book: <Atomic Habits: An Easy & Proven Way to Build Good Habits & Break Bad Ones>
processing book: <The Lost Bookshop: The most charming and uplifting novel for 2024 and the perfect gift for book lovers!>
processing book: <The Housemaid>
processing book: <Ikigai: The Japanese Secret to a Long and Happy Life>
processing book: <The Psychology of Money: Timeless lessons on wealth, greed, and happiness>
processing book: <Icebreaker: A Novel (Volume 1)>
processing book: <Verity>
processing book: <The Silent Patient>
processing book: <Fourth Wing>
processing book: <Stop 

In [14]:
# Build one row per collected review, then write the table to disk
# without the positional index column.
reviews_df = pd.DataFrame(data=reviews)

reviews_df.to_csv(path_or_buf="reviews.csv", index=False)