In [127]:
import requests
import time
from bs4 import BeautifulSoup, element
import logging
from collections import defaultdict
import pandas as pd

# Clear any existing logging handlers
logger = logging.getLogger()
for handler in logger.handlers:
    logger.removeHandler(handler)

# Configure logging to display log messages in the notebook
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

# scraping
def scrape_webpage(url: str) -> bytes | None:
    """
    Fetches the content of a webpage.

    Parameters:
        url (str): The URL of the webpage to scrape.

    Returns:
        bytes | None: The raw content of the webpage if successful, None otherwise.
    """
    try:
        logger.info(f"Attempting to fetch content from {url}")
        page = requests.get(url)
        page.raise_for_status()
        logger.info(f"Successfully fetched content from {url}")
        return page.content
    except requests.exceptions.HTTPError as e:
        logger.error(f"An error occurred while fetching content from {url}: {e}")

def has_content(page_content: bytes) -> bool:
    soup = BeautifulSoup(page_content, 'html.parser')
    if soup.find('div', {'class': 'greyText nocontent stacked'}):
        return False
    return True

def scrape_webpage_paginate(review_list_number: int, sec_sleep_between_scraping: float = 2) -> list[bytes]:
    scraped_webpages = []
    page_number = 1
    while True:
        url = f'https://www.goodreads.com/review/list/{review_list_number}?page={page_number}&ref=nav_mybooks'
        page_content = scrape_webpage(url)
        
        if not has_content(page_content):
            logger.info(f"Scraping is done as no more content could be found after scraping {page_number} pages.")
            break

        scraped_webpages.append(page_content)
        
        time.sleep(sec_sleep_between_scraping)
        page_number += 1
    return scraped_webpages

    
# utilities
def parse_scraped_reviews(scraped_webpages: list[bytes]) -> pd.DataFrame:
    reviews: list = []
    for webpage_content in scraped_webpages:
        soup = BeautifulSoup(webpage_content, 'html.parser')
        reviews_html_format = soup.find_all(class_='bookalike review')
        for review in reviews_html_format:
            review_dict = parse_review(review)
            reviews.append(review_dict)
    return pd.DataFrame.from_dict(reviews)

    
def parse_review(review: element.Tag) -> dict:
    return {
        'title': get_review_title(review),
        'avg_rating': get_review_average_rating(review),
        'given_rating': get_review_given_rating(review),
            }

def get_review_title(review: element.Tag) -> str:
    return review.find('td', class_='field title').find('a').text.strip()

def get_review_average_rating(review: element.Tag) -> float:
    return float(review.find('td', {'class': 'field avg_rating'}).find('div', {'class': 'value'}).text.strip())

def ratings_mapping() -> defaultdict:
    ratings_mapping = defaultdict(lambda: None)
    ratings_mapping.update({'it was amazing': 5,
                        'really liked it': 4,
                        'liked it': 3,
                        'it was ok': 2,
                        'did not like it': 1,})
    return ratings_mapping

def get_review_given_rating(review: element.Tag) -> int:
    span_tag = review.find('span', {'class': 'staticStars notranslate'})
    title_value = span_tag['title'] if 'title' in span_tag.attrs else None
    return ratings_mapping()[title_value]

In [131]:


scraped_webpages = scrape_webpage_paginate(54144458)

Attempting to fetch content from https://www.goodreads.com/review/list/54144458?page=1&ref=nav_mybooks
Attempting to fetch content from https://www.goodreads.com/review/list/54144458?page=1&ref=nav_mybooks
Successfully fetched content from https://www.goodreads.com/review/list/54144458?page=1&ref=nav_mybooks
Successfully fetched content from https://www.goodreads.com/review/list/54144458?page=1&ref=nav_mybooks
Attempting to fetch content from https://www.goodreads.com/review/list/54144458?page=2&ref=nav_mybooks
Attempting to fetch content from https://www.goodreads.com/review/list/54144458?page=2&ref=nav_mybooks
Successfully fetched content from https://www.goodreads.com/review/list/54144458?page=2&ref=nav_mybooks
Successfully fetched content from https://www.goodreads.com/review/list/54144458?page=2&ref=nav_mybooks
Attempting to fetch content from https://www.goodreads.com/review/list/54144458?page=3&ref=nav_mybooks
Attempting to fetch content from https://www.goodreads.com/review/lis

In [143]:


reviews = parse_scraped_reviews(scraped_webpages)
reviews

In [145]:
def save_dataframe_to_pickle(df: pd.DataFrame, save_name: str) -> None:
    try:
        df.to_pickle(f"{save_name}.pkl")
        logging.info(f"Successfully saved DataFrame to {save_name}.pkl.")
    except Exception as e:
        logging.error(f"An error occurred while saving the DataFrame: {e}")
save_dataframe_to_pickle(reviews, "keke")

Successfully saved DataFrame to keke.pkl.
Successfully saved DataFrame to keke.pkl.
