In [None]:
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def fetch_with_splash(url, wait=2, timeout=60,
                      splash_endpoint='http://localhost:8050/render.html'):
    """Fetch the rendered HTML of ``url`` through a Splash ``render.html`` endpoint.

    Args:
        url: Address of the page to render.
        wait: Value sent to Splash as its ``wait`` parameter.
        timeout: Seconds the HTTP client waits for Splash before giving up.
            Without it ``requests`` waits indefinitely, so a stalled renderer
            would hang the notebook.
        splash_endpoint: URL of the Splash ``render.html`` endpoint.

    Returns:
        The response body as text when Splash answers with HTTP 200;
        otherwise ``None`` (the failure is printed, not raised).
    """
    params = {
        'url': url,
        'wait': wait
    }
    try:
        r = requests.get(splash_endpoint, params=params, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Connection failures and timeouts are reported the same way as a bad
        # status so that one unreachable page does not abort the whole scrape.
        print(f"Error fetching {url} | {exc}")
        return None

    if r.status_code == 200:
        return r.text

    print(f"Error fetching {url} | Status: {r.status_code}")
    return None

In [None]:
# Load the scrape targets. The file is consumed in pairs further down
# (a product URL line followed by a product name line), so blank lines are
# dropped here: a stray empty line would otherwise shift every later pair.
# 'utf-8-sig' reads plain UTF-8 and also strips a leading byte-order mark,
# which would otherwise end up glued to the first URL.
with open('bestbuy_urls.txt', 'r', encoding='utf-8-sig') as f:
    lines = [stripped for stripped in (line.strip() for line in f) if stripped]
    

In [None]:
# Upper bound on the number of review pages requested per product.
MAX_PAGES = 10


def _extract_rating(review):
    """Return the star rating of a review block as a float, or None.

    The rating is read from a ``<p class="visually-hidden">`` element whose
    text starts with "Rated" (e.g. "Rated 4 out of 5 stars"). Every such
    element in the block is checked, so an unrelated visually-hidden
    paragraph that comes first does not hide the rating.
    """
    for hidden_p in review.find_all("p", {'class': "visually-hidden"}):
        text = hidden_p.text.strip()
        if not text.startswith("Rated"):
            continue
        rating_str = text.replace(' out of 5 stars', '').replace('Rated ', '')
        try:
            return float(rating_str)
        except ValueError:
            # Not a parsable number; keep looking at the remaining elements.
            continue
    return None


def _parse_review(review, product_name):
    """Build one review record from a ``div.review-item`` element.

    Returns:
        A dict with the keys 'Product', 'Review', 'Date' and 'Rating', or
        ``None`` when the block carries no review text (such blocks are not
        recorded).
    """
    review_text = None
    body_div = review.find("div", {'class': "ugc-review-body"})
    if body_div:
        # NOTE(review): a multi-class string is matched against the exact
        # class attribute value, so this depends on the class order in the
        # page markup - confirm against the live HTML if reviews go missing.
        clamp_div = body_div.find('div', {'class': "ugc-components ugc-line-clamp relative"})
        if clamp_div:
            p_tag = clamp_div.find("p", {'class': "pre-white-space"})
            if p_tag:
                review_text = p_tag.text.strip()
    if not review_text:
        return None

    review_date = None
    date_div = review.find('div', {'class': "posted-date-ownership disclaimer v-m-right-xxs"})
    if date_div:
        time_tag = date_div.find("time", {'class': "submission-date"})
        if time_tag and time_tag.get('title'):
            review_date = time_tag['title']

    return {
        'Product': product_name,
        'Review': review_text,
        'Date': review_date,
        'Rating': _extract_rating(review)
    }


all_reviews_data = []

# `lines` alternates product URL / product name. An odd count means the last
# URL has no name; report it instead of failing with an IndexError.
if len(lines) % 2:
    print(f"Warning: ignoring unpaired trailing line {lines[-1]!r}")

for product_url, product_name in zip(lines[0::2], lines[1::2]):
    print(f"\nStarting scraping reviews for: {product_name}")

    reviews_data = []
    # Add the page number as a query parameter whether or not the URL
    # already carries a query string.
    separator = '&' if '?' in product_url else '?'

    for page_num in range(1, MAX_PAGES + 1):
        page_url = f"{product_url}{separator}page={page_num}"
        html = fetch_with_splash(page_url)
        if not html:
            print(f"Skipping page {page_num} due to fetch error.")
            continue
        soup = BeautifulSoup(html, 'html.parser')

        review_blocks = soup.find_all("div", {'class': "review-item"})
        if not review_blocks:
            print(f"No reviews found on page {page_num}. Stopping pagination.")
            break

        page_records = [
            record
            for record in (_parse_review(block, product_name) for block in review_blocks)
            if record is not None
        ]
        reviews_data.extend(page_records)

        # Report the records actually kept, not the number of blocks seen:
        # blocks without review text are dropped.
        print(f"Scraped {len(page_records)} reviews from page {page_num}")

    print(f"Finished scraping {len(reviews_data)} reviews for {product_name}")
    all_reviews_data.extend(reviews_data)

In [None]:
# Write every scraped review to a single CSV file.
output_file = '../data/bestbuy.csv'

df = pd.DataFrame(all_reviews_data)
if not df.empty:
    # to_csv does not create missing directories; make sure the target
    # folder exists so a finished scrape is not lost to a path error.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_file, index=False)
    print(f"Saved {len(df)} reviews to {output_file}")
else:
    # Make the no-op visible instead of finishing silently.
    print(f"No reviews scraped; {output_file} was not written.")