In [19]:
# Log in to Consumer Reports
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import re
import time

def scroll_to_bottom(driver, scrolls=5, pause=2):
    for _ in range(scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  

driver = webdriver.Chrome()
driver.get("https://www.consumerreports.org/")
wait = WebDriverWait(driver, 10)
sign_in_label = wait.until(EC.element_to_be_clickable((By.ID, "sign-in-label")))
sign_in_label.click()
wait = WebDriverWait(driver, 10)
login_popout = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".cda-gnav__account-menu--non-member .cda-gnav__menu-content")))
username_field = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.NAME, "userName")))
password_field = driver.find_element(By.NAME, "password")
username_field.send_keys("willfreeman@gmail.com")
password_field.send_keys("NmTCmG8mSvq/L+W")
sign_in_button = login_popout.find_element(By.ID, "gnav-signin-submit")
sign_in_button.click()
wait.until(EC.url_contains("https://www.consumerreports.org/"))
current_url = driver.current_url
if current_url == "https://www.consumerreports.org/":
    print("Login successful!")
else:
    print("Login failed or redirected to:", current_url)

# Scrape ratings pages
df_scrape = pd.DataFrame(columns=["CR_Category", "Product_Name", "Product_Link", "Amazon_Link", "ASIN", "CR_Score"])

ratings_url = "https://www.consumerreports.org/home-garden/cookware/nonstick-cookware-sets/c29546/"
driver.get(ratings_url)

if current_url == "https://www.consumerreports.org/":
    print("URL successful!")
else:
    print("URL failed or redirected to:", current_url)

scroll_to_bottom(driver, scrolls=10)  

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

product_elements = soup.find_all('div', class_='classic-view__body__item')

print(f"Found {len(product_elements)} product elements.")
cr_category = 'Non-Stick Cookware'

if product_elements:
    for idx, product_element in enumerate(product_elements):
        print(f"Processing product {idx + 1}...")  # Debugging
        
        # Product Name
        product_name_element = product_element.find('p', class_='crux-product-title')
        product_name = product_name_element.text.strip() if product_name_element else None

        # Product Score
        product_score_element = product_element.find('div', class_='ratings-overall-score')
        score_text = product_score_element.find('p', class_='crux-numbers-score') if product_score_element else None
        product_score = score_text.get_text(strip=True) if score_text else None

        # Product Link
        product_link_element = product_element.find('a', href=True)
        product_link = 'https://www.consumerreports.org' + product_link_element['href'] if product_link_element else None

        print(f'Product: {product_name}, Link: {product_link}, Score: {product_score}')  # Debugging
        
        # Fetch the Amazon link
        if product_link:  
            driver.get(product_link)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            
            amazon_link_element = soup.find('a', href=lambda href: href and 'amazon.com' in href.lower())
            amazon_link = amazon_link_element['href'] if amazon_link_element else None
            
            asin_match = re.search(r'/dp/(B[0-9A-Z]{9})', amazon_link) if amazon_link else None
            asin = asin_match.group(1) if asin_match else None
            
            # Only append if product_name, product_link, and product_score are not None
            if product_name and product_link and product_score:
                temp_df = pd.DataFrame([{"CR_Category": cr_category, 
                                         "Product_Name": product_name, 
                                         "Product_Link": product_link, 
                                         "Amazon_Link": amazon_link, 
                                         "ASIN": asin, 
                                         "CR_Score": product_score}])
                df_scrape = pd.concat([df_scrape, temp_df], ignore_index=True)

print("Scraping completed.")
driver.quit()




Login successful!
URL successful!
Found 833 product elements.
Processing product 1...
Product: Caraway Ceramic-Coated Non-Stick, Link: https://www.consumerreports.org/home-garden/cookware/caraway-ceramic-coated-non-stick/m407538/, Score: 88
Processing product 2...
Product: None, Link: None, Score: None
Processing product 3...
Product: None, Link: None, Score: None
Processing product 4...
Product: None, Link: None, Score: None
Processing product 5...
Product: None, Link: None, Score: None
Processing product 6...
Product: None, Link: None, Score: None
Processing product 7...
Product: None, Link: None, Score: None
Processing product 8...
Product: None, Link: None, Score: None
Processing product 9...
Product: None, Link: None, Score: None
Processing product 10...
Product: None, Link: None, Score: None
Processing product 11...
Product: None, Link: None, Score: None
Processing product 12...
Product: None, Link: None, Score: None
Processing product 13...
Product: None, Link: None, Score: None

In [20]:
df_scrape.head()

Unnamed: 0,CR_Category,Product_Name,Product_Link,Amazon_Link,ASIN,CR_Score
0,Non-Stick Cookware,Caraway Ceramic-Coated Non-Stick,https://www.consumerreports.org/home-garden/co...,https://www.amazon.com/s/?field-keywords=Caraw...,,88
1,Non-Stick Cookware,GreenPan Reserve Ceramic Nonstick,https://www.consumerreports.org/home-garden/co...,https://www.amazon.com/dp/B08C8TWN1B?tag=dprmd...,B08C8TWN1B,87
2,Non-Stick Cookware,Oxo Ceramic Professional Non-Stick,https://www.consumerreports.org/home-garden/co...,https://www.amazon.com/dp/B09J5CX3XZ?tag=dprmd...,B09J5CX3XZ,87
3,Non-Stick Cookware,Member’s Mark (Sam’s Club) Hard Anodized Aluminum,https://www.consumerreports.org/home-garden/co...,https://www.amazon.com/s/?field-keywords=Membe...,,86
4,Non-Stick Cookware,Zwilling Vitale,https://www.consumerreports.org/home-garden/co...,,,85


In [11]:
df_scrape.head()

Unnamed: 0,CR_Category,Product_Name,Product_Link,Amazon_Link,ASIN,CR_Score
0,Noise Cancelling Headphones,Coway Airmega ProX 3522F,https://www.consumerreports.org/appliances/air...,https://www.amazon.com/dp/B0BZTQ16B5?tag=dprmd...,B0BZTQ16B5,93
1,Noise Cancelling Headphones,Blueair Classic 605,https://www.consumerreports.org/appliances/air...,https://www.amazon.com/dp/B01L9UT1YU?tag=dprmd...,B01L9UT1YU,88
2,Noise Cancelling Headphones,Alen BreatheSmart 75i Pure,https://www.consumerreports.org/appliances/air...,https://www.amazon.com/dp/B07FVYF77S?tag=dprmd...,B07FVYF77S,87
3,Noise Cancelling Headphones,Blueair Blue Pure 211i Max,https://www.consumerreports.org/appliances/air...,https://www.amazon.com/dp/B0BN2MGV5H?tag=dprmd...,B0BN2MGV5H,86
4,Noise Cancelling Headphones,Blueair Blue Pure 211+,https://www.consumerreports.org/appliances/air...,https://www.amazon.com/dp/B073WJL99W?tag=dprmd...,B073WJL99W,85


In [5]:
import os
print(os.getcwd())

c:\Users\willf\OneDrive\Documents\NYDSA\Reviews Project


In [21]:
import requests
import time

api_endpoint_url = "https://api.rainforestapi.com/request"
amazon_domain = "amazon.com"
request_type = "product"
api_key = "0D7110B1CEF0486E87481A4D231B98A2"

results = []

for index, row in df_scrape.iterrows():
    asin = row["ASIN"]
    
    # Skip the current loop iteration if ASIN is None
    if not asin:
        print(f"Skipping row {index} due to missing ASIN.")
        continue

    params = {
        "api_key": api_key,
        "amazon_domain": amazon_domain, 
        "asin": asin,
        "type": request_type
    }

    try:
        response = requests.get(api_endpoint_url, params=params)
        response.raise_for_status()  # raise an HTTPError if the HTTP request returned an unsuccessful status code
        
        data = response.json()
        print(data)  # to inspect the 'data' variable
        product_data = data.get("product", {})
        
        df_scrape.at[index, "Rating"] = product_data.get("rating")
        df_scrape.at[index, "Ratings_Total"] = product_data.get("ratings_total")
        
        rating_breakdown = product_data.get("rating_breakdown", {})
        for star_category, star_data in rating_breakdown.items():
            df_scrape.at[index, f"{star_category.capitalize()}_Percentage"] = star_data.get("percentage")
            df_scrape.at[index, f"{star_category.capitalize()}_Count"] = star_data.get("count")
        
    except requests.RequestException as req_e:
        print(f"API request error for ASIN {asin}: {req_e}")
    except Exception as e:
        print(f"Error processing data for ASIN {asin}: {e}")
    
    time.sleep(1)  # delay of 1 second between API calls to ensure rate limits





Skipping row 0 due to missing ASIN.
{'request_info': {'success': True, 'credits_used': 317, 'credits_used_this_request': 1, 'credits_remaining': 183, 'credits_reset_at': '2023-11-11T05:04:58.000Z'}, 'request_parameters': {'amazon_domain': 'amazon.com', 'asin': 'B08C8TWN1B', 'type': 'product'}, 'request_metadata': {'created_at': '2023-10-31T17:26:02.628Z', 'processed_at': '2023-10-31T17:26:11.571Z', 'total_time_taken': 8.94, 'amazon_url': 'https://www.amazon.com/dp/B08C8TWN1B?th=1&psc=1'}, 'product': {'title': 'GreenPan Reserve Hard Anodized Healthy Ceramic Nonstick 10 Piece Cookware Pots and Pans Set, Gold Handle, PFAS-Free, Dishwasher Safe, Oven Safe, Blush Pink 10 Piece Cookware Pots and Pans Set Blush Pink', 'search_alias': {'title': 'Home & Kitchen', 'value': 'garden'}, 'title_excluding_variant_name': 'GreenPan Reserve Hard Anodized Healthy Ceramic Nonstick 10 Piece Cookware Pots and Pans Set, Gold Handle, PFAS-Free, Dishwasher Safe, Oven Safe, Blush Pink', 'keywords': 'GreenPan,Re