### Author: Zorah Zafari 
### Date: 01/26/2025
### This code opens each URL and scrapes data from each individual firm 

# In [2]:  (Jupyter notebook cell marker left over from the export)
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import csv

# Set up the Selenium WebDriver.
# One module-level Chrome session is shared by get_firm_urls() and
# scrape_firm_details() (both call driver.get), and it is shut down by the
# driver.quit() call at the end of the script.
driver = webdriver.Chrome()

def get_firm_urls(page_url):
    """
    Fetches all firm profile URLs from a given listing page.

    The page is loaded through the shared Selenium ``driver`` so that the user
    can solve a CAPTCHA by hand and so that dynamically rendered content is
    present in the HTML that gets parsed.

    Args:
        page_url: URL of the listing page to open.

    Returns:
        A list of absolute profile URLs (every anchor whose href contains
        '/profile/'), in the order they first appear on the page and without
        duplicates.
    """
    # Local import keeps this function self-contained; urljoin leaves absolute
    # hrefs untouched and resolves relative ones against the site root.
    from urllib.parse import urljoin

    driver.get(page_url)
    print(f"Accessing page: {page_url}")

    # Wait for user to solve CAPTCHA manually, if prompted
    input("If prompted, please solve the CAPTCHA and press Enter to continue...")
    time.sleep(5)  # Allow additional time for the page to fully load

    # Parse the rendered page with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    firm_urls = []
    seen = set()  # Profile URLs already collected from this page
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '/profile/' not in href:  # Adjust pattern as necessary for the specific structure
            continue

        # Plain concatenation would corrupt hrefs that are already absolute
        # (or that lack a leading slash); urljoin handles every case.
        full_url = urljoin('https://www.bbb.org', href)

        # The same profile can be linked more than once on a page; keep only
        # the first occurrence so each firm is scraped once.
        if full_url in seen:
            continue
        seen.add(full_url)

        print(f"Found URL: {full_url}")  # Debugging output to verify URLs
        firm_urls.append(full_url)
    return firm_urls

def scrape_firm_details(firm_url):
    """
    Extracts specific details from a firm's page.

    The page is loaded with the shared Selenium ``driver`` and the rendered
    HTML is parsed with BeautifulSoup. Each field is extracted independently,
    so a problem with one field does not discard the others.

    Args:
        firm_url: URL of the firm's profile page.

    Returns:
        A dict with the keys 'URL', 'Name', 'Address', 'Website',
        'Business Started' and 'Phone'. Any field that is missing from the
        page, or whose extraction fails, is set to 'N/A'.
    """
    driver.get(firm_url)
    print(f"Scraping data from: {firm_url}")
    time.sleep(3)  # Allow time for the page to fully load

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    def _text_or_na(tag):
        # Stripped text of a tag, or the 'N/A' placeholder when the tag is absent.
        return tag.text.strip() if tag is not None else 'N/A'

    def _name():
        return _text_or_na(
            soup.find('span', class_='bds-h2 bpr-header-business-name', id='businessName')
        )

    def _address():
        container = soup.find('div', class_='bpr-overview-address')
        if container is None:
            return 'N/A'
        return ' '.join(p.text.strip() for p in container.find_all('p', class_='bds-body'))

    def _website():
        link = soup.find('a', href=True, rel='nofollow noreferrer')
        return link['href'] if link is not None else 'N/A'

    def _business_started():
        # `string=` is the current name of the deprecated `text=` filter.
        label = soup.find('dt', string='Business Started Locally:')
        # The <dt> label may exist without a following <dd>; treat that as missing.
        value = label.find_next_sibling('dd') if label is not None else None
        return _text_or_na(value)

    def _phone():
        # Phone numbers are rendered as links whose text starts with '('.
        return _text_or_na(
            soup.find('a', href=True, string=lambda s: s and s.startswith('('))
        )

    extractors = {
        'Name': _name,
        'Address': _address,
        'Website': _website,
        'Business Started': _business_started,
        'Phone': _phone,
    }

    details = {'URL': firm_url}
    for field, extract in extractors.items():
        try:
            details[field] = extract()
        except Exception as e:
            # Deliberately broad: scraping is best-effort, and unexpected markup
            # in one field should cost only that field, not the whole record.
            print(f"Error scraping {field} from {firm_url}: {e}")
            details[field] = 'N/A'
    return details

# Main Scraping Loop
# Walks the paginated listing, scrapes every firm found, writes the results to
# a CSV file and finally closes the browser. The CSV write and driver.quit()
# run in `finally` blocks so that a failure part-way through (network error,
# closed browser window, Ctrl+C) still saves the rows collected so far and
# does not leave a Chrome process running.
base_url = 'https://www.bbb.org/us/category/payday-loans?page='
csv_filename = 'firm_details.csv'
all_firm_details = []
scraped_urls = set()  # Firm URLs already visited, to avoid duplicate rows

try:
    for page in range(1, 16):  # Loop through pages 1 to 15
        page_url = base_url + str(page)
        firm_urls = get_firm_urls(page_url)

        for firm_url in firm_urls:
            # A firm can be listed more than once (on one page or across
            # pages); scrape it only the first time it is seen.
            if firm_url in scraped_urls:
                continue
            scraped_urls.add(firm_url)

            details = scrape_firm_details(firm_url)
            all_firm_details.append(details)
            time.sleep(2)  # Delay between requests to each firm's page
finally:
    try:
        # Save extracted data to a CSV file
        with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=['URL', 'Name', 'Address', 'Website', 'Business Started', 'Phone'])
            writer.writeheader()  # Write header row
            writer.writerows(all_firm_details)  # Write rows of firm details

        print(f"Total firms scraped: {len(all_firm_details)}")
        print(f"Details saved to {csv_filename}")
    finally:
        # Close the WebDriver when done, even if scraping or saving failed
        driver.quit()


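# Output of the original notebook run:
#   ModuleNotFoundError: No module named 'selenium'
# The `selenium` package was not installed in that environment; it must be
# installed before this script can run.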