# BulkSupplements A-Z Scraper
This notebook scrapes product titles and URLs from the BulkSupplements A-Z page and saves them to a CSV file.

In [None]:
%pip install requests beautifulsoup4 pandas

In [None]:
# This finds the headers of the index

import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd  # For easier CSV handling
# Download the A-Z index page and locate its per-section wrappers.
url = 'https://www.bulksupplements.com/pages/products-a-z'
# Without a timeout requests waits indefinitely on an unresponsive server,
# which would hang the whole notebook run.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# The index is expected inside a single <div class="az-list">; only its
# direct <div class="az-list-main-wrap"> children are collected.
az_list = soup.find('div', class_='az-list')
if az_list is None:
    raise Exception('az-list element not found')
az_main_wraps = az_list.find_all('div', class_='az-list-main-wrap', recursive=False)
print('Number of az-list-main-wrap divs:', len(az_main_wraps))

In [None]:
# Build one record per product link found in the A-Z index: the link text,
# its href, and the id of the header span of the section that contains it.
excluded_terms = ('Capsules', 'Softgels', 'Pocket', 'Performance', 'Machine')
data = []
for section in az_main_wraps:
    # Prefer the regular header span and fall back to the "-first" variant.
    heading = (section.find('span', class_='az-list-header')
               or section.find('span', class_='az-list-header-first'))
    header_id = heading['id'] if heading and heading.has_attr('id') else None
    # Header ids carry an "az-" prefix that is dropped from the record.
    if header_id is not None and header_id.startswith('az-'):
        header_id = header_id[3:]

    link_list = section.find('ul', class_='az-list-columns')
    if link_list:
        for anchor in link_list.find_all('a'):
            title = anchor.get_text(strip=True)
            href = anchor.get('href')
            # Skip links without text or target.
            if not (title and href):
                continue
            # Skip product types that are out of scope for this scrape.
            if any(term in title for term in excluded_terms):
                continue
            data.append({'title': title, 'url': href, 'header_id': header_id})

print(f'Extracted {len(data)} products (excluding "Capsules", "Softgels", "Pocket", "Performance", and "Machine")')

# Preview the first five records, one field per line.
for item in data[:5]:
    print(
        f"Header ID: {item['header_id']}",
        f"Name: {item['title']}",
        f"URL: {item['url']}",
        '-' * 40,
        sep='\n',
    )

In [None]:
# Test scraping for a single product URL and display only specified headers and their content
product_url = 'https://www.bulksupplements.com/products/wheatgrass-powder-2'  # Enter a product URL here
if product_url:
    # A timeout prevents the cell from hanging on an unresponsive server.
    response = requests.get(product_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # --- Scrape Variants ---
    # soup.find returns None when the page has no <variant-picker> element,
    # so guard before calling .select on it instead of crashing the cell.
    variant_picker = soup.find('variant-picker')
    if variant_picker is not None:
        variants = [
            span.get_text(strip=True)
            for span in variant_picker.select('.variant-picker__option-values span')
        ]
    else:
        variants = []
    print('Variants found:', variants)

    # Find div with id containing 'shopify-section-template'
    section_div = soup.find('div', id=lambda x: x and 'shopify-section-template' in x)
    if section_div:
        # Find x-tabs inside this div
        x_tabs = section_div.find('x-tabs')
        if x_tabs:
            # Define the headers we are interested in
            interested_headers = [
                "Serving Size",
                "Other Ingredients",
                "Allergen Information",
                "Free of",
                "Suggested Use"
            ]
            headers_data = []
            # Search only <div> and <p> with role='tabpanel' inside x-tabs for <b> tags
            for tabpanel in x_tabs.find_all(['div', 'p'], attrs={'role': 'tabpanel'}):
                for b_tag in tabpanel.find_all('b'):
                    # The bold text is the header label, usually followed by a colon.
                    header_text_with_colon = b_tag.get_text(strip=True)
                    header = header_text_with_colon.strip(':')
                    if header not in interested_headers:
                        continue
                    # The enclosing <p>/<div> holds both the label and its value.
                    parent_tag = b_tag.find_parent(['p', 'div'])
                    if parent_tag:
                        # Drop the first occurrence of the label to keep only the value.
                        content = parent_tag.get_text(separator=' ', strip=True).replace(header_text_with_colon, '', 1).strip()
                        headers_data.append({'name': header, 'url': product_url, 'content': content})
            if headers_data:
                print('--- Supplemental Facts ---')
                for entry in headers_data:
                    print(f"Header: {entry['name']}")
                    print(f"Content: {entry['content']}")
                    print('-' * 40)
            else:
                print('No interested headers found in tabpanel elements in x-tabs.')
        else:
            print('x-tabs element not found')
    else:
        print('shopify-section-template div not found')
else:
    print('Please enter a product URL in product_url.')


In [None]:
import time
import pprint
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd  

# --- Step 1: Download the A-Z index again and locate its az-list-main-wrap sections ---
url = 'https://www.bulksupplements.com/pages/products-a-z'
# Without a timeout requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Only direct <div class="az-list-main-wrap"> children of the az-list
# container are collected (recursive=False).
az_list = soup.find('div', class_='az-list')
if az_list is None:
    raise Exception('az-list element not found')
az_main_wraps = az_list.find_all('div', class_='az-list-main-wrap', recursive=False)
print('Number of az-list-main-wrap divs:', len(az_main_wraps))

# --- Step 2 (preparation): headers to look for on each product page, and the
# filtered list of products taken from the A-Z index ---
interested_headers = [
    "Serving Size",
    "Other Ingredients",
    "Allergen Information",
    "Free of",
    "Suggested Use",
]

# NOTE(review): unlike the other listing cells in this notebook, 'Machine' is
# not excluded here - confirm whether that is intended.
skipped_terms = ('Capsules', 'Softgels', 'Pocket', 'Performance')

initial_products = []
for section in az_main_wraps:
    # Prefer the regular header span and fall back to the "-first" variant.
    heading = (section.find('span', class_='az-list-header')
               or section.find('span', class_='az-list-header-first'))
    header_id = heading['id'] if heading and heading.has_attr('id') else None
    # Header ids carry an "az-" prefix that is dropped from the record.
    if header_id is not None and header_id.startswith('az-'):
        header_id = header_id[3:]

    link_list = section.find('ul', class_='az-list-columns')
    if link_list:
        for anchor in link_list.find_all('a'):
            title = anchor.get_text(strip=True)
            href = anchor.get('href')
            # Skip links without text or target.
            if not (title and href):
                continue
            if any(term in title for term in skipped_terms):
                continue
            initial_products.append({'title': title, 'url': href, 'header_id': header_id})

print(f'Extracted {len(initial_products)} products (excluding "Capsules", "Softgels", "Pocket", and "Performance")')

# Visit every product page and collect the interested headers into one
# record per product. Failures are recorded as rows with an 'error' column
# instead of aborting the run.
scraped_data = []
for product in initial_products:
    print(f"Scraping: {product['title']}")

    # Construct full URL: hrefs that do not start with 'http' are treated as
    # site-relative paths.
    prod_url = product['url']
    if not prod_url.startswith('http'):
        prod_url = 'https://www.bulksupplements.com' + prod_url

    try:
        # A timeout keeps one unresponsive page from stalling the whole loop.
        response = requests.get(prod_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # --- Scrape Supplemental Facts ---
        # Every interested header starts as None so each record has the
        # same keys even when a header is absent from the page.
        supplemental_facts = dict.fromkeys(interested_headers)

        x_tabs = soup.find('x-tabs')
        if x_tabs is not None:
            for tabpanel in x_tabs.find_all(['div', 'p'], attrs={'role': 'tabpanel'}):
                for b_tag in tabpanel.find_all('b'):
                    header = b_tag.get_text(strip=True).strip(':')
                    if header not in interested_headers:
                        continue
                    parent_tag = b_tag.find_parent(['p', 'div'])
                    if parent_tag:
                        # Render the label exactly the way it appears inside
                        # the parent's text (stripped pieces joined by a
                        # space); the raw, unstripped b_tag text may not be
                        # a substring and would leave the label in place.
                        label_text = b_tag.get_text(separator=' ', strip=True)
                        content = parent_tag.get_text(separator=' ', strip=True).replace(label_text, '', 1).strip()
                        supplemental_facts[header] = content

        # --- Combine data ---
        base_product_details = {
            'title': product['title'],
            'url': prod_url,
            'header_id': product['header_id'],
            **supplemental_facts,
        }

        scraped_data.append(base_product_details)

    except Exception as e:
        # Best effort: keep a row for the failed product so it can be retried.
        print(f"  - Error scraping {product['title']}: {e}")
        error_details = {
            'title': product['title'],
            'url': prod_url,
            'header_id': product['header_id'],
            'error': str(e)
        }
        scraped_data.append(error_details)

    time.sleep(1)  # Be polite to the server

# --- Step 3: Print the final combined data ---
# The loop above walks every section of the A-Z index, not only the "A"
# section, so the summary reports the actual number of records.
print("\n--- Scraping Complete ---")
print(f"Final extracted data for {len(scraped_data)} products:")
pprint.pprint(scraped_data)


In [None]:
# --- Step 4: Save the data to a CSV file ---
if not scraped_data:
    print("\nNo data to save to CSV.")
else:
    df = pd.DataFrame(scraped_data)

    # Fixed leading columns first; anything else present in the frame
    # (for example 'error') is appended after them in its existing order.
    column_order = ['title', 'url', 'header_id'] + interested_headers
    trailing_columns = [col for col in df.columns if col not in column_order]
    # reindex (rather than plain selection) also creates any leading column
    # that is absent from the frame.
    df = df.reindex(columns=column_order + trailing_columns)

    csv_filename = 'bulksupp_products_A.csv'
    df.to_csv(csv_filename, index=False)
    print(f"\nSuccessfully saved {len(df)} products to {csv_filename}")


## New Selenium Approach

In [None]:
%pip install selenium

In [None]:
# This scrapes prices from a product page using Selenium and Firefox WebDriver

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options

# Start browser
service = Service()
options = Options()
# The `options.headless` setter was deprecated in Selenium 4.8 and removed in
# later 4.x releases, where assigning it silently does nothing. Passing the
# browser argument works on old and new versions alike.
options.add_argument('-headless')  # Run browser in headless mode for notebooks
driver = webdriver.Firefox(service=service, options=options)

try:
    # Open the page
    driver.get('https://www.bulksupplements.com/products/wheatgrass-powder-2')  # Replace with your URL

    # Find all price elements inside the container
    prices = driver.find_elements(By.CLASS_NAME, 'product-info__price')

    # Print prices
    for price in prices:
        print(price.text)
finally:
    # Always close the browser, even if navigation or lookup raised,
    # so no orphaned Firefox process is left behind.
    driver.quit()

In [None]:
# This scrapes different sizes and their prices from a product page using Selenium and Firefox WebDriver

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import time

service = Service()
options = Options()
# The `options.headless` setter was deprecated in Selenium 4.8 and removed in
# later 4.x releases; the browser argument works on old and new versions.
options.add_argument('-headless')
driver = webdriver.Firefox(service=service, options=options)

try:
    driver.get('https://www.bulksupplements.com/products/wheatgrass-powder-2')
    time.sleep(2)  # Give the page time to render

    # Step 1: Select "Powder" if there's a type picker
    try:
        powder_button = driver.find_element(By.XPATH, "//button[contains(., 'Powder')]")
        powder_button.click()
        time.sleep(1)
    except Exception:
        print("Powder button not found or already selected.")

    # Step 2: Find the correct fieldset for "Size:"
    size_fieldset = None
    fieldsets = driver.find_elements(By.CSS_SELECTOR, "fieldset.variant-picker__option")
    for fs in fieldsets:
        try:
            legend = fs.find_element(By.TAG_NAME, "legend")
            if legend.text.strip() == "Size:":
                size_fieldset = fs
                break
        except Exception:
            continue  # Fieldset without a readable legend

    if size_fieldset:
        size_radios = size_fieldset.find_elements(By.CSS_SELECTOR, "input[type='radio']")
        for radio in size_radios:
            try:
                label = size_fieldset.find_element(By.CSS_SELECTOR, f"label[for='{radio.get_attribute('id')}']")
                if not label.text.strip():
                    continue  # Skip if label text is empty
            except Exception:
                continue  # Skip if label not found
            # Click through JavaScript rather than radio.click().
            driver.execute_script("arguments[0].click();", radio)
            time.sleep(1)  # Let the displayed price update
            try:
                price = driver.find_element(By.CLASS_NAME, 'product-info__price').text
            except Exception:
                continue  # Skip if price not found
            print(f"Size: {label.text} | Price: {price}")
    else:
        print("No fieldset with legend 'Size:' found.")
finally:
    # Always close the browser, even when a step above raised.
    driver.quit()

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import time

service = Service()
options = Options()
# The `options.headless` setter was deprecated in Selenium 4.8 and removed in
# later 4.x releases; the browser argument works on old and new versions.
options.add_argument('-headless')
driver = webdriver.Firefox(service=service, options=options)

excluded_terms = ['Capsules', 'Softgels', 'Pocket', 'Performance', 'Machine']
data = []
try:
    driver.get('https://www.bulksupplements.com/pages/products-a-z')
    time.sleep(2)  # Wait for page to load

    # Find the az-list container
    az_list = driver.find_element(By.CLASS_NAME, 'az-list')
    az_main_wraps = az_list.find_elements(By.CSS_SELECTOR, 'div.az-list-main-wrap')

    for wrap in az_main_wraps:
        # Find header id (az-list-header or az-list-header-first).
        # find_elements returns an empty list instead of raising, so no
        # exception handling is needed for a missing header.
        header_matches = (
            wrap.find_elements(By.CLASS_NAME, 'az-list-header')
            or wrap.find_elements(By.CLASS_NAME, 'az-list-header-first')
        )
        header_id = header_matches[0].get_attribute('id') if header_matches else None
        if header_id and header_id.startswith('az-'):
            header_id = header_id[3:]

        column_lists = wrap.find_elements(By.CLASS_NAME, 'az-list-columns')
        if not column_lists:
            continue  # Section without a product list

        try:
            for a in column_lists[0].find_elements(By.TAG_NAME, 'a'):
                title = a.text.strip()
                href = a.get_attribute('href')
                if title and href and all(x not in title for x in excluded_terms):
                    data.append({'title': title, 'url': href, 'header_id': header_id})
        except Exception as exc:
            # Best effort: skip the rest of this section but say why. A bare
            # `except:` here would also swallow KeyboardInterrupt.
            print(f"Skipping rest of section {header_id!r}: {exc}")
            continue
finally:
    # Always close the browser, even when a step above raised.
    driver.quit()

print(f'Extracted {len(data)} products (excluding unwanted types)')
for item in data[:1]:
    print(f"Header ID: {item['header_id']}")
    print(f"Name: {item['title']}")
    print(f"URL: {item['url']}")
    print('-' * 40)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import time
import pandas as pd

# Headers whose content is extracted from each product page.
interested_headers = [
    "Serving Size",
    "Other Ingredients",
    "Allergen Information",
    "Free of",
    "Suggested Use"
]

service = Service()
options = Options()
# The `options.headless` setter was deprecated in Selenium 4.8 and removed in
# later 4.x releases, where assigning it silently does nothing. Passing the
# browser argument works on old and new versions alike.
options.add_argument('-headless')
driver = webdriver.Firefox(service=service, options=options)

# Step 1: Scrape product URLs from A-Z page
driver.get('https://www.bulksupplements.com/pages/products-a-z')
time.sleep(2)  # Wait for page to load

# Find the az-list container
az_list = driver.find_element(By.CLASS_NAME, 'az-list')
az_main_wraps = az_list.find_elements(By.CSS_SELECTOR, 'div.az-list-main-wrap')

excluded_terms = ['Capsules', 'Softgels', 'Pocket', 'Performance', 'Machine']
data = []
for wrap in az_main_wraps:
    # Find header id (az-list-header or az-list-header-first).
    # find_elements returns an empty list instead of raising, so no
    # exception handling is needed for a missing header.
    header_matches = (
        wrap.find_elements(By.CLASS_NAME, 'az-list-header')
        or wrap.find_elements(By.CLASS_NAME, 'az-list-header-first')
    )
    header_id = header_matches[0].get_attribute('id') if header_matches else None
    if header_id and header_id.startswith('az-'):
        header_id = header_id[3:]

    column_lists = wrap.find_elements(By.CLASS_NAME, 'az-list-columns')
    if not column_lists:
        continue  # Section without a product list

    try:
        for a in column_lists[0].find_elements(By.TAG_NAME, 'a'):
            title = a.text.strip()
            href = a.get_attribute('href')
            if title and href and all(x not in title for x in excluded_terms):
                data.append({'title': title, 'url': href, 'header_id': header_id})
    except Exception as exc:
        # Best effort: skip the rest of this section but say why. A bare
        # `except:` here would also swallow KeyboardInterrupt.
        print(f"Skipping rest of section {header_id!r}: {exc}")
        continue

print(f'Extracted {len(data)} products (excluding unwanted types)')

# Step 2: Visit each product and extract interested headers and size/price pairs.
# Each record in `data` is updated in place with the header contents and a
# 'Pricing' list of {'size', 'price'} dicts.
results = []
sale_prefix = "Sale price"
try:
    for item in data:  # Slice `data` here (e.g. data[:5]) for a quick demo run
        start_time = time.time()  # Start timer
        url = item['url']
        title = item['title']
        driver.get(url)
        time.sleep(2)  # Give the page time to render
        entry = {}

        # Supplemental Facts Extraction
        try:
            # Find and click the "Supplemental Facts" tab
            tabs = driver.find_elements(By.CSS_SELECTOR, "button[role='tab']")
            for tab in tabs:
                if "Supplemental Facts" in tab.text:
                    tab.click()
                    time.sleep(1)
                    break

            # Find the active tabpanel
            tabpanels = driver.find_elements(By.CSS_SELECTOR, "[role='tabpanel']")
            for tabpanel in tabpanels:
                if not tabpanel.is_displayed():
                    continue
                panel_text = tabpanel.text
                lines = panel_text.split('\n')
                # Match interested headers and extract their content
                for header in interested_headers:
                    if header not in panel_text:
                        continue
                    for i, line in enumerate(lines):
                        if line.strip().startswith(header):
                            # Get the content after the header (remove header and colon)
                            content = line.replace(header, '', 1).replace(':', '', 1).strip()
                            # Value may sit on the line after the header.
                            if not content and i + 1 < len(lines):
                                content = lines[i + 1].strip()
                            entry[header] = content
                break  # Only process the first visible tabpanel

        except Exception as e:
            print(f"  Error: {e}")

        item.update(entry)

        # Pricing Information
        # Select "Powder" if there's a type picker
        try:
            powder_button = driver.find_element(By.XPATH, "//button[contains(., 'Powder')]")
            powder_button.click()
            time.sleep(1)
        except Exception:
            # Best effort: no Powder button, or it could not be clicked.
            pass

        variations = []

        # Find the correct fieldset for "Size:"
        size_fieldset = None
        fieldsets = driver.find_elements(By.CSS_SELECTOR, "fieldset.variant-picker__option")
        for fs in fieldsets:
            try:
                legend = fs.find_element(By.TAG_NAME, "legend")
                if legend.text.strip() == "Size:":
                    size_fieldset = fs
                    break
            except Exception:
                continue  # Fieldset without a readable legend

        if size_fieldset:
            size_radios = size_fieldset.find_elements(By.CSS_SELECTOR, "input[type='radio']")
            for radio in size_radios:
                try:
                    label = size_fieldset.find_element(By.CSS_SELECTOR, f"label[for='{radio.get_attribute('id')}']")
                    if not label.text.strip():
                        continue  # Skip if label text is empty
                except Exception:
                    continue  # Skip if label not found
                driver.execute_script("arguments[0].click();", radio)
                time.sleep(1)  # Let the displayed price update
                try:
                    price = driver.find_element(By.CLASS_NAME, 'product-info__price').text.strip()
                except Exception:
                    continue  # Skip if price not found
                # str.strip("Sale price") removes any of those *characters*
                # from both ends, which can eat parts of the value; remove
                # the literal prefix instead.
                if price.startswith(sale_prefix):
                    price = price[len(sale_prefix):].strip()
                variations.append({'size': label.text, 'price': price})
        else:
            print("No fieldset with legend 'Size:' found.")

        item.update({'Pricing': variations})

        print(f"Scraped: {title} (Time: {(time.time() - start_time):.2f} seconds)")
finally:
    # Always close the browser, even when a page visit raised.
    driver.quit()

# --- Step 4: Save the data to a CSV file ---
df = pd.DataFrame(data)

# Fixed leading columns first; every other column already in the frame
# (for example 'Pricing') follows in its existing order. reindex also
# creates any leading column that no record provided.
column_order = ['title', 'url', 'header_id'] + interested_headers
remaining_columns = [col for col in df.columns if col not in column_order]
df = df.reindex(columns=column_order + remaining_columns)

csv_filename = 'bulksupp_products.csv'
df.to_csv(csv_filename, index=False)
print(f"\nSuccessfully saved {len(df)} products to {csv_filename}")



Extracted 480 products (excluding unwanted types)
Scraped: 5-HTP (Time: 14.22 seconds)

Successfully saved 480 products to bulksupp_products.csv
