In [1]:
# Libraries 
import requests
from bs4 import BeautifulSoup
import time, sys, os, re, io

In [7]:
# Global Variables
# Catalogue listing URL; the page number is appended right after the trailing 'p='.
URL = 'https://gymbeam.sk/vsetky-produkty?p='
# Divisor used to turn the total product count into a number of listing pages
# (presumably the number of products shown per page — verify against the site).
PRODS_ON_PAGE = 30

In [6]:
# Listing-page URL template of the GymBeam e-shop; the '{page}' placeholder is
# filled in later with str.format(page=<page number>).
base_url = URL + '{page}'

In [8]:
# Calculate the number of pages to scrape
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly instead of silently parsing an error page.
first_page = requests.get(base_url.format(page=1), timeout=30)
first_page.raise_for_status()
soup = BeautifulSoup(first_page.text, 'html.parser')

total_tag = soup.find('span', class_='toolbar-number-total')
if total_tag is None:
    raise RuntimeError(
        "Could not find the total product count "
        "('span.toolbar-number-total') on the first listing page"
    )
# Keep only the digits so surrounding whitespace or thousands separators
# do not break the int() conversion.
total_products = int(re.sub(r'\D', '', total_tag.text))

# Ceiling division: `total // PRODS_ON_PAGE + 1` would request one page too many
# whenever the total is an exact multiple of the page size.
NUM_PAGES = (total_products + PRODS_ON_PAGE - 1) // PRODS_ON_PAGE
print(f"Total number of products: {total_products}")
print(f"Total number of pages: {NUM_PAGES}")

Total number of products: 3776
Total number of pages: 126


In [4]:
# Function to get links of all products
def get_product_links(base_url, NUM_PAGES, delay=1, timeout=30):
    """Collect the product-page links from every catalogue listing page.

    Args:
        base_url: URL template containing a ``{page}`` placeholder.
        NUM_PAGES: Number of listing pages to visit; pages ``1..NUM_PAGES``
            are requested in order.
        delay: Seconds to sleep after each request so the server is not hit
            too frequently.
        timeout: Seconds to wait for each HTTP response before the page is
            treated as failed.

    Returns:
        A list with the ``href`` of every ``<a class="product-item-click">``
        found, in page order. Pages that cannot be retrieved are skipped and
        reported on stdout; anchors without an ``href`` are ignored.
    """
    all_links = []

    for page in range(1, NUM_PAGES + 1):
        url = base_url.format(page=page)

        # A single unreachable page must not abort the whole crawl.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve page {page}: {exc}")
            response = None

        if response is not None:
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                anchors = soup.find_all('a', class_='product-item-click')
                # .get() instead of ['href']: an anchor without href is skipped
                # rather than raising KeyError.
                hrefs = (anchor.get('href') for anchor in anchors)
                all_links.extend(href for href in hrefs if href)
            else:
                print(f"Failed to retrieve page {page} (HTTP {response.status_code})")

        # Pause after every request, including failed ones, so errors do not
        # turn the loop into a burst of back-to-back requests.
        time.sleep(delay)

    print("All pages retrieved")
    return all_links

In [5]:
# Function to get details about product
def get_product_details(product_url,
                        excluded_categories=('Športové oblečenie', 'Príslušenstvo'),
                        timeout=30):
    """Scrape the details of a single product page.

    Args:
        product_url: Absolute URL of the product page.
        excluded_categories: Breadcrumb texts that mark a product to be
            skipped (by default the clothing and accessories categories).
            Pass an empty tuple to scrape every product.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with the keys ``name``, ``url``, ``description``, ``price``,
        ``dosing_info`` and ``warning_info``; a field is ``None`` when the
        corresponding element is missing from the page. Returns ``None`` when
        the product should be skipped: it belongs to an excluded category, or
        the page could not be retrieved (a message is printed in that case).
    """
    try:
        response = requests.get(product_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve product {product_url}: {exc}")
        return None
    # Do not parse error pages: they would produce a record full of None values.
    if response.status_code != 200:
        print(f"Failed to retrieve product {product_url} (HTTP {response.status_code})")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    if _in_excluded_category(soup, excluded_categories):
        return None

    return {
        'name': _stripped_text(soup.find('h1', class_='page-title')),
        'url': product_url,
        'description': _stripped_text(soup.find('div', class_='product-description')),
        'price': _stripped_text(soup.find('span', {'data-test': 'hp-bestsellers-price'})),
        # Dosing information: first <p> after the "Dávkovanie" heading
        'dosing_info': _section_text(soup, "Dávkovanie", 'p'),
        # Warnings: first <div> after the "Upozornenie" heading
        'warning_info': _section_text(soup, "Upozornenie", 'div'),
    }


def _in_excluded_category(soup, excluded_categories):
    """Return True when any breadcrumb item contains one of ``excluded_categories``."""
    breadcrumb_nav = soup.find('nav', {'data-test': 'pdp-breadcrumbs-all'})
    if breadcrumb_nav is None:
        return False
    for item in breadcrumb_nav.find_all('li', {'data-test': 'pdp-breadcrumbs'}):
        item_text = item.get_text()
        if any(category in item_text for category in excluded_categories):
            return True
    return False


def _stripped_text(tag):
    """Return the whitespace-stripped text of ``tag``, or None when ``tag`` is None."""
    return tag.text.strip() if tag is not None else None


def _section_text(soup, heading, body_tag):
    """Return the text of the first ``body_tag`` after the ``<h2>`` whose string is ``heading``, or None."""
    header = soup.find('h2', string=heading)
    if header is None:
        return None
    body = header.find_next(body_tag)
    return body.text if body is not None else None

In [6]:
# Get all product links
# Requests every listing page in turn (one HTTP request per page), so this cell can take a while.
all_links = get_product_links(base_url, NUM_PAGES)

All pages retrieved


In [7]:
# Ensure the output directory 'out/files/products' exists, creating parent
# directories as needed (exist_ok=True: no error if it is already there).
os.makedirs('out/files/products', exist_ok=True)

In [8]:
# Sanitize the product name to remove characters that are not allowed in file names
def sanitize_filename(name):
    """Turn a product name into a string that is safe to use as a file name.

    Args:
        name: Product name, or ``None`` when the name could not be scraped.

    Returns:
        ``name`` without the characters ``\\ / * ? : " < > |`` and without
        ASCII control characters, stripped of surrounding whitespace.
        ``'NaN'`` is returned when ``name`` is ``None`` or nothing is left
        after cleaning, so the caller never ends up with an empty file name.
    """
    if name is None:
        return 'NaN'
    # \x00-\x1f covers newlines, tabs and other control characters, which are
    # just as invalid in file names as the punctuation listed before them.
    cleaned = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", name).strip()
    return cleaned or 'NaN'

### This part is optional; feel free to output the data in a different format. We needed each product in its own `.txt` file.

In [9]:
# Save the product details to files that will be vectorized via openai
OUTPUT_DIR = 'out/files/products'
counter = 0
failed_links = []

for link in all_links:
    # Failures are handled per product so one bad page does not abort the whole
    # run. `except Exception` (not a bare `except`) lets KeyboardInterrupt through,
    # so the cell can still be stopped manually.
    try:
        product = get_product_details(link)
        if product is None:
            # Product was deliberately skipped by get_product_details
            continue
        sanitized_name = sanitize_filename(product['name'])
        # NOTE: products whose sanitized names collide overwrite each other's file.
        with open(f'{OUTPUT_DIR}/{sanitized_name}.txt', 'w', encoding='utf-8') as f:
            f.write(f"Name: {product['name']}\n")
            f.write(f"URL: {product['url']}\n")
            f.write(f"Description: {product['description']}\n")
            f.write(f"Price: {product['price']}\n")
            f.write(f"Dosing information: {product['dosing_info']}\n")
            f.write(f"Warning information: {product['warning_info']}\n")
    except Exception as exc:
        # Report `link`, not `product`: when get_product_details itself raised,
        # `product` is undefined or still refers to the previous product.
        failed_links.append(link)
        print(f"The wrong product is {link} ({exc!r})")
        continue

    counter += 1
    print(f"Product {product['name']} saved to file, {counter} products saved in total")

if failed_links:
    print(f"{len(failed_links)} products could not be saved")

Product Just Whey - GymBeam saved to file, 1 products saved in total
Product True Whey - GymBeam saved to file, 2 products saved in total
Product 100 % Kreatín monohydrát - GymBeam saved to file, 3 products saved in total
Product 100% Whey Gold Standard - Optimum Nutrition saved to file, 4 products saved in total
Product Magnézium chelát (bisglycinát) - GymBeam saved to file, 5 products saved in total
Product Arašidové maslo - GymBeam saved to file, 6 products saved in total
Product Yum Yum Whey - BeastPink saved to file, 7 products saved in total
Product Vitality complex - GymBeam saved to file, 8 products saved in total
Product Tribulus Terrestris - GymBeam saved to file, 9 products saved in total
Product Crea7in - GymBeam saved to file, 10 products saved in total
Product Vitamín C 1000 mg - GymBeam saved to file, 11 products saved in total
Product Omega 3 - GymBeam saved to file, 12 products saved in total
Product Hydrolyzovaný kolagén RunCollg - GymBeam saved to file, 13 products s