My LinkedIn: https://www.linkedin.com/in/yasir-ech-chammakhy/

My GitHub: https://github.com/yasirech-chammakhy

# Scraping Goldhahn&Sampson Store

This notebook contains the source code used to scrape product data from the Goldhahn&Sampson store website. It combines the code from the previous notebook with code from a second notebook, which collects additional fields (description, brand, and product ID) that the first one did not capture.

In [1]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
def _parse_product_wrap(product_wrap, category, image_base_url):
    """Build one result row from a single ``div.product_wrap`` tile.

    Every lookup is scoped to ``product_wrap`` so that each row describes its
    own product. A field whose element is missing from the tile is recorded
    as ``None`` instead of aborting the whole scrape.
    """
    name_tag = product_wrap.find('h2', class_='product_name')
    link_tag = name_tag.find('a') if name_tag is not None else None
    price_tag = product_wrap.find('div', class_='product_price')

    # Search inside this tile, not the whole page: a page-level search would
    # return the first product's image for every row.
    # NOTE(review): assumes the a.product_image anchor is nested in the tile.
    image_anchor = product_wrap.find('a', class_='product_image')
    image_tag = image_anchor.find('img') if image_anchor is not None else None
    image_src = image_tag.get('src') if image_tag is not None else None

    return {
        'Product_Link': link_tag.get('href') if link_tag is not None else None,
        'Product_Name': name_tag.text if name_tag is not None else None,
        'Product_Price': price_tag.text if price_tag is not None else None,
        'Product_Category': category,
        'Product_Image_URL': image_base_url + image_src if image_src else None,
    }


def scrape_goldhahnundsampson(url, driver_path=r'C:\chromedriver\chromedriver'):
    """Scrape all products reachable from a Goldhahn&Sampson section page.

    The section page at ``url`` is opened in Chrome, the links found in its
    ``h2.caption.category_name`` headings are collected, and each of those
    listings is walked page by page (``?page=N``) until a page has no product
    tiles or no "next page" link.

    Parameters
    ----------
    url : str
        Address of the section landing page.
    driver_path : str, optional
        Location of the chromedriver executable. Defaults to the path that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per product tile with the columns ``Product_Link``,
        ``Product_Name``, ``Product_Price``, ``Product_Category`` and
        ``Product_Image_URL``.
    """
    # Local import keeps this cell self-contained; Selenium 4 expects the
    # driver path to be wrapped in a Service object.
    from selenium.webdriver.chrome.service import Service

    image_base_url = 'https://www.goldhahnundsampson.de/shop/'
    columns = [
        'Product_Link',
        'Product_Name',
        'Product_Price',
        'Product_Category',
        'Product_Image_URL',
    ]
    rows = []

    driver = webdriver.Chrome(service=Service(driver_path))
    try:
        driver.get(url)
        landing_soup = BeautifulSoup(driver.page_source, 'lxml')

        # Collect the listing links advertised on the landing page.
        category_links = []
        for heading in landing_soup.find_all('h2', {'class': "caption category_name"}):
            anchor = heading.find('a')
            if anchor is not None and anchor.get('href'):
                category_links.append(anchor['href'])

        for link in category_links:
            page_num = 1
            while True:
                # Visit the current page
                driver.get(f"{link}?page={page_num}")
                soup = BeautifulSoup(driver.page_source, 'lxml')

                # No product tiles means we ran past the last page.
                product_wraps = soup.find_all('div', class_='product_wrap')
                if not product_wraps:
                    break

                # The category is read from the page (li.last), so look it up
                # once per page rather than once per product.
                category_tag = soup.find('li', {'class': 'last'})
                category = category_tag.text if category_tag is not None else None

                rows.extend(
                    _parse_product_wrap(product_wrap, category, image_base_url)
                    for product_wrap in product_wraps
                )

                # Stop when the page has no pagination block or no link to a
                # following page.
                pagination = soup.find('div', {'class': 'flr'})
                if pagination is None or not pagination.find('a', {'title': ' next page '}):
                    break

                page_num += 1
    finally:
        # Always close the browser, even if a page fails to load or parse.
        driver.quit()

    # Explicit columns keep the schema stable when nothing was scraped.
    return pd.DataFrame(rows, columns=columns)

In [4]:
# Scrape every listing linked from the Cookbooks section page.
df1 = scrape_goldhahnundsampson(
    url="https://www.goldhahnundsampson.de/shop/Cookbooks:::289.html",
)

  driver = webdriver.Chrome('C:\chromedriver\chromedriver')


In [5]:
# Scrape every listing linked from the Food section page.
df2 = scrape_goldhahnundsampson(
    url="https://www.goldhahnundsampson.de/shop/Food:::184.html",
)

  driver = webdriver.Chrome('C:\chromedriver\chromedriver')


In [6]:
# Scrape every listing linked from the Spirits section page.
df3 = scrape_goldhahnundsampson(
    url="https://www.goldhahnundsampson.de/shop/Spirits:::287.html",
)

  driver = webdriver.Chrome('C:\chromedriver\chromedriver')


In [11]:
# Stack the three section results into one frame with a fresh 0..n-1 index,
# then tag every row with the constant store and scraper metadata.
df = pd.concat([df1, df2, df3], ignore_index=True).assign(
    STORE_ID=485,
    STORE_NAME='goldhahn & sampson',
    TEAM_MEMBER='Yasir ECH-CHAMMAKHY',
)

In [13]:
# Switch the scraper's Product_* headers to the upper-case PROD_* names.
df = df.rename(
    columns=dict(
        Product_Link='PROD_LINK',
        Product_Name='PROD_NAME',
        Product_Price='PROD_PRICE',
        Product_Category='PROD_CATEGORY',
        Product_Image_URL='PROD_IMAGE_URL',
    )
)


In [15]:
# Preview the scraped product links.
df.loc[:, "PROD_LINK"]

0      https://www.goldhahnundsampson.de/shop/Cookboo...
1      https://www.goldhahnundsampson.de/shop/Cookboo...
2      https://www.goldhahnundsampson.de/shop/Cookboo...
3      https://www.goldhahnundsampson.de/shop/Cookboo...
4      https://www.goldhahnundsampson.de/shop/Cookboo...
                             ...                        
730    https://www.goldhahnundsampson.de/shop/Spirits...
731    https://www.goldhahnundsampson.de/shop/Spirits...
732    https://www.goldhahnundsampson.de/shop/Spirits...
733    https://www.goldhahnundsampson.de/shop/Spirits...
734    https://www.goldhahnundsampson.de/shop/Spirits...
Name: PROD_LINK, Length: 735, dtype: object

## Scraping product description, brand, and ID

In [None]:
# Turn off column-width truncation so long values such as the 'PROD_LINK'
# URLs are displayed in full.
pd.options.display.max_colwidth = None

In [None]:
# Remove stray index columns if they exist. They are presumably left over from
# a CSV round-trip that wrote the index; errors="ignore" keeps this cell
# working when the frame never had them.
df = df.drop(columns=["Unnamed: 0.1", "Unnamed: 0"], errors="ignore")

# Exclude products whose link lives under the Cookbooks section. na=False
# treats a missing link as "not a cookbook" so the boolean mask can be
# inverted safely.
is_cookbook = df["PROD_LINK"].str.startswith(
    "https://www.goldhahnundsampson.de/shop/Cookbooks/", na=False
)
df = df[~is_cookbook].reset_index(drop=True)

In [None]:
# Collect brand, description and EAN for every product, in the same order as
# df['PROD_LINK'], so the lists can be attached to the frame as new columns.
brands = []
descriptions = []
ids = []

# A single session reuses the HTTP connection across all product pages.
with requests.Session() as session:
    for link in df['PROD_LINK']:
        # The timeout makes a stalled connection raise instead of hanging the
        # notebook indefinitely.
        response = session.get(link, timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')

        # Brand: text of the first manufacturer link, or '' when absent.
        brand_tag = soup.find('a', class_='manufacturers_link')
        brand = brand_tag.text.strip() if brand_tag is not None else ''

        # Description: text of the product description block, or '' when absent.
        description_tag = soup.find('div', id='product_description')
        product_description = (
            description_tag.text.strip() if description_tag is not None else ''
        )

        # EAN: the 'value' div that follows the 'EAN:' label. Either element
        # may be missing, in which case '' is recorded.
        ean_label = soup.find('div', string='EAN:')
        ean_value = (
            ean_label.find_next('div', {'class': 'value'})
            if ean_label is not None
            else None
        )
        ean = ean_value.text.strip() if ean_value is not None else ''

        # Append exactly one entry per link so the lists stay aligned with df.
        brands.append(brand)
        descriptions.append(product_description)
        ids.append(ean)

In [None]:
# Attach the per-product detail lists as new columns (one value per row).
df = df.assign(
    PROD_BRAND=brands,
    PROD_DESCRIPTION=descriptions,
    PROD_ID=ids,
)

In [None]:
# Export the final table; the row index is written as the first, unnamed column.
df.to_csv("store_id.csv", index=True)