In [1]:
# This notebook scrapes information about the banknotes listed on gap-banknoten.de and saves it to CSV files.
import ast
import csv
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service

# Path to the Edge WebDriver executable (relative to the working directory).
driver_path = "msedgedriver.exe"


# Category listing page whose products are scraped.
base_url = "https://www.gap-banknoten.de/deutsches-notgeld-nach-provinzen/wuerttemberg/"

# Start the browser session. The driver location is handed over through a
# Service object: passing the path to Edge() positionally is deprecated in
# Selenium 4 and is no longer accepted by newer releases.
wd = webdriver.Edge(service=Service(executable_path=driver_path))


In [2]:
# Collect the detail-page URL of every product in the category listing.
FIRST_PAGE = 1
LAST_PAGE = 31         # pages 1..31 are visited; presumably the listing's page count — confirm before re-running
PAGE_LOAD_DELAY_S = 1  # fixed pause after each navigation

product_urls = []
for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
    # The listing is paginated through the `p` query parameter.
    url_path = f"{base_url}?p={page_number}"
    wd.get(url_path)
    time.sleep(PAGE_LOAD_DELAY_S)

    # Every product tile is a `product--info` element whose `product--title`
    # child carries the link to the detail page.
    for product in wd.find_elements(By.CLASS_NAME, "product--info"):
        product_url = product.find_element(By.CLASS_NAME, "product--title").get_attribute("href")
        # get_attribute() returns None when the attribute is absent; skip such
        # entries so that no empty URL is opened later on.
        if product_url:
            product_urls.append(product_url)

In [3]:
def extract_data_from_product_page(url, webdriver):
    """Open a product detail page and return the fields scraped from it.

    Args:
        url: Address of the product page to open.
        webdriver: Selenium WebDriver instance used to load and query the page.
            Inside this function the name shadows the ``selenium.webdriver``
            module; it is kept unchanged so existing callers keep working.

    Returns:
        dict with the keys ``'url'``, ``'title'``, ``'price'`` (float),
        ``'base_info'`` and ``'table_contents'``. The last two are lists of
        ``(label, text)`` tuples.

    Raises:
        Whatever the driver raises when a required element cannot be located,
        and the ``float()`` conversion error if the price attribute is missing
        or not numeric.
    """
    # Navigate with the driver that was passed in, not a module-level global.
    webdriver.get(url)

    # Elements identified by several classes are addressed with CSS selectors;
    # By.CLASS_NAME is reserved for single class names.
    product_details = webdriver.find_element(By.CSS_SELECTOR, ".content.product--details")

    # 1. Product title
    product_title = product_details.find_element(By.CLASS_NAME, "product--title").text

    # 2. Price, read from the `content` attribute of the <meta> tag inside the price element
    price_content = product_details.find_element(By.CSS_SELECTOR, ".price--content.content--default")
    price = float(price_content.find_element(By.TAG_NAME, "meta").get_attribute("content"))

    # 3. (label, content) pairs of the base-info entries
    entry_contents = [
        (
            entry.find_element(By.CLASS_NAME, "entry--label").text,
            entry.find_element(By.CLASS_NAME, "entry--content").text,
        )
        for entry in product_details.find_elements(By.CSS_SELECTOR, ".base-info--entry.entry-attribute")
    ]

    # 4. (label, value) pairs of the properties table, one per table row
    table_contents = [
        (
            row.find_element(By.CLASS_NAME, "product--properties-label").text,
            row.find_element(By.CLASS_NAME, "product--properties-value").text,
        )
        for row in product_details.find_elements(By.CSS_SELECTOR, ".product--properties-table tr")
    ]

    return {
        'url': url,
        'title': product_title,
        'price': price,
        'base_info': entry_contents,
        'table_contents': table_contents,
    }

In [4]:
# Scrape every collected product page and store the raw records. Takes 30 mins.
REQUEST_DELAY_S = 1  # pause between consecutive product requests

product_data_list = []
for product_url in product_urls:
    # extract_data_from_product_page() opens the URL itself, so the page is
    # not loaded here as well; loading it twice only doubled the requests.
    time.sleep(REQUEST_DELAY_S)
    product_data = extract_data_from_product_page(product_url, wd)
    product_data_list.append(product_data)

# Save the product data list to a CSV file. The list-valued fields
# ('base_info', 'table_contents') are not strings, so the csv module writes
# them in their str() form.
csv_columns = ['url', 'title', 'price', 'base_info', 'table_contents']
csv_file = "product_data.csv"

with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    for data in product_data_list:
        writer.writerow(data)

In [5]:
# Read the raw scrape results.
# NOTE: this cell writes its result back to product_data.csv without the
# 'base_info' / 'table_contents' columns, so running it a second time without
# re-creating the raw file fails on the missing 'base_info' column.
df = pd.read_csv("product_data.csv")


def _first_value(pairs, label):
    """Return the value of the first (label, value) pair whose label matches, or ''."""
    return next((value for key, value in pairs if key == label), '')


# Both columns hold lists of (label, text) tuples stored as text. They are
# parsed with ast.literal_eval, which only accepts Python literals, instead of
# eval(), which would execute arbitrary expressions found in the file.
base_info = df['base_info'].map(ast.literal_eval)
table_contents = df['table_contents'].map(ast.literal_eval)

# New column -> label under which the value appears on the product page.
df['Katalog'] = [_first_value(pairs, 'Katalog-Nr.:') for pairs in base_info]
table_labels = {
    'Erhaltung': 'Erhaltung:',
    'Provinz': 'Provinz:',
    'Wert': 'Wert:',
    'Periode': 'Periode:',
    'Ort': 'Ort:',
}
for column, label in table_labels.items():
    # Rows without the label keep an empty string.
    df[column] = [_first_value(pairs, label) for pairs in table_contents]

# Drop the 'base_info' and 'table_contents' columns
df = df.drop(columns=['base_info', 'table_contents'])

# Reorder the columns
df = df[['Ort', 'price','Provinz', 'Wert', 'Periode', 'Erhaltung', 'Katalog', 'url', 'title']]

# Save the updated DataFrame to the CSV file
df.to_csv("product_data.csv", index=False)

In [6]:
# Read the CSV file into a DataFrame
df = pd.read_csv("product_data.csv")

# Matches a date written with two-digit day, month and year: 'dd.mm.yy'.
# NOTE(review): because of the trailing \b a four-digit year such as
# '01.10.1918' is not matched — confirm that this is intended.
date_pattern = re.compile(r'\b\d{2}\.\d{2}\.\d{2}\b')


def _date_from_title(title):
    """Return the first 'dd.mm.yy' date found in title, or 'NoDate' if there is none."""
    # read_csv yields NaN (a float) for an empty title cell; the regex search
    # would raise on it, so treat any non-string as "no date".
    if not isinstance(title, str):
        return 'NoDate'
    date_match = date_pattern.search(title)
    return date_match.group() if date_match else 'NoDate'


# Fill the new 'date' column for all rows at once.
df['date'] = df['title'].map(_date_from_title)

# Save the updated DataFrame to the CSV file
df.to_csv("product_data.csv", index=False)

In [7]:
# Write the DataFrame, without its index, to the final output file.
final_csv = "NotgeldData.csv"
df.to_csv(final_csv, index=False)