## Libraries und Vairablen

In [18]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by  import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import requests
import re
from bs4 import BeautifulSoup

In [11]:
buchhaus_new_last_30 = 'https://www.buchhaus.ch/de/heute/last30'

## Webscraping

In [12]:
'''
In this code Snippet we will get all the links to the books on the website
'''

PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
#load the page
driver.get(buchhaus_new_last_30)

all_links = []

# data frame
buch_df = pd.DataFrame(columns=['Titel',  
                            'Preis',
                            'Autor', 
                            'details', 
                            'Genre',
                            'Text'])

for page in range(1,11):
    print('Scraping page: ', page)

    # scrape all hyperlinks
    links = driver.find_elements(By.XPATH, '/html/body/div[2]/div/div/main/div[3]/div[2]/div[4]/div//a')

    # get the href attribute
    links = [link.get_attribute('href') for link in links]

    # remove duplicates
    links = list(dict.fromkeys(links))

    # remove all links that are not books
    links = [link for link in links if 'buecher' in link]

    # add links to all_links
    all_links.extend(links)

    # go to the next page
    if page == 1: driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[3]/div[2]/div[1]/div[5]/div/a').click()
    elif page == 10: pass
    else: driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[3]/div[2]/div[1]/div[5]/div/a[2]').click()
    time.sleep(1)
    
driver.close()

  driver = webdriver.Chrome(PATH)


Scraping page:  1
Scraping page:  2
Scraping page:  3
Scraping page:  4
Scraping page:  5
Scraping page:  6
Scraping page:  7
Scraping page:  8
Scraping page:  9
Scraping page:  10


In [13]:
'''
In this code snippet we will get the data from the links
'''

PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
#load the page
driver.get(buchhaus_new_last_30)

for book in all_links:
    try:
        driver.get(book)

        # get the data simple data
        try: titel = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[1]/div[2]/div[1]/h1/span/span').text
        except: titel = 'nan'

        try: preis = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[1]/div[2]/div[2]/div/div/span[2]/span').text
        except: preis = 'nan'

        try: autor = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[1]/div[2]/div[1]/div[2]/div').text
        except: autor = 'nan'

        # get the 'Buchbeschreibungen'
        try: text = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[3]/div[1]/div[2]/div/span/span/span').text
        except: text = 'nan'

        # get the genre
        parts = book.split('/')
        #find the index of the genre
        start_index = parts.index('buecher')
        end_index = parts.index('detail')
        # get the genres
        genre = parts[start_index+1:end_index]
        genre = ' '.join(genre)

        # get the 'Buchdetails'
        try: details = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[3]/div[2]/div[2]/div').text
        except: details = 'nan'

    # Fehlermeldung
    except: 
        print('error mit Buch: ', book)
        continue
    

    df_temp = pd.DataFrame({'Titel': titel,  
                    'Preis': preis,
                    'Autor': autor,
                    'details': details,
                    'Genre' : genre,
                    'Text': text
                    }, index=[0])
    buch_df = pd.concat([buch_df, df_temp], ignore_index=True)

driver.close()

  driver = webdriver.Chrome(PATH)


## Datawrangling

In [14]:
buch_df.shape

(228, 6)

In [15]:
'''
short algorithm description:
1. List all possible keys: These are the categories that precede the actual values in the data string.
2. Create a regular expression: This regular expression is designed to match each key, followed by any characters until the next key is found or until the end of the string.
3. Find all matches in the data: Use the 're.findall' function to apply the regular expression to the data string. 
This returns a list of tuples containing the key and the associated value for each match.
4. Convert the matches into a Dictionnary in the Dataframe.
'''

def extract_details(details):
    # Key words which can be find in the string
    keys = ["ISBN/GTIN", "Produktart", "Einbandart", "Verlag", "Erscheinungsdatum", "Auflage", "Reihe", "Reihen-Nr.", "Seiten", "Sprache", "Masse", "Artikel-Nr."]

    # generate a regex which can find all the keys in the string
    regex = "(" + "|".join(keys) + ")(.*?)(?=" + "|".join(keys) + "|$)"

    # finds all key value pairs in the string
    matches = re.findall(regex, details)
    
    # returns a dictionary with key value pairs
    return dict((k.strip(), v.strip()) for k, v in matches)

In [16]:
def data_wrangling_pipeline(df):
    # apply the function to the data frame (create a new column with extracted details)
    df['details_dict'] = df['details'].apply(extract_details)
    # create new dataframe with the details
    details_df = df['details_dict'].apply(pd.Series, dtype= 'object')
    # concat both dataframes
    new_df = pd.concat([buch_df, details_df], axis=1)
    # drop the old details columns
    new_df.drop(['details', 'details_dict'], axis=1, inplace=True)
    return new_df

In [19]:
buch_df = data_wrangling_pipeline(buch_df)

#drop all rows where the title is nan
buch_df = buch_df[buch_df['Titel'] != 'nan']

#drop duplicates
buch_df.drop_duplicates(subset=['Titel'], inplace=True)

# reset index
buch_df.reset_index(drop=True, inplace=True)

In [22]:
buch_df.tail()

Unnamed: 0,Titel,Preis,Autor,Genre,Text,ISBN/GTIN,Produktart,Einbandart,Verlag,Erscheinungsdatum,Auflage,Reihe,Seiten,Sprache,Masse,Artikel-Nr.
222,Marsupilami 31: So ein Zirkus!,17.9,"Franquin, AndréDugomierBatemIllustrationenLe C...",comics comics,Abenteuer für Leseanfänger_innen\n\nDas Marsup...,978-3-551-79675-2,Buch,Paperback,Carlsen,02.05.2023,,n-Nr.31,,Deutsch,"Breite 220 mm, Höhe 295 mm, Dicke 6 mm",
223,Lacroix und der traurige Champion von Roland-G...,26.9,"Lépic, Alex",belletristik krimi,Seit seine Frau Dominique für das Amt der Bürg...,978-3-311-12568-6,Buch,Gebunden,,20.04.2023,,n-Nr.06,,Deutsch,"Breite 115 mm, Höhe 185 mm, Dicke 18 mm",
224,Mehr Energie in 4 Wochen,29.9,"Walk, Ute",fachbuecher medizin naturheilen,Wieder mehr Energie und Kraft\n\nEtwas Müdigke...,978-3-8338-8734-5,Buch,Paperback,Gräfe & Unzer,04.05.2023,,GU Körper & Seele Ratgeber Gesundheit,,Deutsch,"Breite 166 mm, Höhe 200 mm, Dicke 14 mmGewicht...",
225,"Noah - Von einem, der überlebte",17.9,"Würger, TakisKangisser Cohen, SharonNachwortKl...",taschenbuch biografien,"»Noah [...] sei vielen, vor allem jungen Leser...",978-3-328-10844-3,Taschenbuch,Paperback,München,11.05.2023,Erstmals im TB,,,Deutsch,"Breite 119 mm, Höhe 188 mm, Dicke 19 mmGewicht...",
226,Feel Good at Home,37.9,"Hellweg, Marion",fachbuecher kunst wohnen_fengshui,Der praktische Wohn-Guide: So verwandele ich m...,978-3-7913-8938-7,Buch,Gebunden,Prestel,26.04.2023,,,,Deutsch,"Breite 177 mm, Höhe 248 mm, Dicke 23 mmGewicht...",


## Export

In [21]:
# save as xlsx
buch_df.to_excel('../data/excelfiles/df_neuheiten_buchhaus.xlsx', index=False)
# save as feather
buch_df.to_feather('../data/feather/df_neuheiten_buchhaus.feather')

## Ausblick

Ausblick:

* Bücher in den nächsten 30 Tagen könnten auch gescrapet werden