## Libraries und Vairablen

In [14]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by  import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import re

In [15]:
buchhaus_new_last_30 = 'https://www.buchhaus.ch/de/heute/last30'

## Webscraping

In [16]:
PATH = "C:\Program Files (x86)\chromedriver.exe"
driver = webdriver.Chrome(PATH)
#load the page
driver.get(buchhaus_new_last_30)

# data frame
buch_df = pd.DataFrame(columns=['Titel',  
                            'Preis',
                            'Autor', 
                            'details', 
                            'Genre',
                            'Text'])

for page in range(1, 11):
    print('Scraping books from page {}'.format(page))

    # scrape all hyperlinks
    links = driver.find_elements(By.XPATH, '/html/body/div[2]/div/div/main/div[3]/div[2]/div[4]/div//a')

    # get the href attribute
    links = [link.get_attribute('href') for link in links]

    # remove all links that are not books
    links = [link for link in links if 'buecher' in link]

    for book in links:

        try:
            driver.get(book)

            # get the data simple data
            try: titel = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[1]/div[2]/div[1]/h1/span/span').text
            except: titel = 'nan'

            try: preis = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[1]/div[2]/div[2]/div/div/span[2]/span').text
            except: preis = 'nan'

            try: autor = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[1]/div[2]/div[1]/div[2]/div').text
            except: autor = 'nan'

            # get the 'Buchbeschreibungen'
            try: text = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[3]/div[1]/div[2]/div/span/span/span').text
            except: text = 'nan'

            # get the genre
            parts = book.split('/')
            #find the index of the genre
            start_index = parts.index('buecher')
            end_index = parts.index('detail')
            # get the genres
            genre = parts[start_index+1:end_index]
            genre = ' '.join(genre)

            # get the 'Buchdetails'
            try: details = driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[2]/section[3]/div[2]/div[2]/div').text
            except: details = 'nan'

        # Fehlermeldung
        except: 
            print('error mit Buch {} auf Seite {}'.format(book, page))
            continue
    

        df_temp = pd.DataFrame({'Titel': titel,  
                        'Preis': preis,
                        'Autor': autor,
                        'details': details,
                        'Genre' : genre,
                        'Text': text
                        }, index=[0])
        buch_df = pd.concat([buch_df, df_temp], ignore_index=True)
        
        #back to the main page
        driver.get(buchhaus_new_last_30)

    # go to the next page
    if page == 1: driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[3]/div[2]/div[1]/div[5]/div/a').click()
    elif page == 10: pass
    else: driver.find_element(By.XPATH, '/html/body/div[2]/div/div/main/div[3]/div[2]/div[1]/div[5]/div/a[2]').click()
    time.sleep(1)

driver.close()

  driver = webdriver.Chrome(PATH)


Scraping books from page 1
Scraping books from page 2
Scraping books from page 3
Scraping books from page 4
Scraping books from page 5
Scraping books from page 6
Scraping books from page 7
Scraping books from page 8
Scraping books from page 9
Scraping books from page 10


## Datawrangling

In [17]:
buch_df.shape

(254, 6)

In [18]:
'''
short algorithm description:
1. List all possible keys: These are the categories that precede the actual values in the data string.
2. Create a regular expression: This regular expression is designed to match each key, followed by any characters until the next key is found or until the end of the string.
3. Find all matches in the data: Use the 're.findall' function to apply the regular expression to the data string. 
This returns a list of tuples containing the key and the associated value for each match.
4. Convert the matches into a Dictionnary in the Dataframe.
'''

def extract_details(details):
    # Key words which can be find in the string
    keys = ["ISBN/GTIN", "Produktart", "Einbandart", "Verlag", "Erscheinungsdatum", "Auflage", "Reihe", "Reihen-Nr.", "Seiten", "Sprache", "Masse", "Artikel-Nr."]

    # generate a regex which can find all the keys in the string
    regex = "(" + "|".join(keys) + ")(.*?)(?=" + "|".join(keys) + "|$)"

    # finds all key value pairs in the string
    matches = re.findall(regex, details)
    
    # returns a dictionary with key value pairs
    return dict((k.strip(), v.strip()) for k, v in matches)

In [19]:
def data_wrangling_pipeline(df):
    # apply the function to the data frame (create a new column with extracted details)
    df['details_dict'] = df['details'].apply(extract_details)
    # create new dataframe with the details
    details_df = df['details_dict'].apply(pd.Series, dtype= 'object')
    # concat both dataframes
    new_df = pd.concat([buch_df, details_df], axis=1)
    # drop the old details columns
    new_df.drop(['details', 'details_dict'], axis=1, inplace=True)
    return new_df

In [20]:
buch_df = data_wrangling_pipeline(buch_df)

#drop all rows where the title is nan
buch_df = buch_df[buch_df['Titel'] != 'nan']

#drop duplicates
buch_df.drop_duplicates(subset=['Titel'], inplace=True)

# reset index
buch_df.reset_index(drop=True, inplace=True)

In [21]:
buch_df

Unnamed: 0,Titel,Preis,Autor,Genre,Text,ISBN/GTIN,Produktart,Einbandart,Verlag,Erscheinungsdatum,Auflage,Reihe,Seiten,Sprache,Masse,Artikel-Nr.
0,Atlas - Die Geschichte von Pa Salt,32.00,"Riley, LucindaWhittaker, HarryHauser, SonjaÜbe...",belletristik romane,"Paris, 1928. Ein Junge wird gerade noch rechtz...",978-3-442-31567-3,Buch,Gebunden,Goldmann,11.05.2023Erstverkaufstag11.05.2023,Deutsche Erstausgabe,n-Nr.08,,,,
1,Der Feind,24.90,"Brand, Christine",belletristik krimi,Ein bizarre Mordserie an Männern sowie Schüsse...,978-3-7645-0771-8,Buch,Paperback,Blanvalet,26.04.2023,1. A.,n-Nr.05,,Deutsch,,
2,Die Krume Brot,29.00,,belletristik schweizer,"Adelina, Tochter italienischer Einwanderer, ar...",978-3-498-00320-3,Buch,Gebunden,Rowohlt,18.04.2023,,,,Deutsch,"Breite 133 mm, Höhe 209 mm, Dicke 23 mmGewicht...",
3,Anuschka und Finn,20.00,"Schawinski, Roger",fachbuecher geschichte schweiz,"Roger Schawinski über einen Medienskandal, der...",978-3-033-09890-9,Buch,Paperback,Radio 1 AG,12.05.2023,,,,Deutsch,"Breite 144 mm, Höhe 221 mm, Dicke 15 mmGewicht...",58408389
4,Elternabend,23.90,"Fitzek, Sebastian",belletristik romane,Stell dir vor ...\n... du musst eine halbe Ewi...,978-3-426-28413-1,Buch,Paperback,Droemer/Knaur,26.04.2023Erstverkaufstag26.04.2023,,,,Deutsch,"Breite 135 mm, Höhe 210 mm, Dicke 25 mm",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,Fokus. Die Kraft der Konzentration,34.90,"DandapaniWeingart, KarinÜbersetzung",fachbuecher psychologie psychologie,Permanente Ablenkung ist in unserem modernen A...,978-3-7787-8309-2,Buch,Gebunden,Lotos,26.04.2023,Deutsche Erstausgabe,,,Deutsch,"Breite 160 mm, Höhe 232 mm, Dicke 34 mmGewicht...",
222,Queen Charlotte - Bevor es die Bridgertons gab...,17.90,"Quinn, JuliaRhimes, ShondaPanic, IraÜbersetzung",taschenbuch historische,Das Buch zur neuen Netflix-Serie\n\nAn einem s...,978-3-365-00491-3,Buch,Paperback,HarperCollins HamburgHarperCollins Taschenbuch,09.05.2023,,n-Nr.Spin-Off,,Deutsch,,
223,Dein Lotta-Leben. Ferienbuch,14.90,"Pantermüller, AliceKohl, Daniela",kids bis11,Juhu! Heute war der allerletzte Schultag und m...,978-3-401-60000-0,Buch,Gebunden,Arena,08.05.2023,8.A.,Mein Lotta-Leben,,Deutsch,"Breite 148 mm, Höhe 194 mm, Dicke 13 mm",
224,Louis XIV,45.00,"Willms, Johannes",fachbuecher geschichte biografien,"""VON GOTTES GNADEN"" - DAS MERKWÜRDIGE LEBEN DE...",978-3-406-80067-2,Buch,Gebunden,Beck,12.05.2023,,,,Deutsch,"Breite 139 mm, Höhe 217 mm, Dicke 42 mmGewicht...",


## Export

In [22]:
# save as xlsx
buch_df.to_excel('../data/excelfiles/df_neuheiten_buchhaus.xlsx', index=False)
# save as feather
buch_df.to_feather('../data/feather/df_neuheiten_buchhaus.feather')

## Ausblick

Ausblick:

* Bücher in den nächsten 30 Tagen könnten auch gescrapet werden