In [149]:
## import library
import re
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [161]:
## Scrape every results page of the Audible search listing with Selenium and
## collect title / author / runtime / link for each product into parallel lists.

## website to scrape
website = "https://www.audible.com/search"
## NOTE(review): machine-specific chromedriver location -- adjust for your setup
path = 'C:/Users/cmwak/chrome_driver/chromedriver'

## fixed pause (seconds) before reading each results page; this is a plain
## sleep, not a Selenium implicit wait
PAGE_PAUSE_SECONDS = 2
## upper bound (seconds) for the explicit waits on page elements
ELEMENT_TIMEOUT_SECONDS = 10


def _find_or_none(parent, xpath):
    """Return the first element under ``parent`` matching ``xpath``, or None if there is none."""
    try:
        return parent.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return None


book_title = []
book_author = []
book_length = []
book_link = []

## define a driver variable
driver = webdriver.Chrome(service = Service(path))

## everything after the browser starts runs inside try/finally so the browser
## is always closed, even when a lookup fails half-way through the scrape
try:
    ## go full screen -- the bigger the screen the more data you get
    driver.maximize_window()

    ## open the website
    driver.get(website)

    wait = WebDriverWait(driver, ELEMENT_TIMEOUT_SECONDS)

    ## pagination: wait until the paging list is in the DOM instead of assuming
    ## it is already there right after driver.get()
    pagination = wait.until(
        EC.presence_of_element_located((By.XPATH, '//ul[contains(@class, "pagingElements")]'))
    )

    ## get the list of all pages
    pages = pagination.find_elements(By.TAG_NAME, 'li')

    ## get the last page: index [-2] because the last <li> is the next button
    last_page = int(pages[-2].text)

    current_page = 1
    while current_page <= last_page:
        time.sleep(PAGE_PAUSE_SECONDS)

        ## get the elements
        container = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "adbl-impression-container"))
        )
        products = container.find_elements(By.XPATH, './/li[contains(@class, "productListItem")]')

        for product in products:
            ## the leading dot in each XPath makes the search relative to `product`
            title = _find_or_none(product, './/h3[contains(@class,"bc-heading")]')
            author = _find_or_none(product, './/li[contains(@class,"authorLabel")]')
            runtime = _find_or_none(product, './/li[contains(@class,"runtimeLabel")]')
            link = _find_or_none(product, './/h3/a')

            ## append to all four lists for every product (None when a field is
            ## missing) so the lists stay aligned row-for-row
            book_title.append(title.text if title is not None else None)
            book_author.append(author.text if author is not None else None)
            book_length.append(runtime.text if runtime is not None else None)
            book_link.append(link.get_attribute('href') if link is not None else None)

        current_page += 1
        if current_page > last_page:
            ## the last page has been scraped; nothing left to click
            break

        try:
            next_page = driver.find_element(By.XPATH, '//span[contains(@class, "nextButton")]')
            next_page.click()
        except WebDriverException as exc:
            ## stop instead of silently re-scraping the same page, which would
            ## append duplicate rows
            print(f"Stopping before page {current_page}: could not click the next button ({exc!r})")
            break
finally:
    driver.quit()


In [176]:
## assemble the scraped lists into one table, one row per scraped product

df_books = pd.DataFrame(
    data = {
        'title': book_title,
        'author': book_author,
        'runtime': book_length,
        'link': book_link,
    }
)

In [177]:
## work on an independent (deep) copy so the scraped frame stays untouched
amazon_books = df_books.copy(deep = True)

### Preprocess the data

In [178]:
### do some preprocessing

def parse_runtime(runtime):
    """Turn runtime label strings into numeric duration columns.

    Hours and minutes are located by their unit word rather than by position,
    so a label with only minutes (or only hours) is not misread and the
    missing part counts as 0. Labels containing neither unit give NaN in
    every column.

    NOTE(review): assumes labels look like "Length: 10 hrs and 16 mins" /
    "Length: 45 mins" -- confirm against the scraped text.

    Parameters
    ----------
    runtime : pd.Series
        Raw runtime label strings (missing values allowed).

    Returns
    -------
    pd.DataFrame
        Float columns ``hours``, ``mins``, ``total_mins``, ``total_hours``
        and ``total_days`` on the same index as ``runtime``; the last two
        are rounded to 2 decimals.
    """
    hours = pd.to_numeric(runtime.str.extract(r'(\d+)\s*(?:hr|hour)', expand = False))
    mins = pd.to_numeric(runtime.str.extract(r'(\d+)\s*min', expand = False))

    ## only treat a missing part as 0 when the other part was found;
    ## rows with no runtime at all stay NaN
    has_runtime = hours.notna() | mins.notna()
    hours = hours.fillna(0).where(has_runtime).astype(float)
    mins = mins.fillna(0).where(has_runtime).astype(float)

    total_mins = hours * 60 + mins
    total_hours = (total_mins / 60).round(2)
    ## days are derived from the already-rounded hours
    total_days = (total_hours / 24).round(2)

    return pd.DataFrame({
        'hours': hours,
        'mins': mins,
        'total_mins': total_mins,
        'total_hours': total_hours,
        'total_days': total_days,
    })


book_time = parse_runtime(df_books['runtime'])

## build the cleaned frame from df_books without mutating anything in place,
## so re-running this cell gives the same result instead of a KeyError on the
## already-dropped 'runtime' column
amazon_books = (
    df_books
    .assign(author = lambda x: x['author'].replace("By: ", "", regex = True))
    .drop(columns = 'runtime')
    .join(book_time)
)

amazon_books

Unnamed: 0,title,author,link,hours,mins,total_mins,total_hours,total_days
0,Shift Point,Franklin Horton,https://www.audible.com/pd/Shift-Point-Audiobo...,10.0,16.0,616.0,10.27,0.43
1,"The Dream, the Journey, Eternity, and God","Sara Landon, Mike Dooley",https://www.audible.com/pd/The-Dream-the-Journ...,7.0,54.0,474.0,7.90,0.33
2,"Love, Theoretically",Ali Hazelwood,https://www.audible.com/pd/Love-Theoretically-...,12.0,33.0,753.0,12.55,0.52
3,The Five-Star Weekend,Elin Hilderbrand,https://www.audible.com/pd/The-Five-Star-Weeke...,12.0,45.0,765.0,12.75,0.53
4,The Puppeteers,Jason Chaffetz,https://www.audible.com/pd/The-Puppeteers-Audi...,8.0,49.0,529.0,8.82,0.37
...,...,...,...,...,...,...,...,...
494,"My Cottage Was Transferred to Another World, V...",Sebastian Guzman,https://www.audible.com/pd/My-Cottage-Was-Tran...,14.0,25.0,865.0,14.42,0.60
495,The Utterly Uninteresting and Unadventurous Ta...,Drew Hayes,https://www.audible.com/pd/The-Utterly-Uninter...,7.0,2.0,422.0,7.03,0.29
496,Tragic Bonds,J Bree,https://www.audible.com/pd/Tragic-Bonds-Audiob...,10.0,4.0,604.0,10.07,0.42
497,The Strangest Forms,Gregory Ashe,https://www.audible.com/pd/The-Strangest-Forms...,11.0,30.0,690.0,11.50,0.48


In [168]:
## write the table to a CSV file, leaving out the row index
amazon_books.to_csv(path_or_buf = 'amazon_audio_books.csv', index = False)

In [180]:
## display the link column
amazon_books.loc[:, 'link']

0      https://www.audible.com/pd/Shift-Point-Audiobo...
1      https://www.audible.com/pd/The-Dream-the-Journ...
2      https://www.audible.com/pd/Love-Theoretically-...
3      https://www.audible.com/pd/The-Five-Star-Weeke...
4      https://www.audible.com/pd/The-Puppeteers-Audi...
                             ...                        
494    https://www.audible.com/pd/My-Cottage-Was-Tran...
495    https://www.audible.com/pd/The-Utterly-Uninter...
496    https://www.audible.com/pd/Tragic-Bonds-Audiob...
497    https://www.audible.com/pd/The-Strangest-Forms...
498    https://www.audible.com/pd/Fused-Audiobook/B0C...
Name: link, Length: 499, dtype: object