In [1]:
from requests_html import HTMLSession, HTML
import numpy as np
import pandas as pd
from datetime import datetime
from threading import Thread
import time
import matplotlib.pyplot as plt
import progressbar
import warnings
warnings.filterwarnings("ignore")

In [2]:
base_url = 'https://www.audible.com/search?pf_rd_p=7fe4387b-4762-42a8-8d9a-a63254c74bb2&pf_rd_r=C7ENYKDADHMCH4KY12D4&ref=a_search_l1_feature_five_browse-bin_6&feature_six_browse-bin=9178177011&pageSize=50'

In [3]:
def build_dict(items, category, data):
    for item in items:
        text_fields = item.text.split('\n')
        link = [link for link in item.absolute_links if '/pd/' in link][0]
        dict_entry={
            'category' : category,
            'title' : text_fields[0],
            'link' : link
            }
        try:
            dict_entry['rating_count'] = np.int([s for s in text_fields if 'stars' in s][0].split(
                'stars ')[1].replace(',',''))
        except: pass
        try:
            dict_entry['narrator'] = [s for s in text_fields if 'Narrated by' in s][0].split(': ')[1]
        except: pass
        try:
            dict_entry['asin'] = [s for s in link.split('/') if 'B0' in s][0].split('?')[0]
        except: pass
        try:
            dict_entry['length'] = [s for s in text_fields if 'Length' in s][0].split(': ')[1]
        except: pass
        try:
            dict_entry['rating'] = np.float([s for s in text_fields if 'stars' in s][-1].split(' out')[0])
        except: pass
        try:
            dict_entry['author'] = [s for s in text_fields if 'By' in s][0].split(': ')[1]
        except: pass
        try:
            dict_entry['price'] = np.float([s for s in text_fields if 'Regular' in s][0].split('$')[1])
        except: pass
        try:
            dict_entry['release_date'] = datetime.strptime([s for s in text_fields if 'Release date:' in s][0].split(
                ': ')[1], '%m-%d-%y')
        except : pass
        data.append(dict_entry)
    return data

def scrape_great_courses(mthreads, category, pages, url_list, data):
    sess = HTMLSession()
    
    for url in url_list:
        try:
            r = sess.get(url)
        except:
            try:
                time.sleep(0.5)
                r = sess.get(url)
            except:
                pass
            
        items = r.html.find('li.bc-list-item.productListItem', first=False)
        
        threads = []
        for j in range(mthreads):
            item_sublist = items[j::mthreads]
            t = Thread(target=build_dict, args=(item_sublist, category, data))
            threads.append(t)
            
        [t.start() for t in threads]
        [t.join() for t in threads]
    
    sess.close()
    return data

In [4]:
def scrape_threader(nthreads, mthreads, category, pages, base_url, data=None):
    if data == None:
        data = []
    
    # Create url list
    url_list = []
    for page in range(pages):
        pageurl = base_url + '&page=' + str(page+1)
        url_list.append(pageurl)
        
    # Create threads
    threads = []
    for i in range(nthreads):
        url_sublist = url_list[i::nthreads]
        t = Thread(target=scrape_great_courses, args=(mthreads, category, pages, url_sublist, data))
        threads.append(t)
    
    # Run threads
    [t.start() for t in threads]
    [t.join() for t in threads]
    
    return data

In [5]:
def loop_categories(nthreads, mthreads, cat_names, cat_page_nums, cat_links):
    data = []
    
#     widgets = [
#         progressbar.Percentage(), 
#         progressbar.Bar(), 
#         progressbar.ETA(),
#         progressbar.DynamicMessage('cat')]
#     bar = progressbar.ProgressBar(widgets=widgets, max_value=sum(cat_page_nums)).start()
    
    finished_pages = 0  
    for category, pages, link in zip(cat_names, cat_page_nums, cat_links):
        print('Scraping ', category, '...')
#         bar.update(finished_pages, cat=category)
        data.extend(scrape_threader(nthreads, mthreads, category, pages, link, data=data))
        finished_pages += pages
        
#     bar.finish()    
    return data

In [6]:
sess = HTMLSession()
r = sess.get(base_url)

cat_items = r.html.find('div.bc-col-responsive.bc-col-3')[1].find('ul.bc-list')[0].find('li.bc-list-item')
cat_names = [item.text.split(' (')[0] for item in cat_items]
cat_item_nums = [np.int(item.text.split(' (')[1][:-1].replace(',', '')) for item in cat_items]
cat_page_nums = [np.int(np.ceil(item/50)) for item in cat_item_nums]
cat_links = [item.absolute_links.pop() + '&pageSize=50' for item in cat_items]

sess.close()

In [7]:
for i in range(3, len(cat_names)):
    start=time.time()
    df = pd.DataFrame(data=loop_categories(
        8, 2, [cat_names[i]], [cat_page_nums[i]], [cat_links[i]]))
    df = df.drop_duplicates()
    df.to_csv('{}.csv'.format(i))
    end = time.time()
    rate = cat_page_nums[i]/(end-start)
    pages_left = np.sum(cat_page_nums[i+1:])
    eta = pages_left/rate/60
    print('Done. Scraped {} out of {} items at {:.1f} pages/s. ETA: {:.1f} min.'.format(
        len(df), cat_item_nums[i], rate, eta))

Scraping  Classics ...
Done. Scraped 11042 out of 11043 items at 1.1 pages/s. ETA: 118.2 min.
Scraping  Erotica & Sexuality ...
Done. Scraped 14405 out of 14405 items at 1.1 pages/s. ETA: 119.0 min.
Scraping  Fiction ...
Done. Scraped 55564 out of 55614 items at 1.1 pages/s. ETA: 103.4 min.
Scraping  History ...


Exception in thread Thread-3276:
Traceback (most recent call last):
  File "C:\Users\Toby-PC\Anaconda3\lib\site-packages\pyquery\pyquery.py", line 95, in fromstring
    result = getattr(etree, meth)(context)
  File "src/lxml/etree.pyx", line 3213, in lxml.etree.fromstring
  File "src/lxml/parser.pxi", line 1877, in lxml.etree._parseMemoryDocument
  File "src/lxml/parser.pxi", line 1765, in lxml.etree._parseDoc
  File "src/lxml/parser.pxi", line 1127, in lxml.etree._BaseParser._parseDoc
  File "src/lxml/parser.pxi", line 601, in lxml.etree._ParserContext._handleParseResultDoc
  File "src/lxml/parser.pxi", line 711, in lxml.etree._handleParseResult
  File "src/lxml/parser.pxi", line 640, in lxml.etree._raiseParseError
  File "<string>", line 1
lxml.etree.XMLSyntaxError: Document is empty, line 1, column 1

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Toby-PC\Anaconda3\lib\threading.py", line 917, in _bootstrap_in

Done. Scraped 15500 out of 15621 items at 1.1 pages/s. ETA: 96.7 min.
Scraping  Mysteries & Thrillers ...
Done. Scraped 47392 out of 47401 items at 1.1 pages/s. ETA: 83.2 min.
Scraping  Romance ...
Done. Scraped 44607 out of 44623 items at 0.8 pages/s. ETA: 89.2 min.
Scraping  Science & Technology ...
Done. Scraped 13984 out of 13984 items at 1.1 pages/s. ETA: 62.7 min.
Scraping  Sci-Fi & Fantasy ...
Done. Scraped 34151 out of 34151 items at 1.1 pages/s. ETA: 56.0 min.
Scraping  Self Development ...
Done. Scraped 44007 out of 44028 items at 1.1 pages/s. ETA: 39.9 min.
Scraping  Comedy ...
Done. Scraped 5085 out of 5085 items at 1.2 pages/s. ETA: 36.9 min.
Scraping  Newspapers & Magazines ...
Done. Scraped 10209 out of 10208 items at 1.2 pages/s. ETA: 32.1 min.
Scraping  Nostalgia Radio ...
Done. Scraped 2053 out of 2104 items at 1.2 pages/s. ETA: 33.1 min.
Scraping  Radio & TV ...
Done. Scraped 10697 out of 10697 items at 1.2 pages/s. ETA: 29.3 min.
Scraping  Sports ...
Done. Scraped 3

In [13]:
df = pd.read_csv('0.csv')
for i in range(1, len(cat_names)):
    df = pd.concat([df, pd.read_csv('{}.csv'.format(i))], ignore_index=True)

In [16]:
df = df.drop_duplicates()
df = df.drop('Unnamed: 0', axis=1)

In [19]:
df.to_csv('all_english_audible.csv', index=False)