## Python Packages 

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from amzsear import AmzSear

## Helper Functions

In [2]:
def get_urls(url):
    """Collect the links found inside italic (``<i>``) tags of a page.

    The page at ``url`` is downloaded and parsed. For every ``<i>`` tag,
    the first ``<a>`` inside it is looked up and its ``href`` is appended
    to ``'https://en.wikipedia.org'``.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    list of str
        One absolute URL per ``<i>`` tag that contains an ``<a>`` with an
        ``href``, in document order. Duplicates are kept. ``<i>`` tags
        without such a link are skipped.
    """
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')

    links = []
    for italic in soup.find_all('i'):
        anchor = italic.find('a')
        if anchor is None:
            # Italic text that is not a link.
            continue

        href = anchor.get('href')
        if href is None:
            # Anchor without a target.
            continue

        # hrefs are assumed to be site-relative (e.g. '/wiki/...'), hence the
        # plain prefixing -- TODO confirm no absolute hrefs occur in <i> tags.
        links.append('https://en.wikipedia.org' + href)

    return links

In [3]:
def scrape_wiki(arr):
    """Download each page in ``arr`` and turn its infobox into a record.

    Pages that contain no ``<table class="infobox vcard">`` are skipped
    entirely, so the result can be shorter than ``arr``.

    Parameters
    ----------
    arr : iterable of str
        URLs of the pages to download and parse.

    Returns
    -------
    list of dict
        One dict per page that has an infobox, in input order, with:

        * ``'Title'`` -- text of the ``<i>`` tag inside the
          ``<h1 class="firstHeading">``; absent when the heading has no
          ``<i>`` tag.
        * ``'Summary'`` -- text of every ``<p>`` on the page joined by a
          single space.
        * one entry per row of the first infobox that has both a ``<th>``
          and a ``<td>``, keyed by the header text. These are written last,
          so a header named ``'Title'`` or ``'Summary'`` overrides the
          values above.
    """
    records = []

    for url in arr:
        content = requests.get(url).content
        soup = BeautifulSoup(content, 'lxml')

        infoboxes = soup.find_all('table', {'class': 'infobox vcard'})
        if not infoboxes:
            # No infobox means nothing to record for this page.
            continue

        result = {}

        for heading in soup.find_all('h1', {'class': 'firstHeading'}):
            italic = heading.find('i')
            if italic is not None:
                result['Title'] = italic.text

        result['Summary'] = ' '.join(
            paragraph.text for paragraph in soup.find_all('p')
        )

        for row in infoboxes[0].find_all('tr'):
            header = row.find('th')
            value = row.find('td')
            if header is None or value is None:
                # Rows without a header (e.g. the leading logo/image row)
                # or without a value cell carry no key/value pair.
                continue
            result[header.text] = value.text

        records.append(result)

    return records

In [4]:
def get_ratings(isbn):
    """Return the rating text Amazon shows for the item found for ``isbn``.

    Parameters
    ----------
    isbn
        Search query, passed unchanged to ``AmzSear``.

    Returns
    -------
    The ``rating.ratings_text`` attribute of the item returned by
    ``rget(-1)``, or ``None`` when the item, its ``rating`` or the
    ``ratings_text`` attribute is missing.
    """
    # rget(-1) presumably selects the last search result -- confirm against
    # the amzsear documentation.
    last_item = AmzSear(isbn).rget(-1)

    # getattr defaults turn only a *missing attribute* into None; any other
    # failure now surfaces instead of being silently swallowed.
    rating = getattr(last_item, 'rating', None)
    return getattr(rating, 'ratings_text', None)

In [5]:
def get_ratings_count(isbn):
    """Return the ratings-count text Amazon shows for the item found for ``isbn``.

    Parameters
    ----------
    isbn
        Search query, passed unchanged to ``AmzSear``.

    Returns
    -------
    The ``rating.ratings_count_text`` attribute of the item returned by
    ``rget(-1)``, or ``None`` when the item, its ``rating`` or the
    ``ratings_count_text`` attribute is missing.
    """
    # rget(-1) presumably selects the last search result -- confirm against
    # the amzsear documentation.
    last_item = AmzSear(isbn).rget(-1)

    # getattr defaults turn only a *missing attribute* into None; any other
    # failure now surfaces instead of being silently swallowed.
    rating = getattr(last_item, 'rating', None)
    return getattr(rating, 'ratings_count_text', None)

In [6]:
def get_product_url(isbn):
    """Return the Amazon product URL of the item found for ``isbn``.

    Parameters
    ----------
    isbn
        Search query, passed unchanged to ``AmzSear``.

    Returns
    -------
    The ``product_url`` attribute of the item returned by ``rget(-1)``,
    or ``None`` when the item or the attribute is missing.
    """
    # rget(-1) presumably selects the last search result -- confirm against
    # the amzsear documentation.
    last_item = AmzSear(isbn).rget(-1)

    # A getattr default turns only a *missing attribute* into None; any
    # other failure now surfaces instead of being silently swallowed.
    return getattr(last_item, 'product_url', None)

## Scrape Wikipedia

In [7]:
# Entry points for the crawl: the three alphabetical fantasy-novel list
# pages followed by the science-fiction novel list page.
urls = [
    'https://en.wikipedia.org/wiki/List_of_fantasy_novels_(A%E2%80%93H)',
    'https://en.wikipedia.org/wiki/List_of_fantasy_novels_(I%E2%80%93R)',
    'https://en.wikipedia.org/wiki/List_of_fantasy_novels_(S%E2%80%93Z)',
    'https://en.wikipedia.org/wiki/List_of_science_fiction_novels',
]

In [8]:
# Flatten the links harvested from every list page into one list,
# preserving page order and the order of links within each page.
links = [
    article_url
    for list_url in urls
    for article_url in get_urls(list_url)
]

In [9]:
# Download and parse every collected link. This issues one HTTP request per
# link; scrape_wiki only returns records for pages that have an infobox.
data = scrape_wiki(links)

In [10]:
# One row per scraped record; keys missing from a record become NaN.
df = pd.DataFrame(data)

In [11]:
# Look up the Amazon rating text for each row's ISBN (one search per row).
# Relies on the scraped infoboxes having produced an "ISBN" column.
df['ratings_text'] = df.ISBN.apply(get_ratings)

In [12]:
# Look up the Amazon ratings-count text for each row's ISBN.
# NOTE(review): this repeats the search already made for 'ratings_text'.
df['ratings_count'] = df.ISBN.apply(get_ratings_count)

In [13]:
# Look up the Amazon product URL for each row's ISBN.
# NOTE(review): this is a third search per ISBN; a combined lookup would
# avoid the repeated requests.
df['product_url'] = df.ISBN.apply(get_product_url)

In [14]:
# Persist the enriched table. The path is relative to the working directory
# and the data/ folder must already exist -- to_csv does not create it.
df.to_csv('data/books.csv')

In [15]:
# Sanity check: report (rows, columns) of the final table.
print(f'Dataframe shape:{df.shape}')

Dataframe shape:(1041, 57)


In [16]:
# Completion marker for a full top-to-bottom run of the notebook.
print('All Done!')

All Done!
