# Importing required libraries

In [1]:
# Trivial libraries.

import numpy as np
import pandas as pd


# Making an empty dataframe

In [2]:
# Create an empty daatframe with proper structure.

df = pd.DataFrame(columns = ['Name', 'Author', 'ISBN', 'Rating', 'Text', 'Summary'])
df


Unnamed: 0,Name,Author,ISBN,Rating,Text,Summary


# Crawler Implementation and Dataset Population

In [3]:
# Crawl the web to get data & populate the dataset.

# Import all the required libraries.
import requests
from bs4 import BeautifulSoup
import urllib

# Get the seed URL.
URL = "https://sive.rs/book"

# Get the HTML content.
r = requests.get(URL)
control = BeautifulSoup(r.content, 'html.parser')

# Find an extract all anchor tags or links which contain book 'text'.
read_my_notes = control.find_all('a')

texts_links = []
for notes in read_my_notes:        
    if(notes['href'][:6] == '/book/'):  
        
        found_link = 'https://sive.rs' + str(notes['href'])
        
        if(found_link not in texts_links):
            texts_links.append(found_link)
            
# Find and extract all paragraph tags with contain book 'summary'.
book_section = control.find('section', attrs = {'id': 'allbooks'})

book_sums = []
count = 0
for summs in book_section.find_all('p'):
    
    if(count %2 == 0):   
        book_sums.append(summs.text)
        
    count += 1
             

# Traverse every found link for book 'text' and 'metadata'.
i = 0
for link in texts_links:
    
    req = requests.get(link)
    innercontrol = BeautifulSoup(req.content, 'html.parser')    
    
    division = innercontrol.find('div', attrs = {'id': 'content'})
    
    title = ''
    author = ''
    isbn = ''
    rating = 0
    texts = ''
    
    # Loop to find the book names and authors.
    for h1s in division.find_all('h1'):
        temp = str(h1s.text)
        
        itlen = len(temp)
        while(temp[itlen-1] is not '-'):
            author += temp[itlen-1]
            itlen -= 1
            
        author = author[::-1]
        author = author[4:].strip()
        title = temp[:itlen-1].strip()
        
        break        
        
    # Loop to find the book ISBNs and author ratings.
    for headers in division.find_all('header'):
        
        bookinfo = ''
        
        for smalls in headers.find_all('small'):
            bookinfo += (smalls.text)
            
        bookinfo = bookinfo.replace('\n', ' ')        
        book_info_list = list(bookinfo.split(' '))
        
        isbn = book_info_list[2]
        rating = int(book_info_list[11].split('/')[0])
            
        break        
        
    # Loop to find the book texts.
    for ts in division.find_all('p', attrs = {'id':'booknotes'}):
        texts += (ts.text.strip())
        
    texts = texts.replace('\n', ' ')
    texts = ' '.join(texts.split())

    # Populating the dataset with crawled data.
    df.loc[i] = [title, author, isbn, rating, texts, book_sums[i]]
    i += 1


In [4]:
# Sample look at out dataframe.

df


Unnamed: 0,Name,Author,ISBN,Rating,Text,Summary
0,Sum: Forty Tales from the Afterlives,David Eagleman,0307377342,10,There are three deaths. The first is when the ...,Awesomely creative think-piece. 40 very short ...
1,On Writing Well,William Zinsser,0060891548,10,The essence of writing is rewriting. Just beca...,Great blunt advice about writing better non-fi...
2,"Mindwise: How We Understand What Others Think,...",Nicholas Epley,0307595919,10,"“The only true voyage of discovery, the only f...","Many new brilliant insights, especially about ..."
3,The War of Art,Steven Pressfield,0446691437,10,It’s not the writing part that’s hard. What’s ...,Have you experienced a vision of the person yo...
4,"Thinking, Fast and Slow",Daniel Kahneman,66157471,10,The title of the book refers to two modes of t...,If you liked “Predictably Irrational” or “Stum...
...,...,...,...,...,...,...
287,Seeing What Others Don't,Gary Klein,1610392515,0,The fancy new BMW in front of them: The driver...,"I really wanted to like this book, but couldn'..."
288,Flex: Do Something Different,Ben Fletcher and Karen Pine,1907396543,0,Personalities fit into certain categories: agr...,I give the basic idea a 9-out-of-10 rating: th...
289,Cambodia's Curse,Joel Brinkley,1586487876,0,Cambodia's Curse: The Modern History of a Trou...,Cambodia's political history from 1978 to 2009...
290,Conspiracy of the Rich,ert Kiyosaki,0446559806,0,Conspiracy of the Rich - Robert Kiyosaki The n...,Yet another Rich Dad book shat out for the usu...


# Saving the dataset locally

In [5]:
# Save the dataset to a file.

df.to_csv('.//Datasets//Books.csv', index = False)
