In [22]:
import requests
from bs4 import BeautifulSoup, NavigableString
import math
import Levenshtein
from langdetect import detect
import json

In [2]:
def findAuthorID(author):
    """Look up an author on Goodreads and return the ID slug of the first hit.

    Runs a Goodreads book search restricted to the author field, takes the
    first element with class ``authorName`` that carries an ``href`` and
    returns the part of that link between ``show/`` and ``?from``
    (for example ``36556.Ren_Descartes``).

    Args:
        author: Author name to search for, as typed by the user.

    Returns:
        The author ID string, or ``None`` when the request fails or the result
        page contains no usable author link. A failure is reported on stdout
        instead of surfacing later as an ``UnboundLocalError``.
    """
    try:
        # Let requests build the query string so that spaces, accents, "&" and
        # similar characters in the name are escaped correctly.
        page = requests.get(
            "https://www.goodreads.com/search",
            params={
                "utf8": "✓",
                "q": author,
                "search_type": "books",
                "search[field]": "author",
            },
            timeout=30,  # requests waits forever unless a timeout is given
        )
        soup = BeautifulSoup(page.text, 'html.parser')
        idLink = soup.find(class_="authorName", href=True)["href"]

        marker = "show/"
        start = idLink.find(marker)
        if start == -1:
            raise ValueError("no author ID in link: " + idLink)
        # partition() keeps the whole tail when "?from" is absent, whereas
        # slicing with find()'s -1 would silently drop the last character.
        authorID = idLink[start + len(marker):].partition("?from")[0]
        if not authorID:
            raise ValueError("empty author ID in link: " + idLink)

        #experimental: trying to find out if the authorID is correct
        nameStart = authorID.find(".")
        authorName = authorID[nameStart+1:].replace("_", " ")
        # Compare against the name as given, not a "+"-joined search string,
        # so multi-word names are not penalised for their separators.
        lDistance = Levenshtein.distance(author, authorName)
        differenceScore = lDistance - len(authorName) + len(author) #bigger is worse

    except Exception as error:
        # Scraping can fail in many ways (network, changed markup); report the
        # cause and give the caller a well-defined "not found" value.
        print("failed to find author")
        print("reason: " + repr(error))
        return None

    print("found " + author + " @ " + authorID)
    print("(experimental) difference score = " + str(differenceScore))
    return authorID

In [56]:
def _countQuotePages(soup):
    """Return the number of quote pages implied by the pager text in ``soup``.

    Reads the text of the first element with class ``smallText`` and slices the
    per-page count and the total out of it. The offsets assume the text looks
    like "Showing 1-30 of 1,234" (TODO confirm against the live page).

    Raises:
        AttributeError: no ``smallText`` element was found.
        ValueError: the sliced pieces are not integers.
        ZeroDivisionError: the per-page count parsed as zero.
    """
    pages = soup.find(class_="smallText").text
    of = pages.find("of ")
    showing = pages.find("Showing ")
    # Skip ten characters from the start of "Showing " (the word, the space and
    # two more characters, presumably "1-") to reach the per-page count.
    num_shown = int(pages[showing+10:of-1])
    total_num = int(pages[of+3:].replace(",", "").replace("\n", ""))
    return math.ceil(total_num/num_shown)


def _extractQuote(quote, officialName, maxChars, language):
    """Turn one ``quoteDetails`` element into ``[text, author, title, tags, likes]``.

    Returns ``None`` when the quote has no text, is not shorter than
    ``maxChars``, or is not detected as ``language``. Title, tags and likes are
    individually ``None`` when they cannot be read from the element.
    """
    outer = quote.find(class_="quoteText")
    if outer is None:
        return None
    inner_text = " ".join(outer.strings)
    # Everything before the "―" attribution dash is the quote itself; when the
    # dash is missing, partition() keeps the whole text.
    final_quote = " ".join(inner_text.partition("―")[0].split())

    if len(final_quote) == 0 or len(final_quote) >= maxChars:
        return None
    try:
        if detect(final_quote) != language:
            return None
    except Exception:
        # Language detection can fail on text it cannot classify; skip that
        # one quote instead of aborting the whole scrape.
        return None

    #get quote's title
    try:
        title = quote.find(class_="authorOrTitle").next_sibling.next_sibling.text
        title = title.replace("\n", "").strip()
    except AttributeError:
        title = None

    # Get quote's tags
    try:
        raw_tags = quote.find(class_="greyText smallText left").text
        _label, colon, after_label = raw_tags.partition(":")
        if colon:
            # Strip a leading "label:" rather than dropping the whole first
            # comma-separated item, which would also discard the first tag if
            # the label and the tag share that item (NOTE(review): confirm the
            # markup really is "tags: a, b, c").
            tags = [tag.strip() for tag in after_label.split(",") if tag.strip()]
        else:
            tags = [tag.strip() for tag in raw_tags.split(",")][1:]
    except AttributeError:
        tags = None

    # Get number of likes
    try:
        likes_text = quote.find(class_="right").text
        likes = int(likes_text.replace("likes", "").replace(",", ""))
    except (AttributeError, ValueError):
        likes = None

    return [final_quote, officialName, title, tags, likes]


def getQuotesByAuthor(author, maxChars, page_num = None, language = 'en'):
    """Scrape an author's quotes from Goodreads.

    Args:
        author: Author name; resolved to a Goodreads ID with ``findAuthorID``.
        maxChars: Only quotes strictly shorter than this many characters are kept.
        page_num: Number of quote pages to scrape. When ``None`` it is derived
            from the pager text on the first page, falling back to 1.
        language: Language code the quote text must be detected as.

    Returns:
        A list of ``[text, author, title, tags, likes]`` lists. The author
        entry is the name read from the page header (``None`` if unreadable);
        title, tags and likes are ``None`` when missing. The list is empty when
        the author cannot be resolved or Goodreads cannot be reached.
    """
    all_quotes = []
    timeout = 30  # seconds; requests waits forever unless a timeout is given

    authorID = findAuthorID(author)
    if not authorID:
        return all_quotes
    quotes_url = "https://www.goodreads.com/author/quotes/" + authorID

    # One request serves both the page count and the author's display name.
    try:
        page = requests.get(quotes_url, timeout=timeout)
        soup = BeautifulSoup(page.text, 'html.parser')
    except requests.RequestException:
        print("could not connect to goodreads")
        return all_quotes

    if page_num is None:
        try:
            page_num = _countQuotePages(soup)
            print("looking through", page_num, "pages")
        except (AttributeError, ValueError, ZeroDivisionError):
            page_num = 1

    #get author's name
    try:
        officialName = soup.find("h1").find_all("a")[1].text
        print("Author's Official Name: " + officialName)
    except (AttributeError, IndexError):
        officialName = None
        print("could not read author's official name")

    for i in range(1, page_num+1):
        try:
            page = requests.get(quotes_url + "?page=" + str(i), timeout=timeout)
            soup = BeautifulSoup(page.text, 'html.parser')
            print("scraping page", i, " of ", page_num)
        except requests.RequestException:
            print("could not connect to goodreads")
            break

        container = soup.find(class_="quotes")
        if container is None:
            # No quote container on this page: skip it rather than re-reading
            # the previous page's quotes (or crashing on the first page).
            continue

        for quote in container.find_all(class_="quoteDetails"):
            meta_data = _extractQuote(quote, officialName, maxChars, language)
            if meta_data is not None:
                all_quotes.append(meta_data)

    print("Found " + str(len(all_quotes)) + " quotes")
    return all_quotes

In [58]:
# Scrape the first 3 pages of Descartes quotes, keeping only those shorter
# than 175 characters. Each entry is [text, author, title, tags, likes].
quotes = getQuotesByAuthor("Descartes", maxChars=175, page_num=3)
quotes

found Descartes @ 36556.Ren_Descartes
(experimental) difference score = 0
Author's Official Name: René Descartes
scraping page 1  of  3
scraping page 2  of  3
scraping page 3  of  3
Found 53 quotes


[['“I think; therefore I am.”', 'René Descartes', None, None, 2689],
 ['“The reading of all good books is like conversation with the finest men of past centuries.”',
  'René Descartes',
  None,
  ['literature', 'reading', 'words'],
  850],
 ['“Cogito ergo sum. (I think, therefore I am.) ”',
  'René Descartes',
  None,
  ['life-and-living', 'mottos', 'thinking'],
  635],
 ['“If you would be a real seeker after truth, it is necessary that at least once in your life you doubt, as far as possible, all things.”',
  'René Descartes',
  None,
  ['truth'],
  607],
 ['“Conquer yourself rather than the world.”',
  'René Descartes',
  None,
  None,
  313],
 ['“Common sense is the most widely shared commodity in the world, for every man is convinced that he is well supplied with it.”',
  'René Descartes',
  None,
  [&#39;humor&#39;, &#39;irony&#39;