In [1]:
import numpy as np
import pandas as pd
import requests 
from bs4 import BeautifulSoup

In [2]:
import re

In [14]:
#Write a function to srap data at one page
def get_page_data(page_number):
    """
    Scrap the content on Goodreads website based on page number
    
    Args:
    page_number(int): the position of the page in the search results, an interger from 1 to 50
    
    Returns: 
    Data Frame with 5 colums: Title, Author, Avg Rating, Total Ratings and Year
    """
    #Specify the params: this will pass page number to the url
    params = {'page':page_number}
    #Specify the url
    url="https://www.goodreads.com/shelf/show/psychology"
    
    
    #Request the content from url and save
    r = requests.get(url, params=params)
    
    #Save the text using Beautiful Soup
    soup = BeautifulSoup(r.text)
    
    #Get titles:
    title = []
    tags_a = soup.find_all('a', class_='leftAlignedImage')
    for i in range(0 , 50):
        title.append(tags_a[i].get('title'))
    
    #Get authors:
    author = []
    tags_span = soup.find_all('span', itemprop='name')
    for j in range(0 , 50):
        author.append(tags_span[j].text)
        
    #Get avg_rating, total_ratings, published_year:
    avg_rating = [] 
    total_ratings = []
    published_year = []
    greytext = soup.find_all(string=re.compile('avg rating'))
    for k in range(0 , 50): 
        avg_rating.append(greytext[k][28:32])
        total_ratings.append(greytext[k][51:58])
        published_year.append(greytext[k][95:100])
        
    df_output = pd.DataFrame(zip(title, author, avg_rating, total_ratings, published_year), columns=['Title','Author', 'Avg Rating', 'Total Ratings', 'Year'])
    return df_output

In [15]:
df = pd.DataFrame()

In [16]:
df_output = get_page_data(4)
df_output.head()

Unnamed: 0,Title,Author,Avg Rating,Total Ratings,Year
0,"Thinking, Fast and Slow",Daniel Kahneman,4.16,333012,2011
1,Man's Search for Meaning,Viktor E. Frankl,4.35,434272,1946
2,Quiet: The Power of Introverts in a World That...,Susan Cain,4.06,340576,2012
3,Blink: The Power of Thinking Without Thinking,Malcolm Gladwell,3.94,504406,2005
4,The Power of Habit: Why We Do What We Do in Li...,Charles Duhigg,4.11,365421,2012


In [17]:
#Scrap data from the first 25 page and put into a data frame together
for page_number in [1, 2, 3]:
    df_output = get_page_data(page_number)
    df = pd.concat([df, df_output], axis=0)

In [18]:
df.describe()

Unnamed: 0,Title,Author,Avg Rating,Total Ratings,Year
count,150,150,150.0,150,150
unique,50,40,34.0,50,38
top,Stumbling on Happiness,Malcolm Gladwell,4.06,21917,2012
freq,3,15,12.0,3,9


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 0 to 49
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Title          150 non-null    object
 1   Author         150 non-null    object
 2   Avg Rating     150 non-null    object
 3   Total Ratings  150 non-null    object
 4   Year           150 non-null    object
dtypes: object(5)
memory usage: 7.0+ KB


In [20]:
df.head(58)

Unnamed: 0,Title,Author,Avg Rating,Total Ratings,Year
0,"Thinking, Fast and Slow",Daniel Kahneman,4.16,333012,2011
1,Man's Search for Meaning,Viktor E. Frankl,4.35,434272,1946
2,Quiet: The Power of Introverts in a World That...,Susan Cain,4.06,340576,2012
3,Blink: The Power of Thinking Without Thinking,Malcolm Gladwell,3.94,504406,2005
4,The Power of Habit: Why We Do What We Do in Li...,Charles Duhigg,4.11,365421,2012
5,Influence: The Psychology of Persuasion,Robert B. Cialdini,4.19,108158,1984
6,The Man Who Mistook His Wife for a Hat and Oth...,Oliver Sacks,4.06,172542,1985
7,Predictably Irrational: The Hidden Forces That...,Dan Ariely,4.12,104800,2008
8,Flow: The Psychology of Optimal Experience,Mihaly Csikszentmihalyi,4.1,58761,1990\n
9,Outliers: The Story of Success,Malcolm Gladwell,4.17,588313,2008


In [None]:
#Sadly Goodreads seems to block my scraping effort and the page params do not work. You can see that the last 8 records are repetition of the first 8, as instead of scraping page 2, I gave me page 1 again. 