In [8]:
import time, json
import requests
import csv
from bs4 import BeautifulSoup

In [2]:
def read_api_key(filepath):
    with open(filepath, 'r') as f:
        return f.read().replace('\n','')
    
api_key = read_api_key('api_key.txt')
print(api_key)

yfmkhh2aWg8dnQyvnoRXg


In [32]:
# Test function for querying goodreads API
def goodreads_query(api_key):
    """
    api_key (string): API key
    url (string): Base query URL
    query (dictionary): Query parameters
    """
    
    request_params = {'id':68428, 'key':api_key, 'format':'xml', 'text_only':'true'}
    response = requests.get('https://www.goodreads.com/book/show', params=request_params)
    
    soup = BeautifulSoup(response.text, 'lxml')
    shelves = soup.find('popular_shelves')
    return response
    
temp_response = goodreads_query(api_key)

In [38]:
# Get metadata about a book given its book ID
def get_book_metadata(api_key, book_id):
    request_params = {'id':book_id, 'key':api_key, 'format':'xml', 'text_only':'true'}
    response = requests.get('https://www.goodreads.com/book/show', params=request_params)
    
    book_info = {}
    
    book_info['book_id'] = book_id
    
    soup = BeautifulSoup(response.text, 'lxml')
    
    # Author
    book_info['author'] = soup.find('author').find('name').text
    
    # Publication date
    year = soup.find('publication_year').text
    month = soup.find('publication_month').text
    day = soup.find('publication_day').text
    book_info['publication_date'] = str(month) + '/' + str(day) + '/' + str(year)
    
    # Publisher
    book_info['publisher'] = soup.find('publisher').text
    
    # Review information
    book_info['review_count'] = soup.find('text_reviews_count').text
    
    # Rating information
    book_info['rating_count'] = soup.find('ratings_count').text
    book_info['average_rating'] = soup.find('average_rating').text
    
    # Treat the genre as the most popular shelf this book has been placed on
    book_info['genre'] = soup.find('popular_shelves').find('shelf')['name']
    
    return book_info

# Save all book metadata as a CSV
def download_book_metadata(api_key):
    data_columns = ['book_id', 'author', 'publication_date', 'publisher', 'review_count', 'rating_count', 'average_rating', 'genre']
    
    with open('metadata.csv', 'w', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, data_columns, lineterminator='\n')
        dict_writer.writeheader()
        
        # TODO: iterate through book IDs - add a time.sleep(1) between each iteration
        current_metadata = get_book_metadata(api_key, 68428)
        dict_writer.writerow(current_metadata)

download_book_metadata(api_key)

In [18]:
# Get all the reviews for a given book ID as a list of dictionaries
# Review data we are collecting - rating, date, and review text
def get_reviews(api_key, book_id):
    request_params = {'id':book_id, 'key':api_key, 'format':'xml', 'text_only':'true'}
    response = requests.get('https://www.goodreads.com/book/show', params=request_params)
    
    soup = BeautifulSoup(response.text, 'lxml')
    iframe = soup.find('reviews_widget').find('iframe')
    
    all_reviews = []
    
    reviews_url = iframe['src']
    response_reviews = requests.get(reviews_url)
    
    while(1):
        review_soup = BeautifulSoup(response_reviews.text, 'lxml')
        reviews = review_soup.find_all('div', {'class':'gr_review_container'})

        for review in reviews:
            current_review = {}

            # Extract the rating
            rating = review.find('span', {'class':'gr_rating'})
            if (rating):
                rating = rating.find_all(text=True)
                rating_num = 0
                for i in range(len(rating[0])):
                    if (ord(rating[0][i]) == 9733):
                        rating_num += 1
            else:
                rating_num = -1

            # Extract the date
            date = review.find('span', {'class':'gr_review_date'}).find_all(text=True)
            date = [x.strip() for x in date][0]

            # Extract the review text
            s = [x.strip() for x in review.find('div', {'class':'gr_review_text'}).find_all(text=True)]
            s = [x for x in s if x]

            # Ignore the last element, the '...more'
            review_text = ' '.join(s[:-1])

            # Remove the last word, since it will be partial
            review_text = ' '.join(review_text.split(' ')[:-1])

            current_review['book_id'] = book_id
            current_review['rating'] = rating_num
            current_review['date'] = date
            current_review['text'] = review_text

            all_reviews.append(current_review)

            #print(current_review)
        
        # See if there is another page of reviews
        if (review_soup.find_all('a', {'class':'next_page'}) != []):
            # There is another page
            reviews_url = 'https://goodreads.com' + review_soup.find('a', {'class':'next_page'})['href']
            response_reviews = requests.get(reviews_url)
        else:
            # No more pages of reviews
            break
        
    return all_reviews

# Save all reviews as a CSV
def download_reviews(api_key):
    data_columns = ['book_id', 'rating', 'date', 'text']
    
    with open('reviews.csv', 'w', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, data_columns, lineterminator='\n')
        dict_writer.writeheader()
        
        # TODO: iterate through book IDs - add a time.sleep(1) between each iteration
        current_reviews = get_reviews(api_key, 68428)
        dict_writer.writerows(current_reviews)

#book_reviews = get_reviews(api_key, 68428)
#print(len(book_reviews))
download_reviews(api_key)