In [1]:
import time, json
import requests
import csv
from bs4 import BeautifulSoup

In [2]:
def read_api_key(filepath):
    """
    Read the API key stored in a text file.

    filepath (string): Path of the file holding the key

    Returns the file's contents with every newline character removed.
    """
    with open(filepath, 'r') as key_file:
        raw_text = key_file.read()
    # Splitting on '\n' and re-joining drops each newline, wherever it occurs.
    return ''.join(raw_text.split('\n'))
    
api_key = read_api_key('api_key.txt')
# Do not echo the key itself: cell output is saved inside the notebook and
# travels with it whenever the file is shared or committed.
print('Loaded API key (%d characters)' % len(api_key))

[API key redacted]


In [3]:
# Test function for querying goodreads API
def goodreads_query(api_key, book_id=68428):
    """
    Send a single test request to the Goodreads book/show endpoint.

    api_key (string): API key
    book_id (int): ID of the book to request; defaults to 68428, the ID this
        test previously hard-coded

    Returns the raw requests response so the caller can inspect it.
    """
    request_params = {'id': book_id, 'key': api_key, 'format': 'xml', 'text_only': 'true'}
    # The response is returned unparsed; parsing it here would only be thrown away.
    return requests.get('https://www.goodreads.com/book/show', params=request_params)
    
# Fire one test request and keep the raw response in temp_response for inspection.
temp_response = goodreads_query(api_key)

In [4]:
# Get metadata about a book given its book ID
def get_book_metadata(api_key, book_id, max_genres=10):
    """
    Fetch one book from the Goodreads book/show endpoint and flatten it into a dict.

    api_key (string): API key
    book_id (int): ID of the book to look up
    max_genres (int): Maximum number of shelf names kept in the 'genre' field

    Returns a dict with the keys book_id, title, author, publication_date,
    publisher, review_count, rating_count, average_rating, genre and description.
    Values found in the response are returned as the element's text (strings).
    A missing element falls back to '' for the text fields and to the integer -1
    for review_count, rating_count and average_rating.
    """
    request_params = {'id': book_id, 'key': api_key, 'format': 'xml', 'text_only': 'true'}
    response = requests.get('https://www.goodreads.com/book/show', params=request_params)
    soup = BeautifulSoup(response.text, 'lxml')

    # Key order matches the CSV column order used by download_book_metadata.
    return {
        'book_id': book_id,
        'title': _tag_text(soup, 'original_title', ''),
        'author': _author_name(soup),
        'publication_date': _publication_date(soup),
        'publisher': _tag_text(soup, 'publisher', ''),
        'review_count': _tag_text(soup, 'text_reviews_count', -1),
        'rating_count': _tag_text(soup, 'ratings_count', -1),
        'average_rating': _tag_text(soup, 'average_rating', -1),
        'genre': _top_shelf_names(soup, max_genres),
        # Truncated blurb
        'description': _tag_text(soup, 'description', '').strip(),
    }


# Return the text of the first <tag_name> element under node, or default if there is none.
def _tag_text(node, tag_name, default):
    found = node.find(tag_name)
    return default if found is None else found.text


# Return the text of the <name> element inside the first <author> element, or ''.
def _author_name(soup):
    author = soup.find('author')
    if author is None:
        return ''
    return _tag_text(author, 'name', '')


# Build 'month/day/year' from the three publication_* elements; '' if any one is missing.
def _publication_date(soup):
    parts = [soup.find(tag_name)
             for tag_name in ('publication_month', 'publication_day', 'publication_year')]
    if any(part is None for part in parts):
        return ''
    return '/'.join(str(part.text) for part in parts)


# Treat the genre as the names of the first max_genres shelves listed under
# <popular_shelves>, comma-joined; '' if the element or a shelf name is missing.
def _top_shelf_names(soup, max_genres):
    shelves = soup.find('popular_shelves')
    if shelves is None:
        return ''
    try:
        return ','.join(shelf['name'] for shelf in shelves.find_all('shelf')[:max_genres])
    except KeyError:
        # A shelf without a 'name' attribute blanks the whole field, as before.
        return ''

# Save all book metadata as a CSV
def download_book_metadata(api_key, file_index=14, book_index=130001, num_files=10,
                           books_per_file=10000, delay_seconds=1.0):
    """
    Download metadata for a run of consecutive book IDs into numbered CSV files.

    api_key (string): API key
    file_index (int): Number of the first output file (data/metadata<N>.csv)
    book_index (int): First book ID to fetch; IDs then increase by one per book
    num_files (int): How many CSV files to write
    books_per_file (int): How many books go into each file
    delay_seconds (float): Pause after each request; 0 disables the pause

    Each file is opened in 'w' mode, so an existing file of the same name is
    overwritten. The data/ directory must already exist. Prints one
    "Finished file N" line per completed file.
    """
    data_columns = ['book_id', 'title', 'author', 'publication_date', 'publisher', 'review_count', 'rating_count', 'average_rating', 'genre', 'description']

    next_book_id = book_index

    for current_file in range(file_index, file_index + num_files):
        with open('data/metadata' + str(current_file) + '.csv', 'w', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, data_columns, lineterminator='\n')
            dict_writer.writeheader()

            for _ in range(books_per_file):
                dict_writer.writerow(get_book_metadata(api_key, next_book_id))
                next_book_id += 1
                # Throttle between requests (this was the outstanding TODO here).
                if delay_seconds > 0:
                    time.sleep(delay_seconds)

        print("Finished file " + str(current_file))

# Single-book spot check, left disabled:
#get_book_metadata(api_key, 68428)
# Starts the bulk metadata download as soon as this cell is executed.
download_book_metadata(api_key)

Finished file 14
Finished file 15
Finished file 16
Finished file 17
Finished file 18
Finished file 19
Finished file 20
Finished file 21
Finished file 22
Finished file 23


In [6]:
# Get all the reviews for a given book ID as a list of dictionaries
# Review data we are collecting - rating, date, and review text
def get_reviews(api_key, book_id):
    """
    Collect every review shown in the Goodreads reviews widget for one book.

    api_key (string): API key
    book_id (int): ID of the book whose reviews are wanted

    The book/show response is searched for the reviews widget's iframe; the
    iframe URL is fetched and its 'next_page' links are followed until none remain.

    Returns a list of dicts with the keys book_id, rating, date and text. If the
    widget URL cannot be found or fetched, a single placeholder entry with all
    four values set to '' is returned instead.
    """
    request_params = {'id': book_id, 'key': api_key, 'format': 'xml', 'text_only': 'true'}
    response = requests.get('https://www.goodreads.com/book/show', params=request_params)

    try:
        soup = BeautifulSoup(response.text, 'lxml')
        reviews_url = soup.find('reviews_widget').find('iframe')['src']
        response_reviews = requests.get(reviews_url)
    except Exception:
        # Best effort: any ordinary failure yields the placeholder row.
        # `except Exception` (not a bare except) lets Ctrl-C stop a long crawl.
        return [{'book_id': '', 'rating': '', 'date': '', 'text': ''}]

    all_reviews = []

    while True:
        review_soup = BeautifulSoup(response_reviews.text, 'lxml')

        for review in review_soup.find_all('div', {'class': 'gr_review_container'}):
            all_reviews.append(_parse_review(review, book_id))

        # See if there is another page of reviews
        next_link = review_soup.find('a', {'class': 'next_page'})
        if next_link is None:
            # No more pages of reviews
            break
        response_reviews = requests.get('https://goodreads.com' + next_link['href'])

    return all_reviews


# Turn one 'gr_review_container' element into a {book_id, rating, date, text} dict.
def _parse_review(review, book_id):
    # Rating: number of filled-star characters (U+2605) in the first text node
    # of the rating span; -1 when the review has no rating span at all.
    rating_tag = review.find('span', {'class': 'gr_rating'})
    if rating_tag is None:
        rating_num = -1
    else:
        rating_strings = rating_tag.find_all(text=True)
        # An empty rating span counts as zero stars instead of raising IndexError.
        rating_num = rating_strings[0].count('\u2605') if rating_strings else 0

    # Date: first text node of the date span, stripped; '' when unavailable.
    try:
        date = review.find('span', {'class': 'gr_review_date'}).find_all(text=True)[0].strip()
    except (AttributeError, IndexError):
        date = ''

    # Review text: non-empty text nodes of the review body joined with spaces.
    try:
        fragments = [x.strip() for x in review.find('div', {'class': 'gr_review_text'}).find_all(text=True)]
        fragments = [x for x in fragments if x]

        # Ignore the last element, the '...more'
        # NOTE(review): this assumes every review ends with a '...more' node; a
        # review without one would lose its final text node - confirm against
        # the widget markup.
        review_text = ' '.join(fragments[:-1])

        # Remove the last word, since it will be partial
        review_text = ' '.join(review_text.split(' ')[:-1])
    except AttributeError:
        review_text = ''

    return {
        'book_id': book_id,
        'rating': rating_num,
        'date': date,
        'text': review_text,
    }

# Save all reviews as a CSV
def download_reviews(api_key, file_index=4, book_index=17081, books_per_file=(2919, 10000),
                     delay_seconds=0.5):
    """
    Download reviews for a run of consecutive book IDs into numbered CSV files.

    api_key (string): API key
    file_index (int): Number of the first output file (data/reviews<N>.csv)
    book_index (int): First book ID to fetch; IDs then increase by one per book
    books_per_file (sequence of int): Number of books written to each successive
        file. The default reproduces the original run: 2919 books into the first
        file (book IDs 17081 through 19999), then 10000 into the next.
    delay_seconds (float): Pause after each book; 0 disables the pause

    Each file is opened in 'w' mode, so an existing file of the same name is
    overwritten. The data/ directory must already exist. Prints one
    "Finished file N" line per completed file.
    """
    data_columns = ['book_id', 'rating', 'date', 'text']

    next_book_id = book_index

    for current_file, book_count in enumerate(books_per_file, start=file_index):
        with open('data/reviews' + str(current_file) + '.csv', 'w', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, data_columns, lineterminator='\n')
            dict_writer.writeheader()

            for _ in range(book_count):
                dict_writer.writerows(get_reviews(api_key, next_book_id))
                next_book_id += 1
                # Throttle between books to stay gentle on the API.
                if delay_seconds > 0:
                    time.sleep(delay_seconds)

        print("Finished file " + str(current_file))

# Single-book spot check, left disabled:
#book_reviews = get_reviews(api_key, 68428)
#print(len(book_reviews))
# Starts the bulk review download as soon as this cell is executed.
download_reviews(api_key)

Finished file 4
Finished file 5
