Purpose: for a given set of book IDs, fetch each book's most common genres (derived from user shelf tags) from the Goodreads API.

In [1]:
import glob
import os
import time
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests

In [2]:
# API credentials
# The Goodreads developer key is read from a local text file so it never
# appears in the notebook itself.
with open("goodreads_key.txt", "r") as f:
    # Strip surrounding whitespace: a trailing newline in the key file would
    # otherwise be embedded in every request URL.
    api_key = f.read().strip()

In [3]:
# import list of books with IDs
ratings_df = pd.read_csv('books.csv')

# filter for only books in sub-sample for which we already collected images
ls_raw = glob.glob('imgs/*.jpg')
# The file name up to the first '.' is parsed as the integer bookID.
# os.path.basename is used instead of splitting on '\\' so the cell works with
# both Windows and POSIX path separators.
ls = [int(os.path.basename(i).split('.')[0]) for i in ls_raw]

ratings_df = ratings_df[ratings_df['bookID'].isin(ls)]

ratings_df.shape

(994, 12)

In [4]:
# Shelf names that describe ownership, reading status or format rather than a
# genre; these are filtered out of each book's shelf list.
# h/t https://www.goodreads.com/topic/show/19317219-getting-the-genre-of-a-book
genreExceptions = [
    "to-read",
    "currently-reading",
    "owned",
    "default",
    "favorites",
    "books-i-own",
    "ebook",
    "kindle",
    "library",
    "audiobook",
    "owned-books",
    "audiobooks",
    "my-books",
    "ebooks",
    "to-buy",
    "english",
    "calibre",
    "books",
    "british",
    "audio",
    "my-library",
    "favourites",
    "re-read",
    "general",
    "e-books",
]

In [5]:
def get_root(book_id, timeout=30):
    """Fetch a book's XML record from Goodreads and return its parsed root.

    Parameters
    ----------
    book_id
        Goodreads book identifier; converted with ``str()`` to build the URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Without a timeout a stalled connection would block
        the whole run indefinitely.

    Returns
    -------
    xml.etree.ElementTree.Element
        Root element of the parsed response body.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a 4xx/5xx HTTP status.
    xml.etree.ElementTree.ParseError
        If the response body is not well-formed XML.
    """
    r_url = 'https://www.goodreads.com/book/show/' + str(book_id) + '.xml'
    # Send the key as a query parameter so requests URL-encodes it, rather than
    # concatenating it into the URL by hand. `api_key` is the notebook-level
    # credential loaded in an earlier cell.
    response = requests.get(r_url, params={'key': api_key}, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    return ET.fromstring(response.content)

In [6]:
def get_genres(root, genreExceptions, max_genres=5):
    """Return the first few shelf names under ``root`` that are not excluded.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Parsed XML element; every descendant ``<shelf>`` element is inspected
        in document order.
    genreExceptions : iterable of str
        Shelf names to drop (tags that are not genres).
    max_genres : int or None, optional
        Maximum number of names to return. Defaults to 5; ``None`` returns
        every remaining name.

    Returns
    -------
    list of str
        The ``name`` attribute of the surviving shelves, in document order.

    Raises
    ------
    KeyError
        If a ``<shelf>`` element has no ``name`` attribute.
    """
    # A set gives constant-time membership tests for the exclusion check.
    excluded = set(genreExceptions)
    shelf_names = (shelf.attrib['name'] for shelf in root.iter('shelf'))
    kept = [name for name in shelf_names if name not in excluded]
    return kept[:max_genres]

In [7]:
# iterate through list of books: 
%%time

id_list = list(ratings_df['bookID']) 
genres_list = list()

for i in id_list:
    root = get_root(i)
    genres = get_genres(root, genreExceptions)
    genres_list.append(genres)
    
    # limit rate
    time.sleep(1)
    
genre_df = pd.DataFrame({'bookID': id_list, 'genres': genres_list})

Wall time: 27min 30s


In [8]:
# Attach the fetched genres to the book table (left join on bookID keeps every
# row of ratings_df) and write the result to disk without the index column.
pd.merge(
    ratings_df,
    genre_df,
    how='left',
    on='bookID',
).to_csv('books_genres_sample.csv', index=False)

In [9]:
# Preview the merged table; show only the first rows rather than the whole frame.
ratings_df.merge(genre_df, how='left', on='bookID').head(10)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,genres
0,35,The Lord of the Rings (The Lord of the Rings ...,J.R.R. Tolkien/Alan Lee,4.50,618260587,9.780620e+12,en-US,1216,1618,140,10/1/2002,Houghton Mifflin Harcourt,"[fantasy, classics, fiction, classic, tolkien]"
1,37,The Lord of the Rings: Complete Visual Companion,Jude Fisher,4.50,618510826,9.780620e+12,eng,224,359,6,11/15/2004,Houghton Mifflin Harcourt,"[tolkien, fantasy, middle-earth, reference, mo..."
2,59,The Changeling Sea,Patricia A. McKillip,4.06,141312629,9.780140e+12,eng,137,4454,302,4/14/2003,Firebird,"[fantasy, young-adult, romance, fiction, ya]"
3,72,Artesia: Adventures in the Known World,Mark Smylie,4.13,1932386106,9.781930e+12,eng,352,52,4,12/14/2005,Archaia,"[fantasy, rpg, expanded-setting, lle_wanted, m..."
4,81,Giving Good Weight,John McPhee,4.23,374516006,9.780370e+12,eng,288,542,36,4/1/1994,Farrar Straus and Giroux,"[non-fiction, essays, nonfiction, tim-ferriss,..."
5,83,Rising from the Plains,John McPhee,4.23,374520658,9.780370e+12,eng,208,1341,98,11/1/1987,Farrar Straus and Giroux,"[non-fiction, science, geology, nonfiction, hi..."
6,94,Getting Results with Curriculum Mapping,Heidi Hayes Jacobs,3.25,871209993,9.780870e+12,eng,192,55,5,11/15/2004,ASCD,"[education, teaching, reference, adult-non-fic..."
7,180,Wrinkles in Time,George Smoot/Keay Davidson,4.01,380720442,9.780380e+12,eng,360,1035,23,10/1/1994,Harper Perennial,"[science, physics, non-fiction, astronomy, non..."
8,204,The Long Shadow (The Morland Dynasty #6),Cynthia Harrod-Eagles,4.12,751506435,9.780750e+12,eng,367,376,17,6/1/1994,Little Brown Book Group,"[historical-fiction, historical, series, morla..."
9,205,A Long Shadow (Inspector Ian Rutledge #8),Charles Todd,4.11,006078671X,9.780060e+12,eng,352,3086,237,1/3/2006,William Morrow,"[mystery, historical-fiction, mysteries, ficti..."
