# Question 2

#### Collection of data from the NYT API

From the Books API we will try to answer the question: which category of books has the bestseller that was on the list for the greatest number of weeks?

* Get the list of categories
* Get the list of bestsellers for each category

In [None]:
# Import the modules we need.
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import datetime
from glob import glob
import json
import os
import re
import requests
import string
import sys


# Path to the data directory into which downloaded JSON is saved.
data_path = os.path.join("..", "..", "data")
print(data_path)

if os.path.isdir(data_path):
    print(data_path + " is a directory")
else:
    print(data_path + " is NOT a directory - something is wrong :(")


# NYT API Key needs to be set in the environment before running this notebook.
#     $ export nyt_archive_key="abcd1234"
nyt_archive_key = os.getenv('nyt_archive_key')

# Report only whether the key is present. Printing the key itself would store
# the secret in the notebook's saved output.
if not nyt_archive_key:
    # Covers both an unset variable (None) and an empty string.
    print("NYT API key is missing")
else:
    print("NYT API key is set")

In [None]:
# Utility that serializes an object as JSON and writes it to the given path,
# announcing the destination on stdout first.
def save_to_json(obj, save_file_path):
    print("saving to file: " + save_file_path)

    with open(save_file_path, mode="wt") as out_file:
        # Serialize after the file is opened, then write the text in one go.
        out_file.write(json.dumps(obj))
        
# General function for getting JSON, either by downloading or from a cache file.
def resolve_nyt_json(url, cache_file, request_params=None, timeout=30):
    """Return the JSON document for `url`, preferring a local cache file.

    Args:
        url: Address to request when no cache file exists.
        cache_file: Path of the cache file. It is read if it exists;
            otherwise it is written after a successful download.
        request_params: Optional dict of query parameters handed to
            requests.get(). None means no parameters.
        timeout: Timeout in seconds handed to requests.get(), so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The decoded JSON. An empty dict is returned when the cache file does
        not contain valid JSON, when the response status is not 200, or when
        the response body is not valid JSON.
    """
    if os.path.isfile(cache_file):
        # Cache file exists, so use that.
        result = {}
        with open(cache_file, 'rt') as f:
            try:
                result = json.load(f)
            except ValueError:
                # Keep the best-effort contract (return {}), but say why.
                print("resolve_nyt_json(): cache file is not valid JSON: " + cache_file)
                result = {}

        print("resolve_nyt_json(): returning value from cache file: " + cache_file)
        return result

    # It's not in the cache, so download and save it.
    print("resolve_nyt_json(): downloading from NYT API")

    response = requests.get(url, params=request_params, timeout=timeout)
    print(response.status_code)

    if response.status_code != 200:
        print("resolve_nyt_json(): error downloading from NYT API ({code})".format(code=response.status_code))
        return {}

    # Decode the body once and reuse it for both the cache and the return value.
    try:
        result = response.json()
    except ValueError:
        print("resolve_nyt_json(): response from NYT API is not valid JSON")
        return {}

    save_to_json(result, cache_file)
    return result

In [None]:
# Endpoint address used for calls to books/v3/lists/names.
def get_books_list_names_url():
    names_endpoint = "https://api.nytimes.com/svc/books/v3/lists/names.json"
    return names_endpoint

# Location, inside the data directory, of the cache file for calls to
# books/v3/lists/names.
def get_books_list_names_cache_file_path():
    cache_filename = "books_v3_lists_names.json"
    return os.path.join(data_path, cache_filename)

# Query parameters for calls to books/v3/lists/names: only the API key.
def get_books_list_names_params():
    query = {}
    query['api-key'] = nyt_archive_key
    return query

# Convenience wrapper: fetch (or load from cache) the names of the bestseller
# lists by combining the URL, cache path and parameters defined for that call.
def resolve_books_list_names():
    url = get_books_list_names_url()
    cache_file = get_books_list_names_cache_file_path()
    params = get_books_list_names_params()
    return resolve_nyt_json(url, cache_file, params)

In [None]:
list_names_json = resolve_books_list_names()
# print(list_names_json['results'])

# The lookup yields an empty dict when the download fails or the cache file is
# unreadable, so fall back to an empty list instead of raising KeyError.
list_name_entries = list_names_json.get('results') or []

# Only weekly lists have a 'weeks on list' value.
weekly_list_names = [w['list_name_encoded'] for w in list_name_entries if w['updated'] == 'WEEKLY']
print("Weekly bestseller lists ({count}):".format(count=len(weekly_list_names)))
print(weekly_list_names)

monthly_list_names = [w['list_name_encoded'] for w in list_name_entries if w['updated'] == 'MONTHLY']
print("Monthly bestseller lists ({count}):".format(count=len(monthly_list_names)))
print(monthly_list_names)

In [None]:
# Endpoint address used for calls to books/v3/lists.
def get_books_list_url():
    lists_endpoint = "https://api.nytimes.com/svc/books/v3/lists.json"
    return lists_endpoint

# Location, inside the data directory, of the cache file for a books/v3/lists
# call; the list name is embedded in the file name. The file name is also
# printed.
def get_books_list_cache_file_path(list_name):
    template = "books_v3_lists_{list_name}.json"
    cache_filename = template.format(list_name=list_name)
    print(cache_filename)
    return os.path.join(data_path, cache_filename)

# Query parameters for calls to books/v3/lists: the API key, the list to
# fetch, and an ascending sort order.
def get_books_list_params(list_name):
    query = {'api-key': nyt_archive_key}
    query['list'] = list_name
    query['sort-order'] = 'ASC'
    return query

# Convenience wrapper: fetch (or load from cache) one bestseller list by
# combining the URL, per-list cache path and per-list parameters.
def resolve_books_list(list_name):
    url = get_books_list_url()
    cache_file = get_books_list_cache_file_path(list_name)
    params = get_books_list_params(list_name)
    return resolve_nyt_json(url, cache_file, params)

In [None]:
# TODO: this analysis should be moved into a separate notebook!
# Get the data for each bestseller list.
longest_on_list = []
for weekly_list in weekly_list_names:
    print(weekly_list)
    response = resolve_books_list(weekly_list)

    # A failed download yields an empty dict rather than a 'results' list.
    # Skip that list instead of aborting the whole loop with a KeyError.
    books_list = response.get('results') or []
    if not books_list:
        print("no results for {list_name} - skipping".format(list_name=weekly_list))
        continue

    # Report the first entry in the order the API returned it.
    for b in books_list[0:1]:
        if b['weeks_on_list'] != 0:
            print("{title} ({rank}) spent {n} weeks on the list".format(
                    title=b['book_details'][0]['title'],
                    rank=b['rank'],
                    n=b['weeks_on_list']))

    # Re-order so the book with the most weeks on the list comes first, then
    # record that book as this list's candidate.
    books_list.sort(key=lambda x : x['weeks_on_list'],reverse=True)
    for b in books_list[0:1]:
        if b['weeks_on_list'] != 0:
            title = b['book_details'][0]['title']
            print("{title} ({rank}) spent {n} weeks on the list".format(
                    title=title,
                    rank=b['rank'],
                    n=b['weeks_on_list']))
            longest_on_list.append({'list':weekly_list, 
                                    'title':title,
                                    'rank':b['rank'], 
                                    'weeks_on_list':b['weeks_on_list']})

# Rank the per-list candidates against each other, longest-running first.
longest_on_list.sort(key=lambda x : x['weeks_on_list'], reverse=True)
for item in longest_on_list:
    print("{title}, #{rank} from {list_name}, spent {weeks} weeks on the list".format(
            title=item['title'], 
            rank=item['rank'], 
            weeks=item['weeks_on_list'], 
            list_name=item['list']))
    