################################################################################################################################
#                  Mid Term: Author-Vaishali Lambe, NUID-001286444                            #
################################################################################################################################

# Analysis 3

#### The influence of reviews on the bestseller lists.

* Which bestsellers have had reviews published in the NYT?
* Which ones haven't?
* Does a review influence how long a book stays on a bestseller list? 
################################################################################################################################

In [1]:
# Import modules we need.
import json
import os


# Path to the data directory into which downloaded raw JSON is saved.
data_path = os.path.join("..", "data")
print(data_path)

if os.path.isdir(data_path):
    print(data_path + " is a directory")
else:
    print(data_path + " is NOT a directory - something is wrong :(")
    

# Path to the data directory into which intermediate results are saved.
intermediate_data_path = os.path.join(".", "ana_3")
print(intermediate_data_path)

if os.path.isdir(intermediate_data_path):
    print(intermediate_data_path + " is a directory")
else:
    print(intermediate_data_path + " is NOT a directory - something is wrong :(")

..\data
..\data is a directory
.\ana_3
.\ana_3 is a directory


In [2]:
# General function for getting JSON, either by downloading or from a cache file.
def get_cached_nyt_json(cache_file):
    if os.path.isfile(cache_file):
        # Cache file exists, so use that.
        result = {}
        with open(cache_file, 'rt') as f:
            try:
                result = json.load(f)
            except ValueError:
                result = {}
                
        print("get_cached_nyt_json(): returning value from cache file: " + cache_file)
        return result

    # It's not in the cache, so display a message and return an empty result.
    print("get_cached_nyt_json(): required data has not been downloaded from NYT API")
    print("get_cached_nyt_json(): ensure the data is available locally, or run the collection.ipynb notebook to download it")

    return {}


# Name of the cache file for calls to books/v3/lists/names.
def get_books_list_names_cache_file_path():
    return os.path.join(data_path, "books_v3_lists_names.json")


# Convenience routine for getting the names of the bestseller lists.
def resolve_books_list_names():
    return get_cached_nyt_json(get_books_list_names_cache_file_path())


# Name of the cache file for calls to books/v3/lists.
def get_books_list_cache_file_path(list_name):
    filename = "books_v3_lists_{list_name}.json".format(list_name=list_name)
    print(filename)
    return os.path.join(data_path, filename)


# Convenience routine for getting a bestseller list.
def resolve_books_list(list_name):
    return get_cached_nyt_json(get_books_list_cache_file_path(list_name))


# Name of the cache file for calls to books/v3/reviews.
def get_books_reviews_cache_file_path(isbn):
    filename = "books_v3_reviews_{isbn}.json".format(isbn=isbn)
    print(filename)
    return os.path.join(data_path, filename)


# Convenience routine for getting a review from an ISBN.
def resolve_books_reviews(isbn):
    return get_cached_nyt_json(get_books_reviews_cache_file_path(isbn))

In [3]:
list_names_json = resolve_books_list_names()
# print(list_names_json['results'])

# Only weekly lists have a 'weeks on list' value.
weekly_list_names = list([w['list_name_encoded'] for w in list_names_json['results'] if w['updated'] == 'WEEKLY'])
print("Weekly bestseller lists ({count}):".format(count=len(weekly_list_names)))
print(weekly_list_names)

get_cached_nyt_json(): returning value from cache file: ..\data\books_v3_lists_names.json
Weekly bestseller lists (29):
['combined-print-and-e-book-fiction', 'combined-print-and-e-book-nonfiction', 'hardcover-fiction', 'hardcover-nonfiction', 'trade-fiction-paperback', 'mass-market-paperback', 'paperback-nonfiction', 'e-book-fiction', 'e-book-nonfiction', 'hardcover-advice', 'paperback-advice', 'advice-how-to-and-miscellaneous', 'chapter-books', 'childrens-middle-grade', 'childrens-middle-grade-e-book', 'childrens-middle-grade-hardcover', 'childrens-middle-grade-paperback', 'paperback-books', 'picture-books', 'series-books', 'young-adult', 'young-adult-e-book', 'young-adult-hardcover', 'young-adult-paperback', 'hardcover-graphic-books', 'paperback-graphic-books', 'manga', 'combined-print-fiction', 'combined-print-nonfiction']


In [4]:
# Get the data for each bestseller list.
bestselling_books = {}
for weekly_list in weekly_list_names:
    print(weekly_list)
    response = resolve_books_list(weekly_list)
    books_list = response['results']
    for b in books_list:
        #print(b)
        rank=b['rank']
        title=b['book_details'][0]['title']
        weeks_on_list = b['weeks_on_list']
        
        try:
            isbn=b['isbns'][0]['isbn13']
        except IndexError:
            # Some e-books don't have an ISBN - don't worry about those.
            isbn=None
            
        print("{rank} {title} ({list_name}, {weeks} weeks) {isbn}".format(
                rank=rank, 
                title=title, 
                list_name=weekly_list,
                weeks=weeks_on_list,
                isbn=isbn))
        summary={'title':title, 'isbn':isbn, 'rank':rank, 'list':weekly_list, 'weeks_on_list':weeks_on_list}
        
        if isbn is not None:
            bestselling_books[isbn] = summary
        
print("Found {count} bestselling books".format(count=len(bestselling_books)))

for isbn in bestselling_books:
    review = resolve_books_reviews(isbn)
    count=review['num_results']
    print("{isbn} has {count} reviews".format(isbn=isbn, count=count))
    
    # Add a new 'reviews' attribute for the book.
    bestselling_books[isbn]['reviews'] = count
    
print(bestselling_books)  

combined-print-and-e-book-fiction
books_v3_lists_combined-print-and-e-book-fiction.json
get_cached_nyt_json(): returning value from cache file: ..\data\books_v3_lists_combined-print-and-e-book-fiction.json
1 THE SHACK (combined-print-and-e-book-fiction, 5 weeks) 9780964729230
2 DEVIL IN SPRING (combined-print-and-e-book-fiction, 1 weeks) 9780062371904
3 BIG LITTLE LIES (combined-print-and-e-book-fiction, 28 weeks) 9780399167065
4 A MAN CALLED OVE (combined-print-and-e-book-fiction, 40 weeks) 9781476738024
5 A DOG'S PURPOSE (combined-print-and-e-book-fiction, 12 weeks) 9780765326263
6 AFTERMATH: EMPIRE'S END (combined-print-and-e-book-fiction, 1 weeks) 9781101966969
7 ECHOES IN DEATH (combined-print-and-e-book-fiction, 3 weeks) 9781250123114
8 HEARTBREAK HOTEL (combined-print-and-e-book-fiction, 2 weeks) 9780345541437
9 LINCOLN IN THE BARDO (combined-print-and-e-book-fiction, 2 weeks) 9780812995343
10 NORSE MYTHOLOGY (combined-print-and-e-book-fiction, 3 weeks) 9780393609097
11 MILK AND

In [5]:
# Set up the structure for our summary data.
summary_data = {}
for weekly_list in weekly_list_names:
    summary_data[weekly_list] = { 'reviewed': {'count':0, 'weeks': 0}, 'unreviewed': {'count': 0, 'weeks': 0}}


# For each book, find whether or not it has been reviewed and the number of weeks it has been on the list.
for key in bestselling_books:
    item = bestselling_books[key]
    
    review_count = item['reviews']
    weeks = item['weeks_on_list']
    list_name = item['list']
    
    if review_count > 0:
        # Add to reviewed books for this list.
        summary_data[list_name]['reviewed']['count'] += 1
        summary_data[list_name]['reviewed']['weeks'] += weeks

    else:
        # Add to unreviewed books for this list.
        summary_data[list_name]['unreviewed']['count'] += 1
        summary_data[list_name]['unreviewed']['weeks'] += weeks


# For each list, compare the weeks for reviewed and unreviewed books.
for weekly_list in weekly_list_names:
    list_data = summary_data[weekly_list]
    review_count = list_data['reviewed']['count']

    # Don't compare reviewed vs unreviewed if there are no reviewed books.
    if review_count > 0:
        print(weekly_list)
        print(list_data)
    
        if 0 == review_count:
            reviewed_average_weeks = 0
        else:
            reviewed_average_weeks = list_data['reviewed']['weeks'] / review_count

        if 0 == list_data['unreviewed']['count']:
            unreviewed_average_weeks = 0
        else:
            unreviewed_average_weeks = list_data['unreviewed']['weeks'] / list_data['unreviewed']['count']

        print("Reviewed books: {reviewed}, unreviewed books: {unreviewed}".format(
                reviewed=reviewed_average_weeks, 
                unreviewed=unreviewed_average_weeks))

# Compare an overall total.
total_reviewed_weeks = 0
total_reviewed_count = 0

total_unreviewed_weeks = 0
total_unreviewed_count = 0

for weekly_list in weekly_list_names:
    list_data = summary_data[weekly_list]
    
    total_reviewed_count += list_data['reviewed']['count']
    total_reviewed_weeks += list_data['reviewed']['weeks']
    total_unreviewed_count += list_data['unreviewed']['count']
    total_unreviewed_weeks += list_data['unreviewed']['weeks']

print(total_reviewed_count)
print(total_reviewed_weeks)
print(total_unreviewed_count)
print(total_unreviewed_weeks)

total_reviewed_average_weeks = total_reviewed_weeks / total_reviewed_count
total_unreviewed_average_weeks = total_unreviewed_weeks / total_unreviewed_count

print("Total - reviewed books: {reviewed}, unreviewed books: {unreviewed}".format(
        reviewed=total_reviewed_average_weeks, 
        unreviewed=total_unreviewed_average_weeks))

hardcover-fiction
{'reviewed': {'weeks': 47, 'count': 4}, 'unreviewed': {'weeks': 43, 'count': 9}}
Reviewed books: 11.75, unreviewed books: 4.777777777777778
hardcover-nonfiction
{'reviewed': {'weeks': 6, 'count': 1}, 'unreviewed': {'weeks': 32, 'count': 5}}
Reviewed books: 6.0, unreviewed books: 6.4
trade-fiction-paperback
{'reviewed': {'weeks': 17, 'count': 1}, 'unreviewed': {'weeks': 240, 'count': 7}}
Reviewed books: 17.0, unreviewed books: 34.285714285714285
mass-market-paperback
{'reviewed': {'weeks': 21, 'count': 1}, 'unreviewed': {'weeks': 21, 'count': 13}}
Reviewed books: 21.0, unreviewed books: 1.6153846153846154
paperback-nonfiction
{'reviewed': {'weeks': 34, 'count': 2}, 'unreviewed': {'weeks': 159, 'count': 5}}
Reviewed books: 17.0, unreviewed books: 31.8
e-book-fiction
{'reviewed': {'weeks': 21, 'count': 2}, 'unreviewed': {'weeks': 17, 'count': 11}}
Reviewed books: 10.5, unreviewed books: 1.5454545454545454
e-book-nonfiction
{'reviewed': {'weeks': 3, 'count': 3}, 'unreview