In [60]:
from os import path
import gzip
import json

# Change to your directory with the dataset
# (joined with the dataset file name via path.join when the books are loaded)
DIR = '../database/'

In [None]:
# load the ginormous dataset

def load_data(file_name, head = 500):
    """Load a gzip-compressed JSON Lines file into a list.

    Parameters
    ----------
    file_name : str or path-like
        Path to a gzip file holding one JSON document per line.
    head : int, optional
        Accepted for backward compatibility only. It is NOT used: the whole
        file is always read, which is what the rest of the notebook relies on.

    Returns
    -------
    list
        The decoded JSON documents, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data = []
    # Binary mode is kept on purpose: json.loads accepts bytes and detects the
    # UTF-8/16/32 encoding itself.
    with gzip.open(file_name) as fin:
        for line in fin:
            # Blank / whitespace-only lines (e.g. a trailing newline) are not
            # JSON documents; skip them instead of crashing the whole load.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

# Read every record of the compressed books file found in DIR into memory
books = load_data(path.join(DIR, 'goodreads_books.json.gz'))

In [None]:
# Keep just the books in English
# .get() is used so a record without a "language_code" key is simply dropped
# instead of aborting the whole filter with a KeyError.
# Slice assignment keeps the same list object and replaces its contents in place.
books[:] = [book for book in books if book.get("language_code") == "eng"]

In [None]:
# Optional
# Write the English-only books to disk so the filtering above can be skipped next time
with open('goodreads_en.json', mode='w') as out_file:
    json.dump(books, out_file, indent=1)

In [None]:
# Optional
# Shortcut for later runs: read back the previously saved "eng"-only books
with open('goodreads_en.json', mode='r') as in_file:
    books = json.load(in_file)

In [None]:
import random

In [None]:
# Get a list with (at most) 100 000 random eng books
SAMPLE_SIZE = 100000
# Fixed seed so that re-running the notebook draws the same sample
SAMPLE_SEED = 42
# A dedicated Random instance keeps the global random state untouched.
# k is capped at len(books): random.sample raises ValueError when k exceeds
# the population size.
en_book_sample = random.Random(SAMPLE_SEED).sample(books, k=min(SAMPLE_SIZE, len(books)))

In [None]:
# Optional
# Store the random selection on disk so the same sample can be reloaded later
with open('random_sample_100000_en.json', mode='w') as out_file:
    json.dump(en_book_sample, out_file, indent=4)

In [None]:
# Optional
# For when we want to load the random sample
with open('random_sample_100000_en.json', 'r') as fp:
    # Load into en_book_sample: that is the name the following cells work on.
    en_book_sample = json.load(fp)
# Alias kept so anything still referring to the old name `data` keeps working.
data = en_book_sample

In [None]:
import spacy

In [None]:
# Load the model
# (the spaCy pipeline applied to the book descriptions below to find named entities;
#  the "en_core_web_sm" model package has to be installed for this call to succeed)
nlp = spacy.load("en_core_web_sm")

In [None]:
# Store the parts of each description identified as 'Countries, cities, states' (GPE)
# under the book's "geo" key. Books with no GPE entity get no "geo" key.
# A missing or empty description is treated as an empty text.
descriptions = (book.get("description") or "" for book in en_book_sample)
# nlp.pipe processes the texts as a stream and yields the docs in input order,
# so zip() pairs every doc with the book it was built from.
for book, description_lab in zip(en_book_sample, nlp.pipe(descriptions)):
    places = [ent.text for ent in description_lab.ents if ent.label_ == "GPE"]
    if places:
        # Assign rather than append: re-running this cell must not duplicate places.
        book["geo"] = places

In [None]:
# Drop every book that ended up without a "geo" key (in my run cuts down the size by half).
# The slice assignment replaces the contents of the existing list object.
en_book_sample[:] = list(filter(lambda candidate: "geo" in candidate, en_book_sample))


In [None]:
# Optional
# Write the reduced book list (only books carrying a "geo" key) to disk
with open('random_sample_en_GEO.json', mode='w') as out_file:
    json.dump(en_book_sample, out_file, indent=1)

In [92]:
import geocoder

# !!!!!!!!!!
# Provide your own Mapquest API key.
# Preferred: export it as the MAPQUEST_API_KEY environment variable so it never
# has to be typed into the notebook. The "YOUR_KEY" placeholder is the fallback.
# The key is sent as a URL query parameter, so it can show up in error messages
# printed as cell output - clear outputs before sharing the notebook.
import os
key_mapquest = os.environ.get("MAPQUEST_API_KEY", "YOUR_KEY")
# !!!!!!!!!!

# Number of places sent to Mapquest per batch request
max_batch_size = 100
#15k quota
# place text -> country value returned by the geocoder
place_country_pairs = {}
# unique place texts that still have to be sent to the geocoder
to_be_queried = []

In [93]:
# Get a list of unique places from all "geo"'s - to decrease the amount of queries
# A companion set makes the "already seen?" test O(1); checking membership in the
# list itself is quadratic in the number of places. The list keeps first-seen order.
seen_places = set(to_be_queried)
for book in en_book_sample:
    for place in book["geo"]:
        if place not in seen_places:
            seen_places.add(place)
            to_be_queried.append(place)

In [104]:
# Send the places to Mapquest in slices of at most max_batch_size entries
for start in range(0, len(to_be_queried), max_batch_size):
    batch = to_be_queried[start:start + max_batch_size]
    responses = geocoder.mapquest(batch, key=key_mapquest, method="batch")
    # The n-th response is stored under the n-th place of the submitted slice
    for position, response in enumerate(responses):
        place = batch[position]
        try:
            place_country_pairs[place] = response.country
        except AttributeError:
            # response exposes no country attribute: record the "None" marker string
            place_country_pairs[place] = "None"

Status code 504 from http://www.mapquestapi.com/geocoding/v1/batch: ERROR - 504 Server Error: Gateway Time-out for url: http://www.mapquestapi.com/geocoding/v1/batch?key=RtAB8WumTskHR3Jbff8rMtgZKgF6scnu&location=Magnum&location=St.+Paul&location=Mordor&location=Tucson&location=Casphairn+Manor&location=Libera&location=Kipling&location=Eminent+Domain&location=Rielle&location=Outlaw&location=b%60lqthm&location=Hastings&location=Vordanai&location=Vhalnich&location=Marcus&location=Nugget&location=Sri+Lanka&location=Becky+Jordan&location=Biron&location=Novazine&location=Teodor&location=Dutton&location=Zephyre&location=Shala&location=Machlis&location=Los+Demonios&location=Subject&location=Keelen&location=Johnston&location=Taser&location=Limehouse&location=Herakles&location=Culloden&location=Malva&location=Belakarta&location=Ayrshire&location=The+City&location=Lymond+Chronicles&location=the+Orkney+Islands&location=Shropshire&location=Cadfael&location=Beatrix&location=Selma&location=Barnes&loca

In [106]:
# Ugly way of redoing failed requests:
# every place that has no entry in place_country_pairs yet still needs a query
not_queried = [place for place in to_be_queried if place not in place_country_pairs]


In [108]:
# Retry for what didn't work, the ugly way:
# same batching as the first pass, but only over the places still missing
batch_starts = range(0, len(not_queried), max_batch_size)
for offset in batch_starts:
    pending = not_queried[offset:offset + max_batch_size]
    answers = geocoder.mapquest(pending, key=key_mapquest, method="batch")
    for index, answer in enumerate(answers):
        queried_place = pending[index]
        try:
            country = answer.country
        except AttributeError:
            # no country attribute on this answer: keep the "None" marker string
            country = "None"
        place_country_pairs[queried_place] = country

In [114]:
# Optional
# Get a dict of the unique countries and how many places map to each of them
unique_countries = {}
for country in place_country_pairs.values():
    # start from 0 the first time a country is seen, then add one
    unique_countries[country] = unique_countries.get(country, 0) + 1

In [130]:
# ISO 3166-1 alpha-2 country code standard
# Display the country -> count mapping built in the previous cell
unique_countries

{'TH': 59,
 'US': 9878,
 'GB': 1154,
 'FR': 1013,
 'CA': 796,
 'PR': 18,
 'CR': 17,
 'DE': 369,
 'ZA': 437,
 'AU': 523,
 'IN': 822,
 'TN': 11,
 'HU': 54,
 'HT': 7,
 'BM': 3,
 'EE': 28,
 'PH': 277,
 'ES': 327,
 'EC': 20,
 'SE': 158,
 'MO': 8,
 'CN': 34,
 'JP': 120,
 'SG': 11,
 'NO': 143,
 'AD': 6,
 'GI': 3,
 'BR': 277,
 'IT': 463,
 'IL': 88,
 'PT': 69,
 'MX': 280,
 'MD': 12,
 'TR': 128,
 'NZ': 98,
 'IS': 12,
 'RU': 159,
 'JM': 19,
 'HK': 17,
 'UA': 37,
 'AL': 25,
 'PK': 19,
 'GH': 21,
 'VI': 9,
 'AT': 56,
 'TO': 8,
 'NP': 65,
 'CL': 65,
 'BG': 20,
 'FI': 81,
 'GE': 12,
 'ID': 89,
 'BE': 72,
 'RO': 63,
 'SI': 14,
 'JO': 18,
 'KE': 13,
 'ZM': 14,
 'MM': 9,
 'VN': 20,
 'IQ': 33,
 'DO': 22,
 'IE': 192,
 'SR': 14,
 'WS': 2,
 'CZ': 49,
 'SO': 3,
 'PL': 78,
 'GT': 11,
 'CY': 7,
 'HR': 38,
 'NL': 163,
 'SD': 10,
 'CG': 21,
 'MY': 45,
 'KN': 3,
 'LA': 9,
 'ZW': 12,
 'LK': 22,
 'BB': 12,
 'EG': 38,
 'LB': 13,
 'BO': 17,
 'AE': 20,
 'MA': 28,
 'AF': 4,
 'CH': 93,
 'GR': 93,
 'BQ': 3,
 'VE': 25,
 '

In [120]:
# Build a "country" list for each book: the distinct countries of its resolved places,
# in the order the places are first encountered
for book in en_book_sample:
    for place in book["geo"]:
        if place not in place_country_pairs:
            # this place has no geocoding entry; nothing to add
            continue
        country = place_country_pairs[place]
        # the key is only created once a first resolved place is found
        book_countries = book.setdefault("country", [])
        if country not in book_countries:
            book_countries.append(country)

In [143]:
# Example of how the result looks for Sudan
for book in en_book_sample:
    # "country" only exists on books with at least one resolved place,
    # so fall back to an empty list instead of raising KeyError.
    if "SD" in book.get("country", []):
        print("Title: ", book["title"], ", Country: ", book["country"], ", Places: ", book["geo"]) 

Title:  The Thorn of Emberlain (Gentleman Bastard, #4) , Country:  ['US', 'SD'] , Places:  ['the Kingdom of the', 'The Republic of Thieves']
Title:  Wrenching Fate (Brides of Prophecy, #1) , Country:  ['SD'] , Places:  ['Akasha', 'Akasha']
Title:  Dogstar Rising: A Makana Investigation , Country:  ['EG', 'ZA', 'SD', 'IT'] , Places:  ['Cairo', 'Makana', 'Sudan', 'Makana', 'Meera', 'Makana', 'Makana', 'Cairo', 'Luxor']
Title:  Flight of the Eagle (Frontier #3) , Country:  ['SD', 'AU', 'IN'] , Places:  ['Sudan', 'Sydney', 'Queensland', 'Queensland', 'Nerambura']
Title:  Imaro , Country:  ['SD', 'US'] , Places:  ['Sudan', 'SF', 'New York Timesbestseller']
Title:  We Need New Names , Country:  ['US', 'CA', 'AU', 'CH', 'BR', 'SO', 'IQ', 'SD', 'HT', 'AE'] , Places:  ['US', 'USA', 'Britain', 'Canada', 'Australia', 'Switzerland', 'Congo', 'Somalia', 'Iraq', 'Sudan', 'Haiti', 'America', 'Dubai']
Title:  What Is the What , Country:  ['SD', 'US'] , Places:  ['Sudan', 'the United States']
Title:  M