In [1]:
import urllib.request
from bs4 import BeautifulSoup
from collections import defaultdict
import datetime
import json

# Search endpoint requested with an empty `phrase` parameter; this is the page
# that gets downloaded and parsed below.
BLANK_SEARCH_PAGE = 'https://michaelyingling.com/random/calvin_and_hobbes/search.php?phrase='
# File the final list of strip records is written to as JSON.
OUT_FILE = 'complete_calvin_and_hobbes.json'

In [2]:
# Download the search results page and parse it. The response is opened in a
# `with` block so the underlying HTTP connection is closed as soon as
# BeautifulSoup has read the markup (it consumes the file-like object inside
# its constructor), instead of being left open for the life of the kernel.
with urllib.request.urlopen(BLANK_SEARCH_PAGE) as page:
    soup = BeautifulSoup(page, 'html.parser')

# Each <fieldset> element is treated as one comic strip entry.
strips = soup.find_all('fieldset')

In [3]:
def parse_appears_in(appears_in):
    """Convert the book anchors of one strip into plain dictionaries.

    Args:
        appears_in: iterable of anchor elements, each exposing ``.get('href')``
            and an ``.img`` child exposing ``.get('title')`` / ``.get('src')``.

    Returns:
        A list with one dict per anchor, holding:
          * ``title`` - the image's ``title`` attribute,
          * ``link``  - the anchor's ``href`` with any query string removed,
          * ``cover`` - the last path segment of the image's ``src``.
    """
    parsed = []
    for anchor in appears_in:
        image = anchor.img
        parsed.append({
            'title': image.get('title'),
            # keep only the part of the URL before the first '?'
            'link': anchor.get('href').split('?')[0],
            # keep only the file name of the cover image
            'cover': image.get('src').split('/')[-1],
        })
    return parsed

def parse_strip(strip):
    """Extract the fields of a single strip element into a dictionary.

    Args:
        strip: a parsed element for one strip. It is expected to contain a
            ``<strong>`` tag (date), ``<div>`` tags with the classes
            ``quote``, ``description`` and ``books``, and an ``<a>`` tag.

    Returns:
        A dict with the keys ``date``, ``script``, ``description``, ``link``
        and ``appears_in`` (the latter produced by ``parse_appears_in``).
    """
    def _div(css_class):
        # first <div> inside the strip that carries the given class
        return strip.find('div', class_=css_class)

    record = {'date': strip.strong.string}
    record['script'] = _div('quote').string
    record['description'] = _div('description').string
    # href of the first anchor in the strip
    record['link'] = strip.a.get('href')
    record['appears_in'] = parse_appears_in(_div('books').find_all('a'))
    return record

# Spot-check: parse the first strip and display the resulting record.
parse_strip(strips[0])

{'date': '18 NOV 1985',
 'script': "So long Pop! I'm off to check my tiger trap! I rigged a tuna fish sandwich yesterday, so I'm sure to have a tiger by now! They like tuna fish, huh? Tigers will do anything for a tuna fish sandwich. We're kind of stupid that way.  Munch Munch",
 'description': "Calvin is off to check his tiger trap. Since Calvin baited it with a tuna fish sandwich, he's sure he'll have caught a tiger. Calvin tells his Dad that tigers will do anything for tuna fish. Hobbes, hanging by his foot in the trap, says tigers are kind of stupid that way. ",
 'link': 'http://www.gocomics.com/calvinandhobbes/1985/11/18',
 'appears_in': [{'title': 'Calvin and Hobbes',
   'link': 'http://www.amazon.com/gp/product/0836220889',
   'cover': 'calvin_and_hobbes.jpg'},
  {'title': 'Something Under the Bed is Drooling',
   'link': 'http://www.amazon.com/gp/product/0836218256',
   'cover': 'something_under_the_bed_is_drooling.jpg'},
  {'title': 'The Essential Calvin and Hobbes',
   'link'

In [4]:
# Parse every strip element into its dictionary form, preserving page order.
parsed_strips = [parse_strip(strip) for strip in strips]

In [5]:
# Catalogue of every distinct book referenced by any strip. Entries are keyed
# by the stringified hash of the scraped title; note that string hashes are
# only stable within a single interpreter session, so these keys must not be
# persisted or hard-coded.
books = defaultdict(dict)

for record in parsed_strips:
    for entry in record['appears_in']:
        key = str(hash(entry['title']))
        # later occurrences of the same title overwrite earlier ones
        books[key].update(
            title=entry['title'],
            link=entry['link'],
            cover=entry['cover'],
        )

# some book titles are cut off because of mishandled escape characters
dict(books)

{'-8215720599231991787': {'title': 'Calvin and Hobbes',
  'link': 'http://www.amazon.com/gp/product/0836220889',
  'cover': 'calvin_and_hobbes.jpg'},
 '-8380923391474116146': {'title': 'Something Under the Bed is Drooling',
  'link': 'http://www.amazon.com/gp/product/0836218256',
  'cover': 'something_under_the_bed_is_drooling.jpg'},
 '-4222594290859131387': {'title': 'The Essential Calvin and Hobbes',
  'link': 'http://www.amazon.com/gp/product/0836218051',
  'cover': 'the_essential_calvin_and_hobbes.jpg'},
 '-2430226368731180244': {'title': 'The Complete Calvin and Hobbes',
  'link': 'http://www.amazon.com/gp/product/0740748475',
  'cover': 'the_complete_calvin_and_hobbes.jpg'},
 '2134887979636986130': {'title': 'Yukon Ho!',
  'link': 'http://www.amazon.com/gp/product/0836218353',
  'cover': 'yukon_ho.jpg'},
 '7213657484768236466': {'title': 'The Authoritative Calvin and Hobbes',
  'link': 'http://www.amazon.com/gp/product/0836218221',
  'cover': 'the_authoritative_calvin_and_hobbes.

In [6]:
# manually correct book titles
#
# `books` is keyed by str(hash(title)), and string hashes are salted per
# interpreter process. A hard-coded key therefore only matches in the kernel
# session it was copied from; on a fresh kernel, indexing the defaultdict with
# it would silently create a new, incomplete entry and leave the real one
# uncorrected. Instead, locate each cut-off entry by matching its scraped
# title against the beginning of the corrected title. Only the 'title' field
# is changed in place, so the existing keys of `books` stay valid.
corrected_titles = (
    'Scientific Progress Goes "Boink"',
    'There\'s Treasure Everywhere',
    'It\'s a Magical World',
)

for book in list(books.values()):
    # NOTE(review): assumes a cut-off title is a non-empty prefix of the real
    # one, possibly followed by a stray backslash -- confirm against the
    # scraped data.
    scraped_title = book.get('title', '').rstrip('\\')
    candidates = [
        full_title for full_title in corrected_titles
        if scraped_title
        and full_title != scraped_title
        and full_title.startswith(scraped_title)
    ]
    # only rewrite when the match is unambiguous
    if len(candidates) == 1:
        book['title'] = candidates[0]

# Surface corrections that could not be applied instead of failing silently.
titles_present = {book.get('title') for book in books.values()}
titles_missing = [t for t in corrected_titles if t not in titles_present]
if titles_missing:
    print('WARNING: no scraped book matched these corrected titles:', titles_missing)

In [7]:
# Display the catalogue again, after the manual title corrections.
dict(books)

{'-8215720599231991787': {'title': 'Calvin and Hobbes',
  'link': 'http://www.amazon.com/gp/product/0836220889',
  'cover': 'calvin_and_hobbes.jpg'},
 '-8380923391474116146': {'title': 'Something Under the Bed is Drooling',
  'link': 'http://www.amazon.com/gp/product/0836218256',
  'cover': 'something_under_the_bed_is_drooling.jpg'},
 '-4222594290859131387': {'title': 'The Essential Calvin and Hobbes',
  'link': 'http://www.amazon.com/gp/product/0836218051',
  'cover': 'the_essential_calvin_and_hobbes.jpg'},
 '-2430226368731180244': {'title': 'The Complete Calvin and Hobbes',
  'link': 'http://www.amazon.com/gp/product/0740748475',
  'cover': 'the_complete_calvin_and_hobbes.jpg'},
 '2134887979636986130': {'title': 'Yukon Ho!',
  'link': 'http://www.amazon.com/gp/product/0836218353',
  'cover': 'yukon_ho.jpg'},
 '7213657484768236466': {'title': 'The Authoritative Calvin and Hobbes',
  'link': 'http://www.amazon.com/gp/product/0836218221',
  'cover': 'the_authoritative_calvin_and_hobbes.

In [8]:
# Post-process every strip record in place:
#   * replace the date string with {'human_readable', 'timestamp'},
#   * replace each scraped book dict with the shared entry from `books`.
for ps in parsed_strips:
    # A record whose date is already a dict was converted by an earlier run of
    # this cell. Skip it so the cell can be re-run safely: otherwise
    # `.replace` would fail on the dict, and the book lookup below would hash
    # already-corrected titles and miss the catalogue keys.
    if isinstance(ps['date'], dict):
        continue

    # convert date to a UTC timestamp for efficient range querying
    # (long month spellings are shortened to the three-letter form %b expects)
    date_string = ps['date'].replace('JUNE', 'JUN').replace('JULY', 'JUL').replace('SEPT', 'SEP')
    date_as_dt = datetime.datetime.strptime(date_string, '%d %b %Y')
    # the parsed datetime is naive; label it as UTC before taking the epoch value
    timestamp = int(date_as_dt.replace(tzinfo=datetime.timezone.utc).timestamp())
    ps['date'] = { 'human_readable': ps['date'], 'timestamp': timestamp }

    # re-attach book objects with correct metadata; the lookup hashes the
    # title as scraped, which is how the `books` keys were built
    ps['appears_in'] = [books[str(hash(ai['title']))] for ai in ps['appears_in']]

In [9]:
# The bare string below is an inline note describing the layout of each
# record in `parsed_strips` after post-processing; it is not assigned or used.
'''
{
  date: {
    human_readable: human readable date string
    timestamp: UTC timestamp, number of seconds since start of epoch
  }
  script: dialogue in strip
  description: description of strip
  link: GoComics link to strip
  appears_in: [
    {
      'title': title of C&H collection the strip appears in
      'link': link to Amazon item
      'cover': cover image filename
    }
  ]
}
'''

# Display the first record as a sample of the final structure.
parsed_strips[0]

{'date': {'human_readable': '18 NOV 1985', 'timestamp': 501120000},
 'script': "So long Pop! I'm off to check my tiger trap! I rigged a tuna fish sandwich yesterday, so I'm sure to have a tiger by now! They like tuna fish, huh? Tigers will do anything for a tuna fish sandwich. We're kind of stupid that way.  Munch Munch",
 'description': "Calvin is off to check his tiger trap. Since Calvin baited it with a tuna fish sandwich, he's sure he'll have caught a tiger. Calvin tells his Dad that tigers will do anything for tuna fish. Hobbes, hanging by his foot in the trap, says tigers are kind of stupid that way. ",
 'link': 'http://www.gocomics.com/calvinandhobbes/1985/11/18',
 'appears_in': [{'title': 'Calvin and Hobbes',
   'link': 'http://www.amazon.com/gp/product/0836220889',
   'cover': 'calvin_and_hobbes.jpg'},
  {'title': 'Something Under the Bed is Drooling',
   'link': 'http://www.amazon.com/gp/product/0836218256',
   'cover': 'something_under_the_bed_is_drooling.jpg'},
  {'title': 

In [10]:
# Serialize all strip records to a JSON string, then write it to OUT_FILE
# (the file is created or overwritten).
with open(OUT_FILE, 'w') as sink:
    sink.write(json.dumps(parsed_strips))