In [12]:
import dbm  # to catch the error raised when no shelve database exists
import json
import logging  # to do logging info and debug statements
import os.path
import re  # regular expressions
import shelve  # to save data to a shelve (kinda like matlab save workspace)
import time  # to get the time.sleep function
import traceback
import urllib.request

import requests  # to make HTTP URL requests
from bs4 import BeautifulSoup  # to extract data from html files

def loggingConfig():
    """Set up logging via ``logging.basicConfig``.

    Messages are emitted from DEBUG level upwards and rendered as
    ``HH:MM:SS: message``.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(message)s",
        datefmt="%H:%M:%S",
    )

loggingConfig()

shelveFile = "shelve.out"
url = "https://cubavodka.com/cocktails"

# The parsed cocktail page. Stays None until it is either restored from the
# shelve or downloaded below.
soup = None

# First try the local cache so the site is not requested on every run.
# NOTE(review): shelve unpickles the stored values, so only open shelve files
# that this script wrote itself.
try:
    logging.debug("try to open existing shelve")
    # Flag "r" never creates a file and raises dbm.error when no database
    # exists, independent of the file suffix the local dbm backend uses.
    my_shelf = shelve.open(shelveFile, "r")
except dbm.error:
    logging.debug("No existing shelve could be opened", exc_info=True)
else:
    logging.debug("Shelve opened")
    try:
        # Restore every stored variable under its original global name.
        for key in my_shelf:
            globals()[key] = my_shelf[key]
            print(str(key))
    except Exception:
        # Best effort: an unreadable entry must not abort the run. If "soup"
        # was not restored, the page is downloaded again below.
        logging.exception("Failed during extracting shelve key")
    finally:
        my_shelf.close()
        logging.debug("Shelve closed again")

if soup is None:
    # Nothing usable in the cache: fetch the page and create the cache anew.
    print("*** Pulling data from URL *** ")
    # Needed to add a different user agent, since the default python one
    # seems to be blocked:
    # https://stackoverflow.com/questions/56101612/python-requests-http-response-406
    response = requests.get(url, headers={"User-Agent": "XY"}, timeout=30)
    # Fail loudly on an HTTP error instead of parsing (and caching) an
    # error page.
    response.raise_for_status()
    # response.text is already decoded, so no from_encoding is passed.
    soup = BeautifulSoup(response.text, "html.parser")

    # Names of the global variables to persist.
    saveList = ["soup"]

    with shelve.open(shelveFile, "n") as my_shelf:  # 'n' for new
        for key in saveList:
            try:
                my_shelf[key] = globals()[key]
            except Exception:
                # Caching is optional; report the failure and carry on with
                # the in-memory data.
                print('ERROR shelving: {0}'.format(key))
                logging.debug("Shelving failed", exc_info=True)
else:
    print("*** Pulled from shelve ***")

def _parse_ingredient(ingredient):
    """Return ``(name, amount)`` for one ingredient ``<li>`` tag, or None.

    Two forms are recognised:

    * the tag has a single string: that string is the name and the amount
      is ``" "``;
    * the tag has several children and a ``<span>``: the second child is
      taken as the name and the span's string as the amount.

    None is returned when the tag matches neither form.
    """
    # Ingredient without an amount
    if ingredient.string is not None:
        return str(ingredient.string), " "

    # Ingredient with an amount
    children = ingredient.contents
    if len(children) > 1 and ingredient.span is not None:
        return str(children[1]), ingredient.span.string

    return None


# Compile the class matchers once instead of on every loop iteration.
_COCKTAIL_CLASS = re.compile("cocktail-item")
_INGREDIENTS_CLASS = re.compile("ingredients-wrap")

# Prepare an empty dictionary for all the recipes
recipeDict = {}

# Find all the different cocktails in the dataset
for tag in soup.body.find_all(class_=_COCKTAIL_CLASS):
    # Not used further below; None when the attribute is absent.
    drinkImgUrl = tag.get("data-largeimg")

    heading = tag.find("h3", attrs={"itemprop": True})
    drinkName = heading.string if heading is not None else None
    if drinkName is None:
        logging.warning("Skipping cocktail item without a readable name")
        continue

    # Put the drink into the recipeDict!
    ingredients = {}
    recipeDict[str(drinkName)] = {"ingredients": ingredients}

    recipe = tag.find(class_=_INGREDIENTS_CLASS)
    if recipe is None:
        # A drink without an ingredients section keeps an empty mapping.
        continue

    for ingredient in recipe.find_all("li", attrs={"itemprop": True}):
        parsed = _parse_ingredient(ingredient)
        if parsed is None:
            # Skip instead of re-using the values of the previous ingredient.
            logging.warning("Skipping unparsable ingredient of %s", drinkName)
            continue

        # Put it all into the dict!
        ingredientName, amount = parsed
        ingredients[ingredientName] = amount


print(json.dumps(recipeDict, indent=4, sort_keys=True))

# print(recipeDict.split("}},"))




11:28:04: try to open existing shelve
11:28:04: Shelve opened
11:28:04: Shelve closed again
*** Pulled from shelve ***
{
    "10 stk. Lyser\u00f8de Elefanter": {
        "ingredients": {
            "CUBA Strawberry": "4 cl. ",
            "Fl\u00f8deskum": "4 tsk. ",
            "Jordb\u00e6rsirup": "2 cl. ",
            "R\u00f8d sodavand": " "
        }
    },
    "10 stk. Strawberry Smoothie shots": {
        "ingredients": {
            "CUBA Strawberry": "6 cl. ",
            "Grenadine": "4 cl. ",
            "M\u00e6lk": "10 cl. "
        }
    },
    "3 farvet is": {
        "ingredients": {
            "CUBA Strawberry": "2 cl. ",
            "Cocio Chokoladem\u00e6lk": "10 cl. ",
            "Dooley's Original Toffee & Vodka": "2 cl. ",
            "Grenadine": "2 cl. "
        }
    },
    "After 8": {
        "ingredients": {
            "CUBA Cool Mint": "2 cl. ",
            "Cocio Chokoladem\u00e6lk": " "
        }
    },
    "Aloha": {
        "ingredients": {
        