In [8]:
import requests
from bs4 import BeautifulSoup
import json
import re

# Scheme + host that is prepended to the 'url' values taken from the scraped JSON
# (product page links and product image links).
# NOTE(review): the listing page below is requested from the `www.` host while this
# is the bare host - confirm the bare host serves/redirects these paths as expected.
domain = 'https://nespresso.com'

def getJsonFromPageWithQuery(URL, selector, timeout=30):
    """Download a page and parse the JSON object embedded in one of its tags.

    The page at ``URL`` is fetched, the first element matching the CSS
    ``selector`` is rendered back to text, and everything from the first
    ``{`` to the last ``}`` of that text is decoded as JSON.

    Args:
        URL: Address of the page to download.
        selector: CSS selector identifying the element that carries the JSON.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; without
            one a stalled connection would block the notebook indefinitely.

    Returns:
        The decoded JSON value (whatever ``json.loads`` produces).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If no element matches ``selector``.
        json.JSONDecodeError: If the matched element contains no valid JSON.
    """
    page = requests.get(URL, timeout=timeout)
    # Fail here on 4xx/5xx instead of trying to scrape an error page.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')
    matches = soup.select(selector)
    if not matches:
        # Same exception type as indexing an empty list, but with a usable message.
        raise IndexError('no element matching %r found at %s' % (selector, URL))

    tag = str(matches[0])
    # The JSON is wrapped in other text; keep only the outermost {...} span.
    json_string = tag[tag.find("{"):tag.rfind("}") + 1]
    return json.loads(json_string)

In [9]:
# Fetch the capsule listing page and decode the JSON held in the <script> tag that
# directly follows the product-list <div>.
main_list = getJsonFromPageWithQuery('https://www.nespresso.com/sg/en/order/capsules/original', 'div[id^=respProductListPLPCapsule]+script')

ecommerce_data = main_list['configuration']['eCommerceData']
products = ecommerce_data['products']
categories = ecommerce_data['categories']

# Keep the categories whose parent is the capsule-range category, minus the two excluded ids.
excluded_range_ids = ['nesclub2.sg.b2c/cat/capsule-range-limited-editions', 'nesclub2.sg.b2c/cat/capsule-range-assortment']
capsule_ranges = [x for x in categories if ('nesclub2.sg.b2c/cat/capsule-range' in x['superCategories'] and x['id'] not in excluded_range_ids)]
print(capsule_ranges)

# A run of digits, an optional single whitespace character, then "mg".
# Compiled once here rather than on every capsule.
caffeine_pattern = re.compile(r'(\d+)\s?mg')

simple_data = dict()
# The loop variable is deliberately not named `range`: at notebook top level that
# would shadow the builtin for every cell executed afterwards in the same kernel.
for capsule_range in capsule_ranges:
    print('Current range: %s' % capsule_range['name'])

    # capsule name -> extracted properties, for this range only
    range_items = dict()

    # products of type 'capsule' with unitQuantity 1 that list this range
    items = [x for x in products if capsule_range['id'] in x['ranges'] and x['type'] == 'capsule' and x['unitQuantity'] == 1]

    # grab the detailed JSON from the item's own page
    for item in items:
        print('Current capsule: %s' % item['name'])

        URL = domain+item['url']
        item_info = getJsonFromPageWithQuery(URL, 'div[id^=respProductDetailPDPCapsule]+script')
        product = item_info['configuration']['eCommerceData']['product']

        # extract properties: the first "<digits> mg" figure found in the ingredient texts
        description = [x['text'] for x in product['ingredients']]
        mg_match = caffeine_pattern.search(str(description))
        if mg_match is None:
            # no mg figure on this page; the capsule is left out of the output
            continue
        caffeine_mg = int(mg_match.group(1))
        print(caffeine_mg)
        image_url = domain+product['image']['url']

        # write item properties to output
        range_items[product['name']] = {'caffeine_mg': caffeine_mg, 'image_url': image_url}

    # write all data for this range
    simple_data[capsule_range['name']] = range_items

# json.dump escapes non-ASCII by default, so the explicit encoding only pins the
# file encoding and does not change the bytes written.
with open('data.json', 'w', encoding='utf-8') as outfile:
    json.dump(simple_data, outfile)


[{'id': 'nesclub2.sg.b2c/cat/capsule-range-variations-limited-editions', 'name': 'Festive Variations', 'description': 'Experience the flavours of the forest this festive season with our Limited Edition Festive Coffees.', 'icon': None, 'detailsIcon': None, 'url': None, 'capacityLabel': None, 'rangeLink': None, 'subCategories': [], 'superCategories': ['nesclub2.sg.b2c/cat/capsule-range']}, {'id': 'nesclub2.sg.b2c/cat/capsule-range-ispirazione-italiana', 'name': 'Ispirazione Italiana ', 'description': 'A broad range of espresso and ristretto coffees which will transport you to Italian iconic cities, inspired by the richness of Italian roasting traditions.', 'icon': None, 'detailsIcon': None, 'url': None, 'capacityLabel': None, 'rangeLink': None, 'subCategories': [], 'superCategories': ['nesclub2.sg.b2c/cat/capsule-range']}, {'id': 'nesclub2.sg.b2c/cat/capsule-range-world-explorations', 'name': 'World Explorations', 'description': 'Travel the world through coffee and discover a range inspi

KeyboardInterrupt: 