In [1]:
import requests
from scrapy.http import TextResponse
import re
import json

In [2]:
# Recipe search endpoint (JSON format); callers append a numeric index after "No=".
mainUrl = "https://www.ah.be/allerhande/recepten-zoeken?format=json&No="
# Prefix that is joined with a record's "recordState" value to form a recipe URL.
recipePrefix = "https://www.ah.be/allerhande/recept"
# Headers sent with every request.
# NOTE(review): the "Chromium/58: .0.3029.110" token looks garbled (probably
# meant "Chromium/58.0.3029.110") - confirm before changing the header.
user_agent = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Ubuntu Chromium/58: .0.3029.110 Chrome/58.0.3029.110 Safari/537.36'
    ),
}
# Number of the listing page currently being processed (1-based).
page = 1


In [3]:
def getResponse(url, user_agent, timeout=30):
    """Download ``url`` and wrap the body in a scrapy ``TextResponse``.

    Args:
        url: Address to fetch with an HTTP GET.
        user_agent: Dict of request headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole crawl.

    Returns:
        A ``TextResponse`` built from the final URL and the decoded body, so
        XPath queries can be run against it.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    r = requests.get(url, headers=user_agent, timeout=timeout)
    return TextResponse(r.url, body=r.text, encoding='utf-8')

In [4]:
def getReceptUrls(previousIndex):
    """Return the recipe URLs listed by the search endpoint for one index.

    ``previousIndex`` is appended to ``mainUrl``; the JSON answer is then
    walked along a fixed path down to its list of records, and each record's
    ``detailsAction.recordState`` value is appended to ``recipePrefix``.

    Raises KeyError/IndexError when the JSON does not have the expected
    nesting.
    """
    listing = getResponse(mainUrl + str(previousIndex), user_agent)
    payload = json.loads(listing.text)
    # Fixed positions inside the endpoint's nested layout.
    main_content = payload["contents"][0]["contentArea"][2]["mainContent"]
    records = main_content[1]["contents"][0]["records"]
    return [
        recipePrefix + record["detailsAction"]["recordState"]
        for record in records
    ]

In [5]:
def title(response):
    """Return the recipe name taken from the ``<h1 itemprop="name">`` text.

    Soft hyphens (U+00AD) inside the heading are stripped. Raises IndexError
    when the page has no matching heading text.
    """
    soft_hyphen = '\u00ad'
    heading_texts = response.xpath('//h1 [@itemprop="name"]/text()').extract()
    return heading_texts[0].replace(soft_hyphen, '')

In [6]:
def description(response):
    """Return the text of the first ``<h1 class="hidden-phones">`` element.

    The previous version extracted the text but never returned it, so every
    call produced ``None``.

    Raises IndexError when the page has no matching element.
    """
    # NOTE(review): confirm against the page markup that this h1 really
    # carries the recipe description.
    texts = response.xpath('//h1 [@class="hidden-phones"]/text()').extract()
    return texts[0]

In [7]:
def rating(response):
    """Count the star elements marked active inside the first rating block.

    Looks at the ``class`` attribute of every ``ul/li/div`` under the first
    ``<div class="rating ">`` and counts those containing the substring
    "active". Raises IndexError when the page has no rating block.
    """
    # NOTE(review): a class such as "inactive" would also be counted, since
    # only the substring is checked - verify against the real markup.
    rating_block = response.xpath('//div [@class="rating "]')[0]
    star_classes = rating_block.xpath('.//ul/li/div/@class').extract()
    return sum(1 for css_class in star_classes if 'active' in css_class)

In [8]:
def votes(response):
    """Return the vote count shown in the rating block, or -1 if absent.

    The count is read from the first ``span`` inside
    ``<span class="all-rates">`` and converted with ``int``; a non-numeric
    text therefore raises ValueError.
    """
    vote_texts = response.xpath(
        '//div [@class="rating "]/span [@class="all-rates"]/span/text()'
    ).extract()
    return int(vote_texts[0]) if vote_texts else -1

In [9]:
# Ingredient contents
def ingredients(response):
    """Collect the ingredients listed on a recipe page.

    Every ``<li itemprop="ingredients">`` is inspected through the ``data-*``
    attributes of its links. An item without a ``data-search-term`` is
    skipped entirely. For the others a dict is returned holding, in this
    order and only when present: ``description``, ``component``, ``amount``
    and ``unit`` (each the first matching attribute value).
    """
    # Output key -> attribute query, in the order the keys must appear.
    attribute_queries = (
        ('description', './/a/@data-default-label'),
        ('component', './/a/@data-search-term'),
        ('amount', './/a/@data-quantity'),
        ('unit', './/a/@data-quantity-unit-singular'),
    )
    collected = []
    for item in response.xpath('//li [@itemprop="ingredients"]'):
        found = {key: item.xpath(query).extract() for key, query in attribute_queries}
        if not found['component']:
            # Without a search term the item is not recorded at all.
            continue
        collected.append({key: values[0] for key, values in found.items() if values})
    return collected

In [10]:
def instructions(response):
    """Return the preparation steps as a list of strings.

    Each entry is a text node found directly under an ``<li>`` of the
    ordered list inside ``<section class="preparation">``.
    """
    step_texts = response.xpath('//section [@class="preparation"]/ol/li/text()').extract()
    return list(step_texts)

In [11]:
def labels(response):
    """Return the tag texts of a recipe page as a list of strings.

    The texts come from the links inside ``<ul class="tags">`` within
    ``<section class="tags">``.
    """
    tag_texts = response.xpath('//section [@class="tags"]/ul [@class="tags"]/li/a/text()').extract()
    return list(tag_texts)

In [12]:
def createRecipe(url):
    """Download one recipe page and return its scraped fields as a dict.

    The dict holds, in this order: ``url``, ``name``, ``rating``,
    ``maxRating`` (always 5), ``votes``, ``ingredients``, ``instructions``
    and ``labels``. Errors raised by the individual extractors propagate.
    """
    recipe_page = getResponse(url, user_agent)
    recipe = {'url': url}
    recipe['name'] = title(recipe_page)
    recipe['rating'] = rating(recipe_page)
    recipe['maxRating'] = 5
    recipe['votes'] = votes(recipe_page)
    recipe['ingredients'] = ingredients(recipe_page)
    recipe['instructions'] = instructions(recipe_page)
    recipe['labels'] = labels(recipe_page)
    return recipe

In [13]:
def _export_batch(batch, page_number):
    """Write ``batch`` as UTF-8 JSON to ``data<page_number>.txt``."""
    filename = 'data' + str(page_number) + '.txt'
    print('EXPORTING TO ' + filename)
    with open(filename, 'w', encoding='utf8') as outfile:
        json.dump(batch, outfile, ensure_ascii=False)


# Number of listing pages kept in memory before they are flushed to disk.
PAGES_PER_EXPORT = 100

# Reset the page counter here instead of relying on the configuration cell:
# the loop increments it, so re-running this cell would otherwise continue
# from a stale page number.
page = 1
data = {'recipes': []}
# Recipes processed so far; also the index handed to getReceptUrls for the
# next listing page.
index = 0
try:
    while True:
        print("PAGE " + str(page))
        receptPages = getReceptUrls(index)
        if not receptPages:
            # An empty listing ends the crawl.
            break
        # All recipes on one listing page.
        for receptUrl in receptPages:
            index += 1
            print(receptUrl)
            print("RECEPT " + str(index))
            data['recipes'].append(createRecipe(receptUrl))

        if page % PAGES_PER_EXPORT == 0:
            _export_batch(data, page)
            data['recipes'] = []

        page += 1
finally:
    # Runs on normal completion as well as when the crawl fails or is
    # interrupted, so recipes gathered since the last periodic export are
    # written out instead of being lost with the exception.
    _export_batch(data, page)
print("DONE")

PAGE 1
https://www.ah.be/allerhande/recept/R-R1190455/traktatie-van-eetbare-slakken
RECEPT 1
https://www.ah.be/allerhande/recept/R-R1190454/traktatie-van-appels-in-monstervorm
RECEPT 2
https://www.ah.be/allerhande/recept/R-R1190453/galaxy-slush-cocktail
RECEPT 3
https://www.ah.be/allerhande/recept/R-R1190452/vegan-nachos-uit-de-oven-met-tomatensalsa
RECEPT 4
https://www.ah.be/allerhande/recept/R-R1190451/vegan-banh-mi
RECEPT 5
https://www.ah.be/allerhande/recept/R-R1190450/taco-s-met-crispy-polenta-halloumi
RECEPT 6
https://www.ah.be/allerhande/recept/R-R1190449/noten-rozijnenbrood
RECEPT 7
https://www.ah.be/allerhande/recept/R-R1190448/pizza-tartufato
RECEPT 8
https://www.ah.be/allerhande/recept/R-R1190447/pastasalade-met-tonijn
RECEPT 9
https://www.ah.be/allerhande/recept/R-R1190446/naan-met-zalm-in-sojasaus-en-sojabonenspread
RECEPT 10
https://www.ah.be/allerhande/recept/R-R1190445/kip-en-groenten-van-de-bakplaat
RECEPT 11
https://www.ah.be/allerhande/recept/R-R1190444/sushisalade-m