In [1]:
import requests
from scrapy.http import TextResponse
import re
import json

In [2]:
# Search-result listing URL; the crawl loop appends the page number to it.
mainUrl = "https://www.solo.be/nl/zoeken/?page="
# Prefix that receptPages() puts in front of every recipe href it finds on a listing page.
receptUrlPrefix = "https://www.solo.be"
# Headers passed to requests.get() by getResponse() for every request.
# NOTE(review): "Chromium/58: .0.3029.110" looks like a typo for
# "Chromium/58.0.3029.110" — confirm before changing the header value.
user_agent = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/58: .0.3029.110 Chrome/58.0.3029.110 Safari/537.36'}
# Listing page the crawl starts from. The crawl cell increments this variable
# in place, so its value after a run is no longer the value set here.
page = 10

In [3]:
def getResponse(url, user_agent, timeout=30):
    """Download ``url`` and wrap the result in a Scrapy ``TextResponse``.

    Args:
        url: Address to fetch.
        user_agent: Dict of HTTP headers forwarded to ``requests.get``.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a single stalled connection would block the
            whole crawl indefinitely.

    Returns:
        A ``TextResponse`` built from the final URL (after redirects) and the
        decoded body, so the page can be queried with ``.xpath(...)``.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    r = requests.get(url, headers=user_agent, timeout=timeout)
    return TextResponse(r.url, body=r.text, encoding='utf-8')

In [4]:
def receptPages(response):
    """Collect the absolute recipe URLs linked from one listing page.

    Anchors are matched on their exact ``class`` attribute. Plain recipe
    teasers are returned first, followed by video recipe teasers; within each
    group the links keep the order in which they appear in the page.

    Args:
        response: Object exposing ``.xpath(query)`` whose result has
            ``.extract()`` (e.g. a Scrapy ``TextResponse``).

    Returns:
        List of strings, each ``receptUrlPrefix`` followed by the anchor's
        ``href`` value. Empty when the page has no matching anchors.
    """
    teaser_classes = (
        'teaser teaser--new teaser--new--recipe',
        'teaser teaser--new teaser--new--recipe teaser--video',
    )
    urls = []
    for teaser_class in teaser_classes:
        # Selecting @href directly skips anchors that lack the attribute
        # instead of failing with IndexError on an empty extract() result.
        hrefs = response.xpath('//a[@class="%s"]/@href' % teaser_class).extract()
        urls.extend(receptUrlPrefix + href for href in hrefs)
    return urls

In [5]:
def title(response):
    """Return the recipe title, or ``'UNKNOWN'`` when the page has none.

    The title is the first text node of the ``<h1 itemprop="name">`` element.
    Only leading whitespace (tab, CR, LF, space) is removed; trailing
    whitespace is returned as found.
    """
    heading_texts = response.xpath('//h1 [@itemprop="name"]/text()').extract()
    if not heading_texts:
        return 'UNKNOWN'
    # Remove leading whitespace only.
    return re.sub('^[\t\r\n ]+', '', heading_texts[0])

In [6]:
def description(response):
    """Return the recipe description with surrounding whitespace removed.

    The value is the ``content`` attribute of the first
    ``<meta itemprop="description">`` element. An empty string is returned
    when no such element is present.
    """
    contents = response.xpath('//meta [@itemprop="description"]').xpath("@content").extract()
    if not contents:
        return ''
    # Strip the leading run first, then the trailing run.
    without_leading = re.sub('^[\t\r\n ]+', '', contents[0])
    return re.sub('[\t\r\n ]+$', '', without_leading)

In [7]:
def rating(response):
    """Return the recipe rating as an int, or ``-1`` when none is present.

    The value is parsed from the first text node of
    ``<span itemprop="ratingValue">``; ``int()`` raises ``ValueError`` if
    that text is not an integer.
    """
    found = response.xpath('//span [@itemprop="ratingValue"]/text()').extract()
    return int(found[0]) if found else -1

In [8]:
def votes(response):
    """Return the number of votes as an int, or ``-1`` when none is present.

    The value is parsed from the first text node of
    ``<span itemprop="ratingCount">``; ``int()`` raises ``ValueError`` if
    that text is not an integer.
    """
    counts = response.xpath('//span [@itemprop="ratingCount"]/text()').extract()
    return int(counts[0]) if counts else -1

In [9]:
# Ingredient list contents
def ingredients(response):
    """Parse the ingredient list into a list of dicts.

    Each ``<li>`` under ``<ul class="list list--ingredients">`` yields at most
    one entry:

    * If the item has a linked component (``span/a`` text), the entry has a
      ``'component'`` key and, when span text is present, a ``'measure'`` key
      holding the first span text with trailing whitespace removed.
    * Otherwise, if span text is present, the entry has a single
      ``'description'`` key: the first span text with runs of spaces
      collapsed and surrounding whitespace removed.
    * Items with neither are skipped.
    """
    parsed = []

    for item in response.xpath('//ul[@class="list list--ingredients"]/li'):
        span_texts = item.xpath('.//span/text()').extract()
        link_texts = item.xpath('.//span/a/text()').extract()

        if link_texts:
            entry = {'component': link_texts[0]}
            if span_texts:
                entry['measure'] = re.sub('[\t\r\n ]+$', '', span_texts[0])
            parsed.append(entry)
            continue

        if span_texts:
            # Free-text ingredient: normalise spacing, then trim both ends.
            text = re.sub(' +', ' ', span_texts[0])
            text = re.sub('^[\t\r\n ]+', '', text)
            text = re.sub('[\t\r\n ]+$', '', text)
            parsed.append({'description': text})

    return parsed

In [10]:
def instructions(response):
    """Return the preparation steps as a list of strings.

    Every text node directly inside an ``<li>`` of the ordered list under
    ``<div itemprop="recipeInstructions">`` becomes one element, unmodified
    and in document order.
    """
    step_texts = response.xpath('//div [@itemprop="recipeInstructions"]/ol/li/text()').extract()
    return [step for step in step_texts]

In [11]:
def labels(response):
    """Return the recipe's category labels, dropping empty strings.

    Labels are the text nodes of ``<a itemprop="recipeCategory">`` elements,
    kept in document order.
    """
    category_texts = response.xpath('//a [@itemprop="recipeCategory"]/text()').extract()
    return [text for text in category_texts if text != ""]

In [15]:
# Crawl the listing pages one after another, scrape every recipe linked from
# each page and accumulate the results in data['recipes'].
#
# NOTE(review): `page` is not initialised in this cell; it is a notebook-level
# variable that this cell advances. Re-running the cell therefore continues
# from wherever the previous run stopped, not from the configured start page.
data = {}
data['recipes']=[]
# Running recipe counter across all pages; it is not reset when data is exported.
index = 0
while True:
    print("PAGE " + str(page))
    mainResponse = getResponse(mainUrl + str(page), user_agent)
    pages = receptPages(mainResponse)
    # A listing page without any recipe links ends the crawl.
    if (len(pages) == 0):
        break
    # all recipes on one page
    for receptUrl in pages:
        index = index + 1
        print("RECEPT " + str(index))
        response = getResponse(receptUrl, user_agent)
        
        data['recipes'].append({
            'url':receptUrl,
            'name':title(response),
            'description':description(response),
            'rating':rating(response),
            'votes':votes(response),
            'ingredients':ingredients(response),
            'instructions':instructions(response),
            'labels':labels(response)
        })
    
    # Whenever the page number is a multiple of 100, write the recipes
    # collected so far to data<page>.txt and start a fresh in-memory batch.
    if page%100 == 0:
        print('EXPORTING TO ' + 'data' + str(page) + '.txt')
        with open('data' + str(page) + '.txt', 'w', encoding='utf8') as outfile:  
            json.dump(data, outfile, ensure_ascii=False)
        data['recipes']=[]
    page = page+ 1

# Final export of whatever was collected since the last periodic export.
# `page` is now the number of the first listing page that had no recipe links.
print('EXPORTING TO ' + 'data' + str(page) + '.txt')
with open('data' + str(page) + '.txt', 'w', encoding='utf8') as outfile:  
    json.dump(data, outfile, ensure_ascii=False)
print("DONE")

PAGE 1301
RECEPT 1
RECEPT 2
RECEPT 3
RECEPT 4
RECEPT 5
RECEPT 6
RECEPT 7
RECEPT 8
RECEPT 9
RECEPT 10
RECEPT 11
RECEPT 12
PAGE 1302
RECEPT 13
RECEPT 14
RECEPT 15
RECEPT 16
RECEPT 17
RECEPT 18
RECEPT 19
RECEPT 20
RECEPT 21
RECEPT 22
RECEPT 23
RECEPT 24
PAGE 1303
RECEPT 25
RECEPT 26
RECEPT 27
RECEPT 28
RECEPT 29
RECEPT 30
RECEPT 31
RECEPT 32
RECEPT 33
RECEPT 34
RECEPT 35
RECEPT 36
PAGE 1304
RECEPT 37
RECEPT 38
RECEPT 39
RECEPT 40
RECEPT 41
RECEPT 42
RECEPT 43
RECEPT 44
RECEPT 45
RECEPT 46
RECEPT 47
RECEPT 48
PAGE 1305
RECEPT 49
RECEPT 50
RECEPT 51
RECEPT 52
RECEPT 53
RECEPT 54
RECEPT 55
RECEPT 56
RECEPT 57
RECEPT 58
RECEPT 59
RECEPT 60
PAGE 1306
RECEPT 61
RECEPT 62
RECEPT 63
RECEPT 64
RECEPT 65
RECEPT 66
RECEPT 67
RECEPT 68
RECEPT 69
RECEPT 70
RECEPT 71
RECEPT 72
PAGE 1307
RECEPT 73
RECEPT 74
RECEPT 75
RECEPT 76
RECEPT 77
RECEPT 78
RECEPT 79
RECEPT 80
RECEPT 81
RECEPT 82
RECEPT 83
RECEPT 84
PAGE 1308
RECEPT 85
RECEPT 86
RECEPT 87
RECEPT 88
RECEPT 89
RECEPT 90
RECEPT 91
RECEPT 92
RECEPT 93

RECEPT 702
RECEPT 703
RECEPT 704
RECEPT 705
RECEPT 706
RECEPT 707
RECEPT 708
PAGE 1360
RECEPT 709
RECEPT 710
RECEPT 711
RECEPT 712
RECEPT 713
RECEPT 714
RECEPT 715
RECEPT 716
RECEPT 717
RECEPT 718
RECEPT 719
RECEPT 720
PAGE 1361
RECEPT 721
RECEPT 722
RECEPT 723
RECEPT 724
RECEPT 725
RECEPT 726
RECEPT 727
RECEPT 728
RECEPT 729
RECEPT 730
RECEPT 731
RECEPT 732
PAGE 1362
RECEPT 733
RECEPT 734
RECEPT 735
RECEPT 736
RECEPT 737
RECEPT 738
RECEPT 739
RECEPT 740
RECEPT 741
RECEPT 742
RECEPT 743
RECEPT 744
PAGE 1363
RECEPT 745
RECEPT 746
RECEPT 747
RECEPT 748
RECEPT 749
RECEPT 750
RECEPT 751
RECEPT 752
RECEPT 753
RECEPT 754
RECEPT 755
RECEPT 756
PAGE 1364
RECEPT 757
RECEPT 758
RECEPT 759
RECEPT 760
RECEPT 761
RECEPT 762
RECEPT 763
RECEPT 764
RECEPT 765
RECEPT 766
PAGE 1365
EXPORTING TO data1365.txt
DONE
