In [40]:
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime


# Get the list of all listing-page numbers
def get_page_num(timeout=30):
    """Return the page numbers ``[1, 2, ..., last]`` of the vegetarian recipe listing.

    The last page number is read from the pagination item that sits directly
    before the "next" link on the first listing page.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of consecutive ints starting at 1. ``[1]`` is returned when the
        page shows no pagination (i.e. everything fits on a single page).

    Raises:
        requests.RequestException: On connection problems, timeouts or an
            HTTP error status.
        ValueError: If the pagination text does not end in a number.
    """
    url = 'https://www.feastingathome.com/recipes/?fwp_by_diet=vegetarian'
    resp = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of silently parsing it as a
    # one-page listing.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    next_item = soup.find('li', {'class': 'pagination-next'})
    last_item = next_item.find_previous_sibling() if next_item is not None else None
    if last_item is None:
        # No pagination controls: only the first page exists.
        return [1]

    # The code relies on the last space-separated token of the item's text
    # being the page number.
    max_num = last_item.text.split(' ')[-1]
    return list(range(1, int(max_num) + 1))

# Get every recipe link on the given listing page
def get_links(i, timeout=30):
    """Return the recipe URLs found on listing page number *i*.

    Args:
        i: Listing page number (inserted into the page URL).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``href`` strings, one per recipe card, in page order.
        Cards without an anchor or without an ``href`` are skipped so the
        list never contains ``None``.

    Raises:
        requests.RequestException: On connection problems or timeouts.
    """
    url = 'https://www.feastingathome.com/recipes/page/' + str(i) + '/?fwp_by_diet=vegetarian'
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.text, 'html.parser')

    link_list = []
    for card in soup.find_all('div', {'class': 'post-summary__image'}):
        anchor = card.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href:
            link_list.append(href)

    return link_list

# Nutrition labels as they appear before the colon, mapped to output keys.
# 'Total Fat' is accepted as well as 'Fat'; other '... Fat' lines
# (e.g. 'Saturated Fat') are deliberately not listed.
_NUTRITION_LABELS = {
    'Calories': 'calories',
    'Carbohydrates': 'carbs',
    'Protein': 'protein',
    'Fat': 'total_fat',
    'Total Fat': 'total_fat',
}


def _text_lines(node):
    """Return the non-blank text lines of *node* with non-breaking spaces removed.

    Returns an empty list when *node* is None.
    """
    if node is None:
        return []
    cleaned = (line.replace('\xa0', '') for line in node.text.split('\n'))
    # Filter after cleaning so a line made only of '\xa0' does not survive as ''.
    return [line for line in cleaned if line.strip()]


def _detail_value(details, css_class, label):
    """Return the text following *label* inside ``<li class=css_class>``, or None if absent."""
    if details is None:
        return None
    item = details.find('li', {'class': css_class})
    if item is None:
        return None
    parts = item.text.split(label)
    return parts[1] if len(parts) > 1 else None


def _nutrition_facts(node):
    """Return calories/carbs/protein/total_fat strings parsed from the nutrition box.

    Every value defaults to '' so a page without nutrition data yields empty
    fields instead of values left over from another recipe.
    """
    facts = {'calories': '', 'carbs': '', 'protein': '', 'total_fat': ''}
    if node is None:
        return facts
    for line in node.text.split('\n'):
        label, colon, value = line.partition(':')
        key = _NUTRITION_LABELS.get(label.strip())
        # Match on the whole label, not a substring, so 'Saturated Fat: ...'
        # cannot overwrite the total fat value.
        if colon and key is not None:
            facts[key] = value.strip()
    if facts['calories']:
        facts['calories'] += ' Kcal'
    return facts


def _first_site_image(soup):
    """Return the first image URL in the article body hosted on the site itself, or ''."""
    body = soup.find('div', {'class': 'entry-content'})
    if body is None:
        return ''
    for image in body.find_all('img'):
        src = image.get('src')
        if src and 'https://www.feastingathom' in src:
            return src
    return ''


# For the given link, collect the source (site), recipe name (title), ingredient list (ingredients),
# cooking time (time), serving, numbered instructions (recipe), nutrition and image
def get_contents(url, timeout=30):
    """Download one recipe page and extract its fields into a dict.

    Args:
        url: Recipe page URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys ``site`` (the input url), ``title``,
        ``ingredients`` (list of lines), ``time``, ``serving``, ``recipe``
        (list of "N. step" strings), ``calories``, ``carbs``, ``protein``,
        ``total_fat`` and ``image``. Any field that cannot be found on the
        page is '' (or an empty list).

    Raises:
        requests.RequestException: On connection problems or timeouts.

    Note:
        All state is local. Nutrition values are no longer written to module
        globals, so results cannot leak between recipes or race between
        worker threads.
    """
    soup = BeautifulSoup(requests.get(url, timeout=timeout).text, 'html.parser')

    # Recipe name (title)
    title_node = soup.find('h1', {'class': 'page-header__title'})
    title = title_node.text if title_node is not None else ''

    # Ingredient list (ingredients)
    ing_list = _text_lines(soup.find('div', {'class': 'tasty-recipes-ingredients-body'}))

    # Cooking time (time) and serving both live in the same details box
    details = soup.find('div', {'class': 'tasty-recipes-details'})
    cook_time = _detail_value(details, 'total-time', 'Total Time: ') or ''
    if details is None:
        serving = ''
    else:
        # A details box without a yield entry is treated as a single serving.
        yield_text = _detail_value(details, 'yield', 'Yield: ')
        serving = ('1' if yield_text is None else yield_text) + ' serving'

    # Instructions (recipe), numbered from 1
    steps = _text_lines(soup.find('div', {'class': 'tasty-recipes-instructions-body'}))
    recipe_list = [f'{number}. {step}' for number, step in enumerate(steps, start=1)]

    # Nutrition
    nutrition = _nutrition_facts(soup.find('div', {'class': 'tasty-recipes-nutrition'}))

    contents = {
        'site': url,
        'title': title,
        'ingredients': ing_list,
        'time': cook_time,
        'serving': serving,
        'recipe': recipe_list,
        'calories': nutrition['calories'],
        'carbs': nutrition['carbs'],
        'protein': nutrition['protein'],
        'total_fat': nutrition['total_fat'],
        # Image: first one hosted on the site, '' when none qualifies
        'image': _first_site_image(soup),
    }

    return contents

# Crawl the recipes of every listing page
def get_all_page_comment(nums):
    """Crawl every recipe reachable from the given listing pages.

    Listing pages and recipe pages are both fetched through a 10-worker
    thread pool. ``executor.map`` keeps the work running concurrently while
    still yielding results in submission order. Submitting one task and
    immediately waiting on its result would run the pool one request at a
    time.

    Args:
        nums: Iterable of listing page numbers (as produced by
            ``get_page_num``).

    Returns:
        A dict with ``date`` (today as ``YYYYMMDD``) and ``feastingathome``
        (list of per-recipe dicts from ``get_contents``, in listing order).

    Raises:
        Whatever ``get_links`` / ``get_contents`` raise; the first failure
        propagates to the caller.
    """
    total = dict()
    title_comments = list()

    with ThreadPoolExecutor(max_workers=10) as executor:
        links = [link for page_links in executor.map(get_links, nums) for link in page_links]

        # Progress counter: prints 1, 2, 3, ... as results are collected.
        for done, content in enumerate(executor.map(get_contents, links), start=1):
            title_comments.append(content)
            print(done)

    date = datetime.today().strftime("%Y%m%d")

    total['date'] = date
    total['feastingathome'] = title_comments
    return total

# Run as a script: crawl everything and save the result as JSON
if __name__ == '__main__':
    nums = get_page_num()
    total = get_all_page_comment(nums)

    # Reuse the date stamped into the payload so the file name and the
    # "date" field cannot disagree when the crawl runs past midnight.
    date = total['date']

    with open(f'D:\\fruit_hada\\crawling\\crawling_result\\{date}_feastingathome_all.json', 'w', encoding='utf-8-sig') as file:
        # The file is opened as UTF-8, so write non-ASCII text as-is instead
        # of \uXXXX escapes.
        json.dump(total, file, indent="\t", ensure_ascii=False)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277


In [12]:
# Scratch cell: work out the listing page numbers
url = 'https://www.feastingathome.com/recipes/?fwp_by_diet=vegetarian'
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')
# Take the pagination item just before the "next" link and keep the last
# space-separated token of its text as the final page number.
max_num = (
    soup.find('li', class_='pagination-next')
    .find_previous_sibling()
    .text.rsplit(' ', 1)[-1]
)
nums = [*range(1, int(max_num) + 1)]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]


In [34]:
# Scratch cell: collect the recipe links from listing pages 1 through 50
link_list = []
for i in range(1, 51):
    url = f'https://www.feastingathome.com/recipes/page/{i}/?fwp_by_diet=vegetarian'
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    lis = soup.find_all('div', class_='post-summary__image')
    for li in lis:
        # Each card's first anchor carries the recipe URL
        href = li.find('a').get('href')
        link_list.append(href)
print(link_list)


['https://www.feastingathome.com/drunken-noodles-pad-kee-mao/', 'https://www.feastingathome.com/truffle-mac-and-cheese/', 'https://www.feastingathome.com/vegetarian-thanksgiving-recipes/', 'https://www.feastingathome.com/gnocchi-with-mushrooms/', 'https://www.feastingathome.com/simple-sauteed-mushrooms/', 'https://www.feastingathome.com/homemade-potato-gnocchi/', 'https://www.feastingathome.com/baked-rigatoni-with-butternut-squash/', 'https://www.feastingathome.com/butternut-squash-dinner-rolls/', 'https://www.feastingathome.com/healthy-luscious-potato-leek-soup/', 'https://www.feastingathome.com/zucchini-pasta/', 'https://www.feastingathome.com/vegan-dinner-recipes/', 'https://www.feastingathome.com/egg-bites/', 'https://www.feastingathome.com/rajas-tacos/', 'https://www.feastingathome.com/pad-see-ew/', 'https://www.feastingathome.com/peach-cobbler/', 'https://www.feastingathome.com/zucchini-recipes/', 'https://www.feastingathome.com/mayonnaise-recipe/', 'https://www.feastingathome.co

In [38]:
# Scratch cell: check what gets scraped from a single recipe page.
# It repeats the extraction steps at module level for one fixed URL and
# prints the resulting dict.
url = 'https://www.feastingathome.com/drunken-noodles-pad-kee-mao/'
# NOTE(review): a `global` statement at module level has no effect;
# presumably left over from a function body.
global calories, carbs, protein, total_fat
contents = dict()
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

# Source (site)
# The url defined above is used as-is

# Recipe name (title)
title = soup.find('h1', {'class': 'page-header__title'}).text

# Ingredient list (ingredients): one entry per non-empty text line,
# with non-breaking spaces removed
ingredients_raw = soup.find('div', {'class': 'tasty-recipes-ingredients-body'})
if ingredients_raw:
    ingredients = ingredients_raw.text.split('\n')
    ing_list = list()
    for ing in ingredients:
        # NOTE(review): the emptiness test runs before '\xa0' is removed, so a
        # line containing only '\xa0' is appended as ''.
        if ing == '':
            pass
        else:
            ing_list.append(ing.replace('\xa0', ''))
else:
    ing_list = []

# Cooking time (time): text after "Total Time: " in the details box, '' if absent
cookTime_raw = soup.find('div', {'class': 'tasty-recipes-details'})
if cookTime_raw:
    cookTimeText = cookTime_raw.text
    val = "Total Time:" not in cookTimeText
    if val:
        cookTime = ''
    else:
        cookTime = cookTime_raw.find('li', {'class': 'total-time'}).text.split('Total Time: ')[1]
    cookTime = str(cookTime)
else:
    cookTime = ''

# Serving (serving): text after "Yield: " plus ' serving'; '1 serving' when the
# details box has no yield, '' when there is no details box at all
servings_raw = soup.find('div', {'class': 'tasty-recipes-details'})
if servings_raw:
    servingsText = servings_raw.text
    val = "Yield:" not in servingsText
    if val:
        serving = '1'
    else:
        serving = servings_raw.find('li', {'class': 'yield'}).text.split('Yield: ')[1]
    serving = str(serving) + ' serving'
else:
    serving = ''

# Instructions (recipe): non-empty text lines, then prefixed with "1. ", "2. ", ...
recipes_raw = soup.find('div', {'class': 'tasty-recipes-instructions-body'})
if recipes_raw:
    recipes = recipes_raw.text.split('\n')
    recipes_list = list()
    recipe_list = list()
    for recipe in recipes:
        if recipe == '':
            pass
        else:
            recipes_list.append(recipe.replace('\xa0', ''))
    for i in range(len(recipes_list)):
        recipe_list.append(str(i + 1) + ". " + recipes_list[i])
else:
    recipe_list = []

# Nutrition: scan the nutrition box line by line, once per value
# NOTE(review): calories/carbs/protein/total_fat are only assigned when a
# matching line exists; otherwise the later reads use whatever value (if any)
# is already bound to those names.
nutrition_raw = soup.find('div', {'class': 'tasty-recipes-nutrition'})
if nutrition_raw:
    nutrition_raw_text = nutrition_raw.text.split('\n')
    for nutrition in nutrition_raw_text:
        if 'Calories:' in nutrition:
            if nutrition.split('Calories: ')[1]:
                cal = nutrition.split('Calories: ')[1]
                calories = cal + ' Kcal'
            else:
                calories = ''
        else:
            pass
    for nutrition in nutrition_raw_text:
        if 'Carbohydrates:' in nutrition:
            car = nutrition.split('Carbohydrates: ')[1]
            carbs = car
        else:
            pass
    for nutrition in nutrition_raw_text:
        if 'Protein:' in nutrition:
            prt = nutrition.split('Protein: ')[1]
            protein = prt
        else:
            pass
    for nutrition in nutrition_raw_text:
        # NOTE(review): this is a substring test, so a line such as
        # 'Saturated Fat: ...' would also match and, if it comes later,
        # overwrite total_fat. Confirm against the page's nutrition lines.
        if 'Fat:' in nutrition:
            fat = nutrition.split('Fat: ')[1]
            total_fat = fat
        else:
            pass
else:
    pass

# Comment list (comments) - currently disabled
# comments = soup.select('ol[class=comment-list] > li > article > div[class=comment-content]')
# com_list = list()
# for comment in comments:
#     com_list.append(comment.text.replace('\n', ''))

# Image (image): keep the src of every <img> in the article body, then only
# those whose src contains the site's own URL prefix
images = soup.find('div', {'class': 'entry-content'}).find_all('img')
imgs_list = list()
img_list = list()
for image in images:
    if image.get('src') == None:
        pass
    else:
        imgs_list.append(image.get('src'))
for img in imgs_list:
    if 'https://www.feastingathom' in img:
        img_list.append(img)

contents['site'] = url
contents['title'] = title
contents['ingredients'] = ing_list
contents['time'] = cookTime
contents['serving'] = serving
contents['recipe'] = recipe_list
contents['calories'] = calories
contents['carbs'] = carbs
contents['protein'] = protein
contents['total_fat'] = total_fat
# contents['comments'] = com_list
# Only the first matching image is stored; raises IndexError when none matched
contents['image'] = img_list[0]

print(contents)

{'site': 'https://www.feastingathome.com/drunken-noodles-pad-kee-mao/', 'title': 'Drunken Noodles (Pad Kee Mao)', 'ingredients': ['6 ounces dried wide rice noodles', '2 tablespoons high-heat oil- peanut or coconut', '4 garlic cloves, rough chopped', '1–2 fresh Thai chilis, finely minced', '6–8 oz chicken, thinly sliced (or sub shrimp, and see notes for crispy tofu)', '1/2 cup onion,  sliced long and thin', '1/2 cup red bell peppers, sliced long and thin', 'Optional: 1/3 cup jalapeños, cut into long thin strips', 'Optional: 1/2 cup Roma tomato, sliced in to long wedges,or sub other quick cooking vegetable', '1 cup basil, Thai or holy basil', 'Drunken Noodle Sauce:', '2 tablespoons oyster sauce', '1 tablespoon fish sauce', '1 1/2 tablespoon soy sauce', '1 tablespoon sweet dark soy sauce or sub molasses', '2 tablespoons sugar, brown sugar, agave, or honey', '1/2 tablespoon rice vinegar or lime juice', ''], 'time': '35', 'serving': '2-3 1x serving', 'recipe': ['1. Prepare the noodles, acco