In [8]:
# supabaseに接続して、食材を（XXXg）とかから切り離して、カタカナに直す対応。

from supabase import create_client, Client
from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
import os, requests, time, json, isodate, datetime
from tqdm import tqdm


# ----------------------------------------------------------------
# Supabaseの接続情報（git操作の際は隠す）
# ----------------------------------------------------------------
load_dotenv()
url: str = os.getenv("VITE_SUPABASE_URL")
key: str = os.getenv("VITE_SUPABASE_ANON_KEY")

supabase: Client = create_client(url, key)

In [13]:
def findStructuredData(soup: BeautifulSoup) -> dict:
    """構造化データを取得する

    Parameters
    ----------
    soup : BeautifulSoup

    Returns
    -------
    structured_data : dict
    """
    structured_data = {}
    script_tags = soup.find_all('script', type='application/ld+json')
    for script_tag in script_tags:
        try:
            json_data = json.loads(script_tag.string)
            # レシピの構造化データを探す
            if isinstance(json_data, list):
                for item in json_data:
                    if item.get('@type') == 'Recipe':
                        structured_data = item
                        break
            elif isinstance(json_data, dict):
                if json_data.get('@type') == 'Recipe':
                    structured_data = json_data
                    break

        except json.JSONDecodeError:
            # JSON のパースに失敗した場合は無視する
            pass
    return structured_data


for idx in tqdm(range(40000, 70000, 2400)):
    recipes = supabase.table("Recipes").select("*").gte("id", idx).lte("id", idx+2400).execute()

    # recipes = supabase.table("Recipes").select("*").gte("id", 20000).lte("id", 20000 + 2400).execute()
    recipes = recipes.data

    for recipe in recipes:
        try:
            request = requests.get(recipe["sourceUrl"], headers={"User-Agent": "Mozilla/5.0"})
            time.sleep(1)
            assert request.status_code==200
            soup = BeautifulSoup(request.content, "html.parser")

            data = findStructuredData(soup)
            if data == {}:
                print(recipe["id"], "extract error")
                continue

            supabase.table("Recipes").update(
                {
                    "category": data.get("recipeCategory"),
                    "cuisine": data.get("recipeCuisine"),
                }
            ).eq("id", recipe["id"]).execute()
            if "recipeCategory" in data:
                print(recipe["id"], "success")
        except AssertionError:
            print(recipe["id"], "status code error")
            continue
        except requests.exceptions.RequestException:
            print(recipe["id"], "request error")
            continue
    


  0%|          | 0/13 [00:00<?, ?it/s]

40003 extract error
40004 extract error
40008 extract error
40010 extract error
40015 extract error
40018 extract error
40019 extract error
40020 extract error
40021 extract error
40023 extract error
40029 extract error
40030 extract error
40031 extract error
40032 extract error
40033 extract error
40034 extract error
40042 extract error
40046 extract error
40049 extract error
40050 extract error
40051 extract error
40053 extract error
40064 extract error
40069 extract error
40073 extract error
40079 extract error
40081 extract error
40100 extract error
40104 extract error
40114 extract error
40117 extract error
40120 extract error
40126 extract error
40128 extract error
40129 extract error
40131 extract error
40132 extract error
40163 extract error
40170 extract error
40173 extract error
40175 extract error
40176 extract error
40177 extract error
40179 extract error
40189 extract error
40193 extract error
40194 extract error
40195 extract error
40197 extract error
40198 extract error


 15%|█▌        | 2/13 [1:59:37<11:01:14, 3606.76s/it]

46793 success
46794 success
46795 success
46796 success
46797 success
46798 success
46799 success
46800 success
46801 status code error
46802 success
46803 success
46804 success
46805 success
46806 success
46807 success
46808 success
46809 success
46810 success
46811 success
46812 success
46813 success
46814 success
46815 success
46816 success
46817 success
46818 success
46819 success
46820 success
46821 success
46822 success
46823 success
46824 success
46825 success
46826 success
46827 success
46828 success
46829 success
46830 success
46831 success
46832 success
46834 success
46835 status code error
46836 success
46837 success
46838 success
46839 success
46840 success
46842 success
46843 success
46844 success
46845 success
46846 success
46847 success
46848 success
46849 success
46850 success
46851 success
46852 success
46853 success
46854 success
46855 success
46856 success
46857 success
46858 success
46859 success
46860 success
46861 success
46862 success
46863 success
46864 success


 23%|██▎       | 3/13 [3:01:34<10:09:29, 3656.99s/it]

47200 success
47200 success
47201 success
47202 success
47203 success
47204 success
47205 success
47206 success
47207 success
47208 success
47212 success
47213 status code error
47217 success
47218 success
47219 success
47220 success
47221 success
47222 success
47223 success
47224 success
47225 success
47226 success
47228 success
47230 success
47231 success
47232 success
47233 success
47234 success
47235 success
47236 success
47237 success
47238 success
47240 success
47241 success
47242 success
47243 success
47244 success
47245 success
47247 success
47248 success
47253 success
47254 success
47255 success
47256 success
47257 success
47258 success
47259 success
47260 success
47261 success
47262 success
47263 success
47265 success
47267 success
47268 success
47269 success
47270 success
47271 success
47272 success
47273 success
47274 success
47275 success
47276 success
47277 success
47286 success
47288 success
47289 success
47290 success
47291 success
47299 success
47300 status code error


 31%|███       | 4/13 [3:37:56<7:41:12, 3074.69s/it] 

49590 success
49605 success
49624 success
49690 success
49694 success
49697 success
49718 success
49754 success
49787 success
49794 success
49806 success
49808 success
49813 success
49819 success
49820 success
49824 success
49825 success
49841 status code error
49842 success
49848 success
49858 success
49860 success
49862 success
49874 success
49881 success
49882 success
49883 success
49884 success
49885 success
49886 success
49890 success
49891 success
49892 success
49896 status code error
49897 success
49914 status code error
49927 status code error
49928 success
49930 success
49931 success
49933 success
49935 success
49939 success
49940 status code error
49942 success
49943 success
49945 success
49961 success
49965 success
49966 success
49967 success
49968 success
49980 success
49993 success
50008 success
50011 success
50012 success
50014 success
50024 success
50025 success
50026 success
50029 success
50030 success
50031 success
50048 success
50056 success
50061 success
50070 succes

 38%|███▊      | 5/13 [3:52:30<5:04:07, 2280.96s/it]

52000 extract error
52000 extract error
52001 extract error
52002 extract error
52003 success
52004 extract error
52005 success
52006 success
52007 success
52008 success
52009 extract error
52010 extract error
52011 success
52012 extract error
52013 extract error
52014 extract error
52015 extract error
52016 extract error
52017 extract error
52018 extract error
52019 extract error
52020 success
52021 extract error
52022 success
52023 extract error
52024 extract error
52025 extract error
52026 extract error
52027 success
52028 success
52029 success
52030 success
52031 success
52032 success
52033 success
52034 success
52035 extract error
52036 extract error
52037 success
52038 success
52039 success
52040 extract error
52041 success
52042 success
52043 success
52044 extract error
52045 success
52046 success
52047 extract error
52048 extract error
52049 success
52050 success
52051 extract error
52052 extract error
52053 extract error
52054 success
52055 extract error
52056 extract error
52

 46%|████▌     | 6/13 [4:47:30<5:06:33, 2627.70s/it]

54494 status code error
54615 status code error
54764 status code error
54766 status code error
54769 status code error
54772 status code error
54774 status code error
54778 status code error
54873 status code error
55047 status code error
55114 status code error
55115 status code error
55116 status code error
55117 status code error
55118 status code error
55119 status code error
55122 status code error
55180 status code error
55311 status code error
55317 status code error
55439 status code error
55475 status code error
55514 status code error
55515 status code error
55608 status code error
55808 status code error
55910 status code error
55919 status code error
55944 status code error
56071 status code error
56073 status code error
56191 status code error
56474 status code error
56585 status code error
56603 status code error
56747 status code error
56759 status code error


 54%|█████▍    | 7/13 [5:42:03<4:43:50, 2838.41s/it]

56903 status code error
56990 status code error
56993 status code error
57006 status code error
57008 status code error
57026 status code error
57308 status code error
57394 status code error
57464 status code error
57523 status code error
57536 status code error
57560 status code error
57647 status code error
57648 status code error
57720 status code error
57738 status code error
57783 status code error
57843 status code error
57894 status code error
57932 status code error
58000 status code error
58060 status code error
58095 status code error
58125 status code error
58222 status code error
58295 success
58296 success
58297 success
58298 success
58299 success
58300 success
58301 success
58302 success
58303 success
58304 success
58305 success
58306 success
58307 success
58308 success
58309 success
58310 success
58311 success
58312 success
58313 success
58314 success
58315 success
58316 success
58317 success
58318 success
58319 success
58320 success
58321 success
58322 success
58323 su

 62%|██████▏   | 8/13 [6:39:44<4:13:04, 3036.84s/it]

59200 success
59200 success
59201 success
59202 success
59203 success
59204 success
59205 success
59206 success
59207 success
59208 success
59209 success
59210 success
59211 success
59212 success
59213 success
59214 success
59215 success
59216 success
59217 success
59218 success
59219 success
59220 success
59221 success
59222 success
59223 success
59224 success
59225 success
59226 success
59227 success
59228 success
59229 success
59230 success
59231 success
59232 success
59233 success
59234 success
59235 success
59236 success
59237 success
59238 success
59239 success
59240 success
59241 success
59242 success
59243 success
59244 success
59245 success
59246 success
59247 success
59248 success
59249 success
59250 success
59251 success
59252 success
59253 success
59254 success
59255 success
59256 success
59257 success
59258 success
59259 success
59260 success
59261 success
59262 success
59263 success
59264 success
59265 success
59266 success
59267 success
59268 success
59269 success
59270 

 69%|██████▉   | 9/13 [7:40:22<3:34:58, 3224.55s/it]

61600 success
63737 success
63740 success
63738 success
63739 success
63741 success
63742 success
63746 success
63744 success
63747 success
63748 success
63750 success
63751 success
63752 success
63753 success
63754 success
63756 success
63758 success
63759 success
63760 success
63761 success
62099 success
62380 success
61949 success
62918 success
63569 success
63169 success
63943 success
63944 success
63945 success
61906 success
63917 success
62026 success
62381 success
62027 success
63949 success
63950 success
63951 success
63539 success
62427 success
62028 success
62472 success
61895 success
62108 success
62897 success
62475 success
62695 success
61822 success
63947 success
62109 success
63946 success
62865 success
63948 success
61823 success
61828 success
61880 success
62831 success
62883 success
62885 success
62110 success
62120 success
62138 success
63144 success
61881 success
62152 success
63509 success
61882 success
62161 status code error
62162 success
61907 success
62163 succ

 77%|███████▋  | 10/13 [8:40:18<2:46:58, 3339.38s/it]

63801 success
64000 success
64001 success
64002 success
64003 success
64004 success
64005 success
64006 success
64007 success
64008 success
64009 success
64010 success
64011 success
64012 success
64013 success
64014 success
64015 success
64016 success
64017 success
64018 success
64019 success
64020 success
64021 success
64022 success
64023 success
64024 success
64025 success
64026 success
64027 success
64028 success
64029 success
64030 success
64031 success
64032 success
64033 success
64034 success
64035 success
64036 success
64037 success
64038 success
64039 success
64040 success
64041 success
64042 success
64043 success
64044 success
64045 success
64046 success
64047 success
64048 success
64049 success
64050 success
64051 success
64052 success
64053 success
64054 success
64055 success
64056 success
64057 success
64058 success
64059 success
64060 success
64061 status code error
64062 success
64063 success
64064 success
64065 success
64066 success
64067 success
64068 success
64069 succ

 92%|█████████▏| 12/13 [8:53:41<29:43, 1783.26s/it]  

64546 success


100%|██████████| 13/13 [8:53:41<00:00, 2463.17s/it]


In [4]:
len(recipes)

2401