# Ingredient taxonomy

In [None]:

%load_ext autoreload
%autoreload 2

The raw taxonomy file was created with the `wdtaxonomy` command-line tool: https://github.com/nichtich/wikidata-taxonomy

In [None]:
#!wdtaxonomy Q2095 -P 279,31,1647 -i -f tsv -o data/wikitax_ingredients.tsv -d

In [None]:
import time
import re
import numpy as np
from aiohttp import ContentTypeError
import pandas as pd
import asyncio
import aiohttp
from tqdm.asyncio import tqdm_asyncio
from tqdm.auto import tqdm
import json
from llm_food_taxonomy.graph.construction import tree_from_leaves
from llm_food_taxonomy.data.utils import get_ancestry_df

In [None]:
tqdm.pandas()

In [None]:
# The first line of the file holds the column names. It is split on commas
# here, while the remaining lines are parsed as a tab-separated table below.
with open("../data/wikitax/wikitax_ingredients.tsv", "r") as f:
    cols = [c.strip() for c in f.readline().split(",")]

# Re-read the file without its first line and attach the column names
# parsed above.
tax = pd.read_table("../data/wikitax/wikitax_ingredients.tsv", skiprows=1, header=None)
tax.columns = cols
# Discard rows that have no label.
tax = tax.dropna(subset=["label"])
tax.head()

In [None]:
tax.shape

In [None]:
tax[tax.sites > 0].shape

In [None]:
tax[~(tax.sites > 0)].sample(10)

In [None]:
# Rows with a missing level get an empty string instead, so that len() can be
# applied to every value of the level column.
tax.loc[tax.level.isna(), "level"] = ""

In [None]:
tax[tax.label == "fruit"]

In [None]:
tax.shape

In [None]:
tax.shape

In [None]:
# Caps the number of requests that may be in flight at the same time.
semaphore = asyncio.Semaphore(5)


async def get_desc(qid):
    """Fetch the English description of a Wikidata item.

    Args:
        qid: Wikidata item identifier, e.g. ``"Q3314483"``.

    Returns:
        A ``(qid, payload)`` tuple, where ``payload`` is the decoded JSON
        body of the response.

    The request is repeated every 3 seconds, without an upper bound, for as
    long as decoding the response raises ``ContentTypeError``.
    """
    url = f'https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/{qid}/descriptions/en'
    headers = {'content-type': 'application/json'}
    while True:
        try:
            async with semaphore, aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers) as resp:
                    return qid, await resp.json()
        except ContentTypeError:
            # Wait with asyncio.sleep rather than time.sleep: a blocking sleep
            # would freeze the event loop and stall every other pending
            # request. The semaphore has already been released at this point.
            await asyncio.sleep(3)


In [None]:
r = await get_desc("Q3314483")
r[1]

In [None]:
qid_to_desc = {}

for cr in tqdm_asyncio.as_completed([get_desc(qid) for qid in tax.id.values.tolist()],
                                     desc="Generating ancestries", total=len(tax)):
    qid, desc = await cr
    qid_to_desc[qid] = desc

In [None]:
qid_to_desc

In [None]:
# Attach the fetched payload to every row; a QID that is missing from
# qid_to_desc raises KeyError.
tax["desc"] = tax.id.progress_apply(lambda qid: qid_to_desc[qid])

In [None]:
# Take an explicit copy: the columns of `descs` are reassigned further down,
# and doing that on a bare selection of `tax` triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified.
descs = tax[["label", "desc"]].copy()
descs.head()

In [None]:
def clean_node_name(name):
    """Normalise a node label to a lower-case, space-separated string.

    Steps, in order:
      1. keep only the part after the last ``:``;
      2. turn tabs and hyphens into spaces and delete commas;
      3. delete every character that is not an ASCII letter or digit, a
         character in the ``À-Ÿ`` range, a space or a colon;
      4. collapse runs of whitespace into a single space;
      5. trim the ends and lower-case the result.

    Args:
        name: The raw label.

    Returns:
        The cleaned label; an empty string if nothing is left.
    """
    name = name.split(":")[-1]
    # Tabs must become spaces *before* the character filter below, which
    # would otherwise delete them and glue the neighbouring words together.
    name = name.replace("\t", " ").replace("-", " ").replace(",", "")
    name = re.sub(r'[^A-Za-z0-9À-Ÿ :]+', '', name)
    name = re.sub(r"\s{2,}", " ", name)
    # Trim last, so that spaces produced by the replacements above (e.g. from
    # a leading or trailing hyphen) do not survive at the edges.
    return name.strip().lower()

In [None]:
# Normalise every label with clean_node_name.
descs.label = descs.label.apply(clean_node_name)
# Keep only descriptions that are plain strings; any other value becomes NaN ...
descs.desc = descs.desc.apply(lambda x: x if isinstance(x, str) else np.nan)
# ... and rows containing a missing value are dropped.
descs = descs.dropna()

In [None]:
descs.to_csv("../data/wikitax/raw_desc.csv")

In [None]:
descs.sample(10)

In [None]:
# Nested-dict form of the taxonomy: every key is a label and its value is the
# dict holding that label's children.
taxonomy = {}
# Labels on the path from the top level down to the most recently inserted
# row, together with the depth of each of those labels.
ancestors = []
ancestor_levels = []


def get_subtree(ancestors, taxonomy):
    """Return the nested dict reached by following `ancestors` from the top of `taxonomy`.

    Raises KeyError if one of the labels is not present on the way down.
    """
    st = taxonomy
    for anc in ancestors:
        st = st[anc]
    return st


for _, row in tqdm(tax.iterrows(), total=len(tax)):
    level = row["level"]
    # The depth of a row is the length of its level marker; a missing
    # (non-string) marker is treated as the top level.
    lvl = len(level) if isinstance(level, str) else 0
    ing = row["label"]

    # Drop every path entry that is not strictly above this row, so that
    # `ancestors` ends at the row's parent.
    while ancestor_levels and ancestor_levels[-1] >= lvl:
        ancestor_levels.pop()
        ancestors.pop()

    # Insert the row into its parent's dict right away. setdefault keeps the
    # children already collected when the same label shows up again under
    # the same parent.
    subtree = get_subtree(ancestors, taxonomy)
    subtree.setdefault(ing, {})

    # The row becomes the candidate parent of the rows that follow.
    ancestors.append(ing)
    ancestor_levels.append(lvl)

In [None]:
list(taxonomy.keys())

In [None]:
taxonomy["meat"]

## Clean up taxonomy

In [None]:
# Load the recipe file and keep only its "ingredients" column.
recipes = pd.read_json("../data/recipes/recipe1m_plus_det_ingrs.json")
recipes = recipes[["ingredients"]]
# Reduce every ingredient entry to its "text" value; entries without a "text"
# key are skipped.
recipes.ingredients = recipes.ingredients.progress_apply(lambda x: [i["text"] for i in x if "text" in i])
recipes.head()

In [None]:
# One row per ingredient mention (missing values removed) ...
ingredients = recipes.explode("ingredients").dropna()
# ... then count how many times each distinct ingredient occurs.
ingredients = ingredients.value_counts().reset_index()
ingredients.columns = ["label", "ingredient_count"]
ingredients.head()

In [None]:
# Keep only the ingredients that occur more than 10 times.
ingredients = ingredients[ingredients.ingredient_count > 10]
ingredients.shape

In [None]:
ingredients.sample(10)

In [None]:
# Nest the whole taxonomy under a single "food" key.
taxonomy = {"food": taxonomy}

In [None]:
# get_ancestry_df is a project helper whose implementation is not visible here.
# NOTE(review): presumably it yields one row per leaf with its ancestor path
# joined by `sep` -- confirm against llm_food_taxonomy.data.utils.
df = get_ancestry_df(taxonomy, sep=";")
print(df.shape)
df.head()

In [None]:
# Keep only the rows whose leaf also appears among the recipe ingredient labels.
df = df[df.leaf.isin(ingredients.label)]
df.shape

In [None]:
# True when the number of distinct leaves equals the number of rows, i.e. no
# leaf occurs twice.
df.leaf.unique().shape[0] == df.leaf.shape[0]

In [None]:
df.leaf.unique().shape[0]

In [None]:
# tree_from_leaves is a project helper whose implementation is not visible here.
# NOTE(review): presumably it rebuilds a nested taxonomy from the remaining
# leaves; the exact effect of min_leaf_depth=3 should be confirmed against
# llm_food_taxonomy.graph.construction.
cleaned_taxonomy = tree_from_leaves(df, sep=";", min_leaf_depth=3)

In [None]:
cleaned_taxonomy

In [None]:
# Write the cleaned taxonomy to disk as JSON, indented by 4 spaces.
with open("../data/wikitax/wikitax_ingredients_taxonomy.json", "w") as f:
    json.dump(cleaned_taxonomy, f, indent=4)