# Ingredient taxonomy

In [1]:

%load_ext autoreload
%autoreload 2

Created using https://github.com/nichtich/wikidata-taxonomy

In [2]:
#!wdtaxonomy Q2095 -P 279,31,1647 -i -f tsv -o data/wikitax_ingredients.tsv -d

In [3]:
import time
import re
import numpy as np
from aiohttp import ContentTypeError
import pandas as pd
import asyncio
import aiohttp
from tqdm.asyncio import tqdm_asyncio
from tqdm.auto import tqdm
import json
from llm_food_taxonomy.graph.construction import tree_from_leaves
from llm_food_taxonomy.data.utils import get_ancestry_df

In [4]:
tqdm.pandas()

In [5]:
with open("../data/wikitax/wikitax_ingredients.tsv", "r") as f:
    cols = [c.strip() for c in f.readline().split(",")]

tax = pd.read_table("../data/wikitax/wikitax_ingredients.tsv", skiprows=1, header=None)
tax.columns = cols
tax = tax.dropna(subset=["label"])
tax.head()

Unnamed: 0,level,id,label,sites,instances,parents
0,,Q2095,food,220,0,^^
1,-,Q27773,common sole,0,0,
2,-,Q133017,Chlorella,0,0,
3,-,Q143359,Capelin,0,0,
4,-,Q167692,monjayaki,16,0,


In [6]:
tax.shape

(39837, 6)

In [7]:
tax[tax.sites > 0].shape

(29253, 6)

In [8]:
tax[~(tax.sites > 0)].sample(10)

Unnamed: 0,level,id,label,sites,instances,parents
20918,---,Q107286654,Begun diye kajoli machher jhol,0,0,
30986,-----,Q105671616,Bhetki machher kanta chorchori,0,0,
41074,----,Q114348085,hay cheese,0,0,
15376,------,Q118819242,Planargia,0,0,
53469,----,Q11342956,Milmake,0,0,
36462,---,Q3439845,Roigebrageldi,0,0,
4554,----,Q111315249,White Gummy Bear Shot,0,0,
52399,--,Q107262465,Myroxylon seed,0,0,
18325,----,Q110523094,Molossolini,0,0,
28575,--,Q116245539,Dish (coppa umbonata),0,0,


In [9]:
tax.loc[tax[tax.level.isna()].index, "level"] = ""

In [10]:
tax[tax.label == "fruit"]

Unnamed: 0,level,id,label,sites,instances,parents
44926,-,Q3314483,fruit,85,0,^^^^
54450,--,Q3314483,fruit,85,0,^^^^


In [11]:
tax.shape

(39837, 6)

In [12]:
tax.shape

(39837, 6)

In [13]:
semaphore = asyncio.Semaphore(5)

async def get_desc(qid):
    try:
        async with semaphore, aiohttp.ClientSession() as session:
            headers = {'content-type': 'application/json'}
            async with session.get(f'https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/{qid}/descriptions/en',
                                   headers=headers) as resp:
                r = await resp.json()
                return qid, r
    except ContentTypeError:
        time.sleep(3)
        return await get_desc(qid)


In [14]:
r = await get_desc("Q3314483")
r[1]

'typically sweet and/or sour, edible part/s of a plant that resembles seed-bearing fruit'

In [15]:
qid_to_desc = {}

for cr in tqdm_asyncio.as_completed([get_desc(qid) for qid in tax.id.values.tolist()],
                                     desc="Generating ancestries", total=len(tax)):
    qid, desc = await cr
    qid_to_desc[qid] = desc

Generating ancestries: 100%|██████████| 39837/39837 [44:59<00:00, 14.76it/s]  


In [16]:
qid_to_desc

{'Q41775462': 'apple cultivar',
 'Q10437228': 'pastry',
 'Q10438086': {'code': 'resource-not-found',
  'message': 'The requested resource does not exist',
  'context': {'resource_type': 'description'}},
 'Q107246629': 'purée made of hemp',
 'Q41775511': 'apple cultivar',
 'Q107246631': 'blood of huso',
 'Q41775519': 'apple cultivar',
 'Q107246632': 'broth made from isinglass',
 'Q10542408': 'pastry eaten during coffee breaks',
 'Q29062003': 'type of Swedish cake with butter and almond glaze',
 'Q41775524': 'apple cultivar',
 'Q10659975': 'pastry containing saffron',
 'Q107246633': 'broth made from juniper berries',
 'Q41775531': 'apple cultivar',
 'Q107246634': 'juice from juniper berry',
 'Q107246635': 'blood from lamb',
 'Q41775546': 'apple cultivar',
 'Q112247828': 'spiced yeast-leavened sweet bun with saffron and currants from Southern England',
 'Q41775553': 'apple cultivar',
 'Q260929': "Scandinavian dessert traditionally eaten on Saint Lucy's Day (13 Dec.), containing saffron an

In [17]:
tax["desc"] = tax.id.progress_apply(lambda qid: qid_to_desc[qid])

  0%|          | 0/39837 [00:00<?, ?it/s]

In [18]:
descs = tax[["label", "desc"]]
descs.head()

Unnamed: 0,label,desc
0,food,any substance consumed to provide nutritional ...
1,common sole,flatfish
2,Chlorella,genus of algae
3,Capelin,species of fish
4,monjayaki,Japanese savory pancake


In [19]:
def clean_node_name(name):
    name = name.split(":")[-1]
    name = name.strip()
    name = name.replace("-", " ")
    name = name.replace(",", "")
    name = re.sub('[^A-Za-z0-9À-Ÿ :]+', '', name)
    name = " ".join(name.split("\t"))
    name = re.sub(r"\s{2,}", " ", name)
    return name.lower()

In [20]:
descs.label = descs.label.apply(clean_node_name)
descs.desc = descs.desc.apply(lambda x: x if isinstance(x, str) else np.nan)
descs = descs.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  descs.label = descs.label.apply(clean_node_name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  descs.desc = descs.desc.apply(lambda x: x if isinstance(x, str) else np.nan)


In [21]:
descs.to_csv("../data/wikitax/raw_desc.csv")

In [22]:
descs.sample(10)

Unnamed: 0,label,desc
36116,fagioli alluccelletto,Italian dish
46808,old fred,apple cultivar
54339,basic methacrylate copolymer,polymer used as a food additive
45970,joseph musch,apple cultivar
41842,boulette davesnes,cheese
28344,chicken burrito,dish
36323,achappam,deep fried cookie made with rice flour
33013,dendeng baracik,a type of Indonesian dendeng
19501,sasanishiki,Japanese rice
46980,rewen,apple cultivar


In [23]:
taxonomy = {}
subtree = taxonomy
prev_level = 0
prev_ing = None
prev_row = None
ancestors = []

def get_subtree(ancestors, taxonomy):
    st = taxonomy
    for anc in ancestors:
        st = st[anc]
    return st

for i, row in tqdm(tax.iterrows(), total=len(tax)):
    lvl = len(row["level"]) if i != 0 else 0
    ing = row["label"]
    if i == 0:
        subtree[ing] = {}
    if lvl > prev_level:
        subtree = get_subtree(ancestors, taxonomy)
        ancestors.append(prev_ing)
        subtree[prev_ing] = {}
    elif lvl == prev_level:
        subtree[prev_ing] = {}
    elif lvl < prev_level:
        n_up = prev_level - lvl
        for _ in range(n_up):
            if len(ancestors) > 0:
                ancestors.pop()
        subtree[prev_ing] = {}
        subtree = get_subtree(ancestors, taxonomy)
    else:
        raise ValueError("Something went wrong")

    prev_ing = ing
    prev_level = lvl
    prev_row = row

  0%|          | 0/39837 [00:00<?, ?it/s]

In [24]:
list(taxonomy.keys())

['food',
 None,
 'common sole',
 'Chlorella',
 'Capelin',
 'monjayaki',
 'fish and chips',
 'milk',
 'Coussin de Lyon',
 'meat',
 'Avena sativa',
 'ladyfinger',
 'shashlik',
 'omelette',
 'Omelette Arnold Bennett',
 'rotisserie',
 'Zwiebelkuchen',
 'Cucumis sativus',
 'batter',
 'Tarta de Santiago',
 'watermelon',
 'seedless watermelon',
 'Syracuse watermelon',
 'square watermelon',
 'Anguria Reggiana',
 'Obanazawa watermelon',
 'drink',
 'Actimel',
 'bitter lemon',
 'Appletiser',
 'sima',
 'Posca',
 'Rivella',
 'sugarcane juice',
 'Coca-Cola',
 'New Coke',
 'Coca-Cola Vanilla',
 'Coca-Cola Zero',
 'Coca-Cola Black Cherry Vanilla',
 'Coca-Cola Citra',
 'Coca-Cola Plus GreenTea',
 'Coca-Cola Orange',
 'Caffeine-Free Coca-Cola',
 'Coca-Cola Plus',
 'New Coke conspiracy theories',
 'Coca-Cola Starlight',
 'drinking water',
 'Lourdes water',
 'in Jelly',
 'Kropla Beskidu',
 'Font de la plaça del Repartidor',
 'Fountain in the Place Dorcière',
 'fountain farmhouse Elfenau',
 'fountain Oberb

In [25]:
taxonomy["meat"]

{'beef': {},
 'beef thymus': {},
 'beef pancreas': {},
 'beef kidney': {},
 'Boned Chunk': {},
 'Tafelspitz': {},
 'beef shank': {},
 'filet mignon': {},
 'standing rib roast': {},
 'corned beef': {},
 'Lonzu': {},
 'oyster': {},
 'Carne de Ávila': {},
 'Cecina de León': {},
 'meat scientist': {},
 'suya': {},
 'Darkcutter': {},
 'buffalo meat': {},
 'Goat meat pepper soup': {},
 'hare meat': {},
 'Carne de Morucha de Salamanca': {},
 'moose meat': {},
 'Lakefleisch': {},
 'emu meat': {},
 'elk meat': {},
 'sika deer meat': {},
 'axis deer meat': {},
 'llama meat': {},
 'white river crayfish meat': {},
 'Atlantic angel shark meat': {},
 'pheasant meat': {},
 'Pacific oyster meat': {},
 'Russian sturgeon meat': {},
 'giant abalone meat': {},
 'bull shark meat': {},
 'nutria meat': {},
 'delta smelt meat': {},
 'Atlantic emperor meat': {},
 'giant boarfish meat': {},
 'blue marlin meat': {},
 'rock sole meat': {},
 'lobster meat': {},
 'muskrat meat': {},
 'coho salmon meat': {},
 'groun

## Clean up taxonomy

In [26]:
recipes = pd.read_json("../data/recipes/recipe1m_plus_det_ingrs.json")
recipes = recipes[["ingredients"]]
recipes.ingredients = recipes.ingredients.progress_apply(lambda x: [i["text"] for i in x if "text" in i])
recipes.head()

  0%|          | 0/1029720 [00:00<?, ?it/s]

Unnamed: 0,ingredients
0,"[penne, cheese sauce, cheddar cheese, gruyere ..."
1,"[elbow macaroni, American cheese, celery, gree..."
2,"[tomatoes, kosher salt, red onion, green bell ..."
3,"[milk, water, butter, mashed potatoes, whole k..."
4,"[watermelon gelatin, boiling water, Cool Whip,..."


In [27]:
ingredients = recipes.explode("ingredients").dropna()
ingredients = ingredients.value_counts().reset_index()
ingredients.columns = ["label", "ingredient_count"]
ingredients.head()

Unnamed: 0,label,ingredient_count
0,salt,362348
1,butter,237604
2,sugar,217878
3,olive oil,169619
4,water,158509


In [28]:
ingredients = ingredients[ingredients.ingredient_count > 10]
ingredients.shape

(10959, 2)

In [29]:
ingredients.sample(10)

Unnamed: 0,label,ingredient_count
4432,low - sodium tamari,92
430,mint,3277
8792,lime twist,18
4015,prepared polenta,111
1020,pecorino cheese,1065
6962,blackberry schnapps,33
4597,halibut fillet,85
4453,dry champagne,92
8994,1 whole Lemon Thinly Sliced,17
8281,peanut butter cup,21


In [30]:
taxonomy = {"food": taxonomy}

In [31]:
df = get_ancestry_df(taxonomy, sep=";")
print(df.shape)
df.head()

(23490, 3)


Unnamed: 0,leaf,ancestry,level
0,'Mpanatigghi,food;'Mpanatigghi,2
1,'Nduja,food;food product;meat product;processed meat;...,7
2,'Nzuddi,food;'Nzuddi,2
3,'O pere e 'o musso,food;dish;fast food;street food;'O pere e 'o m...,5
4,'Ohana Bread Pudding,food;dessert;pudding;'Ohana Bread Pudding,4


In [32]:
df = df[df.leaf.isin(ingredients.label)]
df.shape

(1083, 3)

In [33]:
df.leaf.unique().shape[0] == df.leaf.shape[0]

True

In [34]:
df.leaf.unique().shape[0]

1083

In [35]:
cleaned_taxonomy = tree_from_leaves(df, sep=";", min_leaf_depth=3)

In [36]:
cleaned_taxonomy

{'0:food': {'1:food product': {'2:cheese': {'3:processed cheese': {'4:American cheese': {}},
    '8:pressed cheese': {'9:Appenzeller cheese': {},
     '31:Cotija cheese': {},
     '955:Bergkäse': {'956:raclette cheese': {}}},
    '81:French cheese': {'82:Philadelphia Cream Cheese': {},
     '568:fromage blanc': {}},
    '86:mould cheese': {'87:Roquefort cheese': {},
     '93:Stilton cheese': {},
     '243:blue cheese': {},
     '616:gorgonzola': {}},
    '267:brick cheese': {},
    "357:cow's-milk cheese": {'358:cheese curds': {}, '458:cream cheese': {}},
    '361:cheese spread': {},
    '449:fresh cheese': {'450:cottage cheese': {},
     '569:fromage frais': {},
     '776:mascarpone': {}},
    '544:farmer cheese': {},
    '547:mixed milk cheese': {'548:feta': {}},
    '653:hard cheese': {},
    '763:manchego cheese': {},
    '902:pasta filata': {'903:pizza cheese': {}},
    '953:queso blanco': {},
    '992:whey cheese': {'993:ricotta salata': {}},
    '1057:smoked cheese': {}},
   '25

In [37]:
with open("../data/wikitax/wikitax_ingredients_taxonomy.json", "w") as f:
    json.dump(cleaned_taxonomy, f, indent=4)