In [None]:
import requests
"""
The query is:

"""

query = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX prov: <http://www.w3.org/ns/prov#>
PREFIX pr: <http://www.wikidata.org/prop/reference/>
SELECT DISTINCT ?wikidata_id ?wikipedia_id ?image ?sitelinks ?name ?parent ?parentPref ?taxRankName WHERE {
  #BIND(wd:Q13446570 AS ?wikidata_id)
  {?wikidata_id wdt:P171 ?parent. } UNION {BIND( wd:Q2382443 AS ?wikidata_id)} # either it has a parent or it is a root
  OPTIONAL {?wikidata_id wdt:P18 ?image.}
  OPTIONAL {?wikidata_id wdt:P105 ?taxRank.
  ?taxRank @en@rdfs:label ?taxRankName .
  }
  OPTIONAL { # this is a particular paper we trust for the high-level hierarchy
  ?wikidata_id <http://www.wikidata.org/prop/P171> ?parentPref.
  ?parentPref prov:wasDerivedFrom ?x.
  ?x <http://www.wikidata.org/prop/reference/P248> wd:Q19858624.
  }
  OPTIONAL {
  ?wikipedia_id schema:inLanguage "en" .
  ?wikipedia_id schema:about ?wikidata_id.
  ?wikidata_id ^schema:about/wikibase:sitelinks ?sitelinks .
  # must be wikipedia
  ?wikipedia_id schema:isPartOf <https://en.wikipedia.org/> .
  }
  ?wikidata_id @en@rdfs:label ?name .
}
"""
url = "https://qlever.cs.uni-freiburg.de/api/wikidata?query=" + requests.utils.quote(query) + "&action=tsv_export"
r = requests.get(url)
r.raise_for_status()
with open("data.tsv", "w") as f:
    f.write(r.text)



In [1]:

import pandas as pd
headings = ["wikidata_id", "wikipedia_id", "image", "sitelinks", "name", "parent", "parentPref", "taxRankName"]
df = pd.read_csv("data.tsv", sep="\t", names=headings)
from_wd_id = {}
import tqdm
import dataclasses
@dataclasses.dataclass
class Taxon:
    wd_id: str
    name: str
    parent: str
    wikipedia_id: str
    image: str
    taxRank: str
    children: list = dataclasses.field(default_factory=list)
    ranks: dict = dataclasses.field(default_factory=dict)

grouped = df.groupby("wikidata_id")
for wd_id, group in tqdm.tqdm(grouped):
    # get first for these:
    image = group["image"].iloc[0]
    wikipedia_id = group["wikipedia_id"].iloc[0]
    name = group["name"].iloc[0]
    taxRank = group["taxRankName"].iloc[0]

    # there is a single row in group:
    if len(group.parent.unique()) == 1:
        parent = group["parent"].iloc[0]
    else:
        # if there is a non-null parentPref, use that
        parentPref = group["parentPref"].unique()
        parentPref = parentPref[~pd.isnull(parentPref)]
        if len(parentPref) > 0:
            parent = parentPref[0]
        else:
            # else use the parent
            parent = group["parent"].iloc[0]
        
    taxon = Taxon(wd_id, name, parent, wikipedia_id, image, taxRank)
    from_wd_id[wd_id] = taxon




  df = pd.read_csv("data.tsv", sep="\t", names=headings)
100%|██████████| 3615774/3615774 [06:12<00:00, 9698.13it/s] 


In [2]:
to_include = set()
for taxon in from_wd_id.values():
    # wikipedia id not nan
    if not pd.isnull(taxon.image):
        to_include.add(taxon.wd_id)
# iteratively add parents
while True:
    to_add = set()
    for wd_id in to_include:
        taxon = from_wd_id[wd_id]
        if taxon.parent in from_wd_id and taxon.parent not in to_include:
            to_add.add(taxon.parent)
    if len(to_add) == 0:
        break
    to_include.update(to_add)
    print(len(to_add))

11109
1143
167
30
7
3
1


In [3]:
missings = 0
for wd_id, taxon in from_wd_id.items():
    if wd_id in to_include:
        try:
            parent = from_wd_id[taxon.parent]
            parent.children.append(taxon)
        except KeyError:
            print("no parent for", wd_id, taxon.name, taxon.parent)
            missings += 1
print("missings", missings)

no parent for <http://www.wikidata.org/entity/Q10719811> Weyrauchia peruviana@en <http://www.wikidata.org/entity/Q102318370>
no parent for <http://www.wikidata.org/entity/Q108225714> Dactyloidites@en <http://www.wikidata.org/entity/Q23012932>
no parent for <http://www.wikidata.org/entity/Q109914291> Phasmichnus radagasti@en <http://www.wikidata.org/entity/Q109914298>
no parent for <http://www.wikidata.org/entity/Q111398064> Hapsidophyllas@en _:u_351360f8ac54364e1255a14431845818
no parent for <http://www.wikidata.org/entity/Q112669720> Wathondara kotejai@en <http://www.wikidata.org/entity/Q112669745>
no parent for <http://www.wikidata.org/entity/Q11299399> Cryptista@en <http://www.wikidata.org/entity/statement/Q11299399-E20CC939-DDC9-4830-9DA3-8EBDA79B16B6>
no parent for <http://www.wikidata.org/entity/Q113991275> Parioscorpio@en <http://www.wikidata.org/entity/Q124147>
no parent for <http://www.wikidata.org/entity/Q1205126> Choanozoa@en <http://www.wikidata.org/entity/statement/Q120512

In [4]:
def as_newick(taxon):
    if len(taxon.children) == 0:
        return taxon.wd_id+":1"
    else:
        return f"({','.join(as_newick(child) for child in taxon.children)}){taxon.wd_id}:1"

for key, value in from_wd_id.items():
    # strip the <http://www.wikidata.org/entity/
    value.wd_id = value.wd_id.replace("<http://www.wikidata.org/entity/", "").replace(">", "")

root = from_wd_id["<http://www.wikidata.org/entity/Q2382443>"]
newick = as_newick(root)

In [5]:
def yield_all_descendants(taxon):
    yield taxon
    for child in taxon.children:
        yield from yield_all_descendants(child)

for starting_point in tqdm.tqdm(list(yield_all_descendants(root))):
    if starting_point.taxRank:
        for descendant in yield_all_descendants(starting_point):
            descendant.ranks[starting_point.taxRank] = starting_point.name

100%|██████████| 179780/179780 [00:05<00:00, 35418.38it/s]


In [6]:
from_wd_id["<http://www.wikidata.org/entity/Q1344556>"]

Taxon(wd_id='Q1344556', name='Duck-billed Buntingi@en', parent='<http://www.wikidata.org/entity/Q662946>', wikipedia_id='<https://en.wikipedia.org/wiki/Adrianichthys_kruyti>', image='<http://commons.wikimedia.org/wiki/Special:FilePath/Andrianichthys%20kruyti.jpg>', taxRank='species@en', children=[], ranks={'superdomain@en': 'biota@en', 'superkingdom@en': 'eukaryote@en', 'kingdom@en': 'animal@en', 'subkingdom@en': 'Bilateria@en', 'infrakingdom@en': 'deuterostome@en', 'phylum@en': 'Chordata@en', 'subphylum@en': 'Vertebrata@en', 'infraphylum@en': 'Gnathostomata@en', nan: 'Teleostomi@en', 'megaclass@en': 'Osteichthyes@en', 'superclass@en': 'Actinopterygii@en', 'order@en': 'Beloniformes@en', 'suborder@en': 'Adrianichthyoidei@en', 'family@en': 'ricefish@en', 'subfamily@en': 'Adrianichthyinae@en', 'genus@en': 'Adrianichthys@en', 'species@en': 'Duck-billed Buntingi@en'})

In [9]:
rows = []
for node in tqdm.tqdm(list(yield_all_descendants(root))):
    row = {}
    row["wd_id"] = node.wd_id
    for rank, name in node.ranks.items():
        # if rank or name is nan, skip
        if pd.isnull(rank) or pd.isnull(name):
            continue
        rank_fixed = rank.replace("@en", "")
        name_fixed = name.replace("@en", "")
        row[rank_fixed] = name_fixed
    rows.append(row)

import pandas as pd
rank_df = pd.DataFrame(rows)


100%|██████████| 179780/179780 [00:03<00:00, 51771.84it/s]


In [12]:
#replace any nans with ""
rank_df = rank_df.fillna("")

In [28]:

import gzip
# write to file
with open("wikidata-taxonomy.nwk", "wt") as f:
    f.write(newick)
    f.write(";")

In [14]:
df_out = df.copy()
# filter to to_include
df_out = df_out[df_out.wikidata_id.isin(to_include)]
#remove @en@ from labels
df_out["name"] = df_out["name"].str.replace("@en", "")
df_out["taxRankName"] = df_out["taxRankName"].str.replace("@en", "")
# remove <http://www.wikidata.org/entity/ from wikidata_id
df_out["wikidata_id"] = df_out["wikidata_id"].str.replace("<http://www.wikidata.org/entity/", "").str.replace(">", "")
# remove <https://en.wikipedia.org/wiki/ from wikipedia_id
df_out["wikipedia_id"] = df_out["wikipedia_id"].str.replace("<https://en.wikipedia.org/wiki/", "").str.replace(">", "")

import hashlib
import urllib
def get_wc_thumb(image, width=200): # image = e.g. from Wikidata, width in pixels
    if image == "":
        return ""
    if pd.isnull(image):
        return ""
    image = image.replace(' ', '_') # need to replace spaces with underline 
    m = hashlib.md5()
    m.update(image.encode('utf-8'))
    d = m.hexdigest()
    return "https://upload.wikimedia.org/wikipedia/commons/thumb/"+d[0]+'/'+d[0:2]+'/'+image+'/'+str(width)+'px-'+image

# strip the http://commons.wikimedia.org/wiki/Special:FilePath/ from image
# if image is nan set to ""
df_out.loc[pd.isnull(df_out.image), "image"] = ""
df_out["image_thumb"] = df_out["image"].str.replace("<http://commons.wikimedia.org/wiki/Special:FilePath/", "")
df_out["image_thumb"] = df_out["image_thumb"].str.replace(">", "")
#unurl encode
def unurlencode(s):
    # %20 to " ", etc.
    return urllib.parse.unquote(s)
df_out["image_thumb"] = df_out["image_thumb"].apply(unurlencode)
# add the thumb
df_out["image_thumb"] = df_out["image_thumb"].apply(get_wc_thumb)



  df_out["wikidata_id"] = df_out["wikidata_id"].str.replace("<http://www.wikidata.org/entity/", "").str.replace(">", "")
  df_out["wikipedia_id"] = df_out["wikipedia_id"].str.replace("<https://en.wikipedia.org/wiki/", "").str.replace(">", "")
  df_out["image_thumb"] = df_out["image"].str.replace("<http://commons.wikimedia.org/wiki/Special:FilePath/", "")


In [15]:
def format_wikilink(s):
    if pd.isnull(s):
        return ""
    if s == "":
        return ""
    return f"[{s}](https://en.wikipedia.org/wiki/{s})"
df_out["wikipedia_id"] = df_out["wikipedia_id"].apply(format_wikilink)

# capitalize the first letter of the name
df_out["name"] = df_out["name"].str.capitalize()
def format_wikidata(s):
    if pd.isnull(s):
        return ""
    if s == "":
        return ""
    return f"[{s}](https://www.wikidata.org/wiki/{s})"

df_out["wikidata_link"] = df_out["wikidata_id"].apply(format_wikidata)

In [16]:
# filter columns and rename to WikiPage, WikiData_id, ThumbnailURL, rank
df_out = df_out[["wikidata_id", "wikipedia_id", "image_thumb", "name", "taxRankName","wikidata_link"]]
df_out.columns = ["WikiData_id", "WikiPage", "ThumbnailURL", "name", "rank","WikidataLink"]



In [18]:
#make df_out unique by wikidata_id
df_out = df_out.drop_duplicates(subset="WikiData_id")
# merge with rank_df
df_out = df_out.merge(rank_df, left_on="WikiData_id", right_on="wd_id", how="left")
df_out.to_csv("wikidata-taxonomy.tsv", sep="\t", index=False)

In [19]:
rank_df.columns

Index(['wd_id', 'superdomain', 'domain', 'superkingdom', 'phylum', 'order',
       'family', 'genus', 'species', 'class', 'kingdom', 'superphylum',
       'subkingdom', 'infrakingdom', 'pathovar', 'subspecies', 'subphylum',
       'tribe', 'subclass', 'subfamily', 'variety', 'suborder', 'division',
       'infraphylum', 'superclass', 'superorder', 'subgenus', 'subdomain',
       'subdivision', 'superfamily', 'infraorder', 'form', 'section',
       'parvorder', 'infraclass', 'supertribe', 'clade', 'nothospecies',
       'magnorder', 'hyporder', 'subtribe', 'grandorder', 'mirorder',
       'superlegion', 'legion', 'infralegion', 'megaclass', 'cohort',
       'megacohort', 'supercohort', 'subcohort', 'infracohort', 'subsection',
       'series', 'ichnogenus', 'subterclass', 'epifamily', 'species group',
       'infratribe', 'nanorder', 'forma specialis', 'realm'],
      dtype='object')

In [47]:
#extra_cols = [x for x in list(rank_df.columns) if x != "wd_id" and " " not in x]
extra_cols = ["kingdom","phylum", "order", "family", "genus"]

In [49]:
command = f"""newick_to_taxonium -i wikidata-taxonomy.nwk -m wikidata-taxonomy.tsv -j config.json --key_column WikiData_id -c WikiPage,ThumbnailURL,name,rank,WikidataLink,{
    ",".join(extra_cols)
} -o out.jsonl.gz"""
import os
os.system(command)

Loading metadata file..
Metadata loaded


  config = json.load(open(config_file))


Ladderizing tree..
Ladderizing done
Setting x coordinates |████████████████████████████████████████| (!) 179780 in 0.2s (1081799.10/s) 
Normalising x coordinates |████████████████████████████████████████| 179780/179780 [100%] in 0.1s (1383505.75/s) 
Setting terminal y coordinates |████████████████████████████████████████| (!) 128831 in 0.1s (1143369.78/s) 
Setting internal y coordinates |████████████████████████████████████████| (!) 50949 in 0.1s (474609.38/s) 
Sorting on y |████████████████████████████████████████| (!) 179780 in 0.1s (1259269.29/s) 
Converting each node, and writing out in JSON |████████████████████████████████████████| 179780/179780 [100%] in 4.4s (40651.16/s) 
Done. Output written to out.jsonl.gz, with 179780 nodes.


  do_processing(args.input,


0