In [1]:
import init_project
PROJECT = init_project.PROJECT

from IPython.display import display, Markdown
import pandas as pd

M1 MacBook Air で 約 15 秒。M3 MacBook Air で約 7.5 秒。

In [2]:
COLUMNS = 'col:ID col:parentID col:status col:scientificName col:rank'.split()
CoL = pd.read_csv(PROJECT / 'data' / 'NameUsage.tsv', sep='\t', header=0, on_bad_lines='warn', usecols=COLUMNS).rename(columns=lambda x: x.split(':')[1]).query('status == "accepted"').drop(columns=['status'])
CoL.reset_index()
CoL

Unnamed: 0,ID,parentID,scientificName,rank
0,63W48,HFD,Trichosphaerium,genus
1,HFD,QM,Trichosidae,family
2,7C977,646LT,Thermococcus barossii,species
3,53FRF,6463M,Sulfurococcus mirabilis,species
4,42FG7,5QTK,Methanotorris igneus,species
...,...,...,...,...
5005929,C8Y6D,C8YWG,Anaspidoglanis,genus
5005930,C9MCZ,C9MCY,Dinophysis,genus
5005931,C9X59,622TP,Greenella,genus
5005932,CC2RH,7NLGN,Reddellobus,genus


In [3]:
ROOTS = {id for id in CoL[CoL.parentID.isna()].ID}
CoL[CoL.ID.isin(ROOTS)]

Unnamed: 0,ID,parentID,scientificName,rank
4997820,5T6MX,,Biota,unranked
5000408,V,,Viruses,unranked


# 木構造の構成

`build_tree()` は MacBook Air M1 で 1分8秒。一度実行すると3つの JSON ファイルを生成する。二度目以後は、JSON ファイルが存在する場合は、ファイルを読み込むだけだから高速で 11 秒。

M1 MacBook Air の Low Power Mode on のとき
1m 20s: Root Life: Biota, Viruses

M3 MacBook Air で約 51s。JSON の保存に約 10s

In [29]:
import json

def build_tree():
    ROOT = 'ROOT of LIFE'

    # CoL ID -> index の対応表
    index = {id: i for i, id in enumerate(CoL.ID)}
    index[ROOT] = -1

    # Life の木構造（MacBook Air M1 で約48秒）
    # 各頂点の先頭の要素はその頂点の親の ID、それ以降はその頂点の子

    lives = [{'n': n, 'name': name, 'parent': -1, 'children': []}
           for n, name in enumerate(CoL.scientificName)]
    lives.append({'n': -1, 'name': ROOT, 'parent': -1, 'children': []})  # ToL[-1]

    # 学名 -> index の対応表
    lookup = { life['name']: life['n'] for life in lives }

    orphans = []

    for _, life in CoL.iterrows():
        _life = lives[index[life.ID]]
        try:
            if pd.isna(life.parentID): print(f'Root of life: {life.scientificName}')
            parent = lives[index[life.parentID]]
            _life['parent'] = parent['n']
            parent['children'].append(_life['n'])
        except: orphans.append({ 'id': life.ID, 'name': life.scientificName }) # parentID が登録されていない Life は無視する

    return dict(lives=lives, index=index, lookup=lookup, orphans=orphans)

F_ToL = PROJECT / 'data' / 'tree_of_life.json'
ToL = build_tree()
with F_ToL.open('w') as w: json.dump(ToL, w)

Root of life: Biota
Root of life: Viruses


In [33]:
ToL['lives'][ToL['lookup']['Biota']]

{'n': 2429906,
 'name': 'Biota',
 'parent': -1,
 'children': [225, 8676, 306929, 2429995, 2431687, 2432069, 2436434]}

In [34]:
ToL['lives'][ToL['lookup']['Homo sapiens']]

{'n': 1576323, 'name': 'Homo sapiens', 'parent': 1571040, 'children': []}