# Load the downloaded data and prepare it

In [1]:
import random

In [2]:
# Timestamp that is embedded in the file name of the downloaded archive.
timestamp = '2023-06-15T14:27:48UTC'

# Paths used by the rest of the notebook, all derived from DATA_DIR.
DATA_DIR = '../data'
ARCHIVE_DIR = DATA_DIR + '/archive'
archive_file = '{}/ncbi_taxonomy_taxdump_{}.tar.gz'.format(DATA_DIR, timestamp)

# Show the resolved archive path as the cell output.
archive_file

'../data/ncbi_taxonomy_taxdump_2023-06-15T14:27:48UTC.tar.gz'

In [3]:
# Uncomment and run the commands below if the archive has not been extracted yet

# !mkdir -p {ARCHIVE_DIR}
# !tar xzvf {archive_file} --directory {ARCHIVE_DIR}

In [4]:
# Readme.txt explains the format of the documents.
# We are interested in extracting, for every entry of rank 'species':
#   - the common name
#   - the scientific name

In [5]:
# Tax ids of every entry whose rank is 'species'; starts out empty.
rank_species_tax_ids = list()

In [6]:
# Collect the tax id of every nodes.dmp row whose rank is 'species'.
#
# Each row is split on "\t|\t"; the first field is read as the tax id and the
# third as the rank, the remaining fields are ignored.
#
# The list is rebuilt from scratch here (instead of appending to the list
# created in an earlier cell) so that re-running this cell on its own never
# produces duplicate ids. The encoding is given explicitly so the result does
# not depend on the platform's locale default.
with open(f'{ARCHIVE_DIR}/nodes.dmp', encoding='utf-8') as nodes:
    rank_species_tax_ids = [
        tax_id
        for tax_id, _unused, rank, *_rest in (line.split("\t|\t") for line in nodes)
        if rank == 'species'
    ]

In [7]:
# Number of tax ids collected with rank 'species'.
len(rank_species_tax_ids)

2053178

In [8]:
# Peek at the first ten collected tax ids (kept as strings, as read from the file).
rank_species_tax_ids[:10]

['7', '9', '11', '14', '17', '19', '21', '23', '24', '25']

In [9]:
# Collect all the common and scientific names

In [10]:
# Lookup tables keyed by tax id; both start out empty.
tax_id_common_name = dict()
tax_id_scientific_name = dict()

In [11]:
# Fill the tax id -> name lookup tables from names.dmp.
#
# Each row is split on "\t|\t" into exactly four fields (a row with a different
# field count raises ValueError on unpacking). The first field is the tax id,
# the second the name and the fourth the name type; the third is not used.
#
# The encoding is given explicitly so that reading does not depend on the
# platform's locale default.
with open(f"{ARCHIVE_DIR}/names.dmp", encoding="utf-8") as names:
    for line in names:
        # `_unused` rather than `_`, which IPython reserves for the last cell output.
        (tax_id, name, _unused, _name_type) = line.split("\t|\t")
        # The last field still carries the row terminator; strip it off.
        name_type = _name_type.rstrip("\t|\n")

        if name_type in ('common name', 'genbank common name'):
            # Both name types share one table: if a tax id has several such
            # rows, the last one read wins.
            tax_id_common_name[tax_id] = name
        elif name_type == 'scientific name':
            tax_id_scientific_name[tax_id] = name

In [12]:
# Number of tax ids that have a common name and a scientific name, respectively.
len(tax_id_common_name), len(tax_id_scientific_name)

(36813, 2509799)

In [13]:
!grep 'common name' {ARCHIVE_DIR}/names.dmp | wc -l

44984


In [14]:
!grep 'genbank common name' {ARCHIVE_DIR}/names.dmp | wc -l

30337


In [15]:
!grep 'scientific name' {ARCHIVE_DIR}/names.dmp | wc -l

2509799


In [16]:
# Build four flavours of the species name list:
#   names_both                    - "Scientific (common)", only for species that have a common name
#   names_only_common             - the bare common name, for the same species as names_both
#   names_only_scientific         - the bare scientific name, for every species
#   names_scientific_maybe_common - every species; the common name is appended in
#                                   parentheses when one exists
names_both = []
names_only_common = []
names_only_scientific = []
names_scientific_maybe_common = []

for tax_id in rank_species_tax_ids:
    sci_name = tax_id_scientific_name.get(tax_id)
    common_name = tax_id_common_name.get(tax_id)

    if sci_name is None:
        # Every species is expected to have a scientific name. Raise a real
        # exception: raising a plain string is itself a TypeError in Python 3
        # and would hide which tax id was at fault.
        raise RuntimeError(f"should not happen: species tax id {tax_id!r} has no scientific name")

    names_only_scientific.append(sci_name)

    if common_name is None:
        names_scientific_maybe_common.append(sci_name)
    else:
        combined = f'{sci_name} ({common_name})'
        names_only_common.append(common_name)
        names_both.append(combined)
        names_scientific_maybe_common.append(combined)

# Sizes of the four lists, shown as the cell output.
len(names_both), len(names_only_common), len(names_only_scientific), len(names_scientific_maybe_common)

(33335, 33335, 2053178, 2053178)

In [17]:
# Ten random "Scientific (common)" entries. No random seed is set in the cells
# shown here, so the sample differs between runs.
random.sample(names_both, 10)

['Atropus hedlandensis (bumpnose trevally)',
 'Naso lopezi (elongate unicornfish)',
 'Euphorbia serrata (saw-tooth spurge)',
 'Ichthyophis tricolor (three-colored caecilian)',
 'Amphiprion akindynos (barrier reef anemonefish)',
 'Hipparchia cretica (Cretan greyling)',
 'Bombus lucorum (white-tailed bumblebee)',
 'Soleichthys serpenpellis (snakeskin sole)',
 "Heterixalus boettgeri (Boettiger's reed frog)",
 'Ctenochaetus truncatus (squaretail bristletooth tang)']

In [18]:
# Ten random common names. No random seed is set in the cells shown here, so
# the sample differs between runs.
random.sample(names_only_common, 10)

['Atlantic lizardfish',
 'ornate tree toad',
 'redspot cardinalfish',
 "Jan's cliff racer",
 'longfin smelt',
 'black-tailed whistler',
 'black-streaked puffbird',
 'red-tailed snakehead',
 'South American leaf blight of rubber trees',
 'Trinidad piping guan']

In [19]:
# Ten random scientific names. No random seed is set in the cells shown here,
# so the sample differs between runs.
random.sample(names_only_scientific, 10)

['Schizoseris multifoliata',
 'Paropsis variolosa',
 'Cochylis tasmaniana',
 'Cecidomyiidae sp. BIOUG20703-B03',
 'Myxobolus sp. GZP-2018-Samsun3',
 'Anystidae sp. BIOUG16068-B12',
 'Myanmarorchestia peterjaegeri',
 'Ampedus quercicola',
 'Eupelmus cerris',
 'Rahnella sp. J78']

In [20]:
# Ten random entries from the combined list. No random seed is set in the cells
# shown here, so the sample differs between runs.
random.sample(names_scientific_maybe_common, 10)

['Micrococcus sp. Eur1 9.1',
 'Nocardiopsis sp. M5S13',
 'Oppiella sp. BIOUG27128-B06',
 'Vibrio sp. HQW7',
 'Alviniconcha sp. Al-I-5',
 'Pantoea sp. SAP72_2',
 'Empidinae sp. BIOUG27353-D02',
 'Pseudomonas sp. M5-33',
 'Suctobelbella sp. BIOUG25049-B12',
 'Eucera spectabilis']

In [21]:
# Write the "Scientific (common)" list to disk, one entry per line (no trailing
# newline after the last entry). The encoding is given explicitly so the file
# content does not depend on the platform's locale default.
with open(f"{DATA_DIR}/species_names_both.txt", "w", encoding="utf-8") as out_file:
    out_file.write('\n'.join(names_both))

# Optional outputs for the other three lists, left disabled.

# with open(f"{DATA_DIR}/species_names_only_common.txt", "w", encoding="utf-8") as out_file:
#     out_file.write('\n'.join(names_only_common))

# with open(f"{DATA_DIR}/species_names_only_scientific.txt", "w", encoding="utf-8") as out_file:
#     out_file.write('\n'.join(names_only_scientific))

# with open(f"{DATA_DIR}/species_names_scientific_maybe_common.txt", "w", encoding="utf-8") as out_file:
#     out_file.write('\n'.join(names_scientific_maybe_common))

In [22]:
!ls -alh {DATA_DIR}

total 61M
drwxr-xr-x 4 containeruser users 4.0K Jun 15 18:41 .
drwxr-xr-x 5 containeruser users 4.0K Jun 15 16:36 ..
drwxr-xr-x 2 containeruser users 4.0K Jun 15 18:41 .ipynb_checkpoints
drwxr-xr-x 3 containeruser users 4.0K Jun 15 18:14 archive
-rw-r--r-- 1 containeruser users  59M Jun 15 16:27 ncbi_taxonomy_taxdump_2023-06-15T14:27:48UTC.tar.gz
-rw-r--r-- 1 containeruser users 1.3M Jun 15 18:41 species_names_both.txt


In [23]:
# At 1.3M, species_names_both.txt is roughly the same size as the
# tinyshakespeare dataset, so we should be good to go.