In [1]:
from pathlib import Path

from tqdm.notebook import tqdm

In [2]:
# Read the reference name list: one entry per line, surrounding whitespace removed.
with Path("untitled1.txt").open() as ifh:
    tolnames = [raw_line.strip() for raw_line in ifh]

In [3]:
# Peek at the first rows of the dump to see its pipe-delimited column layout.
!head new_taxdump/rankedlineage.dmp

1	|	root	|		|		|		|		|		|		|		|		|
131567	|	cellular organisms	|		|		|		|		|		|		|		|		|
2157	|	Archaea	|		|		|		|		|		|		|		|		|
1935183	|	Asgard group	|		|		|		|		|		|		|		|	Archaea	|
2798909	|	Candidatus Baldrarchaeota	|		|		|		|		|		|		|		|	Archaea	|
2798916	|	Candidatus Baldrarchaeia	|		|		|		|		|		|	Candidatus Baldrarchaeota	|		|	Archaea	|
2798922	|	Candidatus Baldrarchaeales	|		|		|		|		|	Candidatus Baldrarchaeia	|	Candidatus Baldrarchaeota	|		|	Archaea	|
2798928	|	Candidatus Baldrarchaeaceae	|		|		|		|	Candidatus Baldrarchaeales	|	Candidatus Baldrarchaeia	|	Candidatus Baldrarchaeota	|		|	Archaea	|
2798934	|	Candidatus Baldrarchaeum	|		|		|	Candidatus Baldrarchaeaceae	|	Candidatus Baldrarchaeales	|	Candidatus Baldrarchaeia	|	Candidatus Baldrarchaeota	|		|	Archaea	|
2798940	|	Candidatus Baldrarchaeum yapensis	|		|	Candidatus Baldrarchaeum	|	Candidatus Baldrarchaeaceae	|	Candidatus Baldrarchaeales	|	Candidatus Baldrarchaeia	|	Candidatus Baldrarchaeota	|		|	Archaea	|


In [4]:
def is_excluded(str):
    """Tell whether a taxon name must be left out.

    The check is a case-sensitive substring test: the name is excluded as
    soon as it contains any one of the marker strings listed below.

    Args:
        str: the taxon name to inspect (the parameter name shadows the
            builtin; it is kept so existing callers keep working).

    Returns:
        True if at least one marker occurs in the name, False otherwise.
    """
    markers = (
        "Candidatus", "sp.", "unclassified", "samples", "unidentified",
        "uncultured", "symbiont", " x ", "cf.", "-like",
    )
    return any(marker in str for marker in markers)

# Two-word taxon names collected from the ranked lineage dump.
genus_species = []

# Iterate over the file handle so the dump is streamed line by line instead of
# being loaded into memory in one go.
with Path("new_taxdump/rankedlineage.dmp").open() as ifh:
    for line in ifh:
        parts = [field.strip() for field in line.split("|")]
        if len(parts) < 4:
            # Blank or truncated row: the species/genus columns are missing,
            # so there is nothing to evaluate (and indexing would fail).
            continue
        tax_name, species, genus = parts[1], parts[2], parts[3]
        # no species, but genus exists -> candidate Genus/species pair
        if species or not genus:
            continue
        # Keep only non-excluded names made of exactly two whitespace-separated words.
        if not is_excluded(tax_name) and len(tax_name.split()) == 2:
            genus_species.append(tax_name)

In [5]:
# Number of candidate names collected from the dump.
len(genus_species)

553346

In [6]:
# Spot-check a slice of the collected names (indices 1000-1999).
genus_species[1000:2000]

['Helicobacter cinaedi',
 'Helicobacter colisuis',
 'Helicobacter cynogastricus',
 'Helicobacter delphinicola',
 'Helicobacter didelphidarum',
 'Helicobacter enhydrae',
 'Helicobacter equorum',
 'Helicobacter felis',
 'Helicobacter fennelliae',
 'Helicobacter ganmani',
 'Helicobacter heilmannii',
 'Helicobacter hepaticus',
 'Helicobacter himalayensis',
 'Helicobacter jaachi',
 'Helicobacter japonicus',
 'Helicobacter kayseriensis',
 'Helicobacter kumamotonensis',
 'Helicobacter labacensis',
 'Helicobacter labetoulli',
 'Helicobacter macacae',
 'Helicobacter magdeburgensis',
 'Helicobacter marmotae',
 'Helicobacter mastomyrinus',
 'Helicobacter mehlei',
 'Helicobacter mesocricetorum',
 'Helicobacter monodelphidis',
 'Helicobacter muricola',
 'Helicobacter muridarum',
 'Helicobacter mustelae',
 'Helicobacter pametensis',
 'Helicobacter peregrinus',
 'Helicobacter pullorum',
 'Helicobacter pylori',
 'Helicobacter rappini',
 'Helicobacter rodentium',
 'Helicobacter saguini',
 'Helicobacter

In [7]:
matched_names = []

# Join every reference name into one newline-separated string so that each
# candidate needs a single substring search rather than a Python-level loop
# over all of tolnames. The entries of tolnames are stripped lines and the
# candidates come from single dump lines, so neither side contains a newline
# and a hit can never straddle two reference names.
tol_haystack = "\n".join(tolnames)

for name in tqdm(genus_species):
    # Reference names are matched with underscores in place of spaces.
    testname = name.replace(" ", "_")
    # One membership test per candidate: a name is recorded at most once, even
    # when several reference names contain it (the per-tolname loop appended
    # it once per hit, inflating the count and duplicating output lines).
    if testname in tol_haystack:
        matched_names.append(testname)

len(matched_names)

  0%|          | 0/553346 [00:00<?, ?it/s]

1203

In [8]:
# Preview the first 100 matched names in sorted order.
sorted(matched_names)[:100]

['Abiotrophia_defectiva',
 'Acanthamoeba_castellanii',
 'Acaricomes_phytoseiuli',
 'Acaryochloris_marina',
 'Acetivibrio_cellulolyticus',
 'Acetobacter_pasteurianus',
 'Acetobacterium_woodii',
 'Acetohalobium_arabaticum',
 'Acetonema_longum',
 'Acholeplasma_laidlawii',
 'Achromobacter_xylosoxidans',
 'Acidaminobacter_hydrogenoformans',
 'Acidaminococcus_fermentans',
 'Acidianus_hospitalis',
 'Acidilobus_saccharovorans',
 'Acidimicrobium_ferrooxidans',
 'Acidiphilium_multivorum',
 'Acidithiobacillus_ferrivorans',
 'Acidobacterium_capsulatum',
 'Acidocella_facilis',
 'Acidothermus_cellulolyticus',
 'Acinetobacter_baumannii',
 'Actibacterium_mucosum',
 'Actinobacillus_pleuropneumoniae',
 'Actinokineospora_inagensis',
 'Actinomadura_madurae',
 'Actinomyces_naeslundii',
 'Actinomycetospora_chiangmaiensis',
 'Actinoplanes_missouriensis',
 'Actinopolymorpha_alba',
 'Actinopolyspora_mortivallis',
 'Actinospica_robiniae',
 'Actinosynnema_mirum',
 'Actinotalea_ferrariae',
 'Acyrthosiphon_pisum',

In [9]:
# Save the matches in sorted order, one per line, with underscores turned back into spaces.
with Path("TOL_matches.txt").open("w") as ofh:
    ofh.writelines(
        matched.replace("_", " ") + "\n" for matched in sorted(matched_names)
    )