# process taxa names

In [1]:
import pandas as pd
from datetime import date
from pathlib import Path 
import numpy as np
import time
import requests

import sys
sys.path.append(str(Path.cwd().parent))

from scripts.normalize_taxa import (
    taxon_name_parser, 
    get_parent_taxa, 
    PBDB_TAXA_NAME,
    add_normalized_name_column
)
from scripts.normalize_data import print_df


In [2]:
# Draft outputs of this notebook: the taxa list and the per-genus lookup table.
taxa_draft_path = Path('..') / 'processed_data' / 'drafts' / 'taxa_list.csv'
genus_path = Path('..') / 'processed_data' / 'drafts' / 'genera.csv'

# Previously processed taxa list that supplies the name-part columns used for
# the search file.
approved_taxa_path = (
    Path('..')
    / 'raw_data'
    / 'PI_processed_files'
    / 'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_2022-02-22.csv'
)
# NOTE(review): this path uses 'draft' while the two draft paths above use
# 'drafts' -- confirm the directory name is intentional.
taxa_search_path = Path('..') / 'processed_data' / 'draft' / 'taxa_list_search.csv'


## add name parts columns to taxa file

In [3]:
# Load the draft taxa list; its 'verbatim_name' column is parsed into name
# parts in the next cell.
taxa_list_df = pd.read_csv(taxa_draft_path)
taxa_list_df.head()

Unnamed: 0,verbatim_name
0,Beella digitata
1,Candeina nitida
2,Dentoglobigerina altispira
3,Dentoglobigerina altispira _T_ _PL5
4,Dentoglobigerina altispira _T_ _PL5_


In [4]:
# Parse every non-missing verbatim name into its name parts and keep the
# original string next to the parsed fields.
taxa_list = []

for verbatim in taxa_list_df['verbatim_name'].dropna():
    name_parts = taxon_name_parser(verbatim)
    name_parts['verbatim_name'] = verbatim
    taxa_list.append(name_parts)

len(taxa_list)

141

In [5]:
# One row per parsed name; name parts that a given record lacks are left empty.
new_df = pd.DataFrame(taxa_list)
print_df(new_df)

(141, 7)


Unnamed: 0,genus name,species name,verbatim_name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier
0,Beella,digitata,Beella digitata,,,,
1,Candeina,nitida,Candeina nitida,,,,
2,Dentoglobigerina,altispira,Dentoglobigerina altispira,,,,
3,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5,_T_,,,
4,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5_,_T_,,,


In [6]:
# Overwrites the draft taxa list that was read at the start of this section,
# now with the parsed name-part columns; index=False omits the row index.
new_df.to_csv(taxa_draft_path, index=False)

## add pbdb taxon ids for genera

In [7]:
# Reload the draft taxa list with every column read as a string.
taxa_df = pd.read_csv(taxa_draft_path, dtype=str)
print_df(taxa_df)

(141, 7)


Unnamed: 0,genus name,species name,verbatim_name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier
0,Beella,digitata,Beella digitata,,,,
1,Candeina,nitida,Candeina nitida,,,,
2,Dentoglobigerina,altispira,Dentoglobigerina altispira,,,,
3,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5,_T_,,,
4,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5_,_T_,,,


In [8]:
# One row per distinct 'genus name', in order of first appearance, with a
# fresh 0..n-1 index.
genus_df = (
    taxa_df[['genus name']]
    .drop_duplicates()
    .reset_index(drop=True)
)

print_df(genus_df)

(18, 1)


Unnamed: 0,genus name
0,Beella
1,Candeina
2,Dentoglobigerina
3,Dextral:Sinistral
4,Globigerina


In [9]:
# Query the PBDB_TAXA_NAME endpoint once per genus and, when exactly one record
# comes back, store its taxon id / name / rank on the genus row and let
# get_parent_taxa() fill in the parent-taxa columns.
for index, row in genus_df.iterrows():
    genus_name = row['genus name']

    # A missing genus is a float NaN; concatenating it to the URL string would
    # raise TypeError, so skip it without spending a request.
    if pd.isna(genus_name):
        continue

    # Short pause before every request (presumably to be polite to the
    # remote service -- confirm the intended rate).
    time.sleep(0.25)

    # Lightweight progress indicator.
    if index % 50 == 0:
        print(index, end=' ')

    url = PBDB_TAXA_NAME + genus_name

    # Without a timeout a single stalled connection would hang the loop
    # indefinitely; on timeout requests raises and the cell stops visibly.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        continue

    # Treat a payload without "records" the same as "no match".
    data = response.json().get("records", [])

    # Only unambiguous matches (exactly one record) are accepted.
    if len(data) != 1:
        continue

    record = data[0]
    # NOTE(review): the saved output of the following cell shows these ids as
    # floats (e.g. 951.0) despite the str() -- confirm the dtype that ends up
    # in the written CSV.
    genus_df.at[index, 'pbdb_taxon_id'] = str(record["taxon_no"])
    genus_df.at[index, 'pbdb_taxon_name'] = record["taxon_name"]
    genus_df.at[index, 'pbdb_taxon_rank'] = record["taxon_rank"]

    # Named lookup_round (not `round`) so the builtin round() is not shadowed
    # for the rest of the kernel session.
    lookup_round = 0
    get_parent_taxa(genus_df, record["parent_no"], record["taxon_rank"], lookup_round, index, None)

      

0 

In [10]:
# Preview the genus table after the lookup; genera without a single matching
# record keep empty values in the pbdb_* and parent-taxa columns.
print_df(genus_df)

(18, 10)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Beella,951.0,Beella,genus,82191.0,Globigerinidae,288974.0,Foraminifera,212476.0,Rhizaria
1,Candeina,1053.0,Candeina,genus,422277.0,Candeinidae,288974.0,Foraminifera,212476.0,Rhizaria
2,Dentoglobigerina,1264.0,Dentoglobigerina,genus,82191.0,Globigerinidae,288974.0,Foraminifera,212476.0,Rhizaria
3,Dextral:Sinistral,,,,,,,,,
4,Globigerina,1498.0,Globigerina,genus,82191.0,Globigerinidae,288974.0,Foraminifera,212476.0,Rhizaria


In [11]:
# Persist the genus lookup table; index=False omits the row index.
genus_df.to_csv(genus_path, index=False)

In [12]:
# Attach the per-genus lookup columns to every taxon row that shares the same
# 'genus name' (inner join, which is pandas' default).
merged_df = pd.merge(taxa_df, genus_df, how='inner', on='genus name')

print_df(merged_df)

(141, 16)


Unnamed: 0,genus name,species name,verbatim_name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Beella,digitata,Beella digitata,,,,,951,Beella,genus,82191,Globigerinidae,288974,Foraminifera,212476,Rhizaria
1,Candeina,nitida,Candeina nitida,,,,,1053,Candeina,genus,422277,Candeinidae,288974,Foraminifera,212476,Rhizaria
2,Dentoglobigerina,altispira,Dentoglobigerina altispira,,,,,1264,Dentoglobigerina,genus,82191,Globigerinidae,288974,Foraminifera,212476,Rhizaria
3,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5,_T_,,,,1264,Dentoglobigerina,genus,82191,Globigerinidae,288974,Foraminifera,212476,Rhizaria
4,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5_,_T_,,,,1264,Dentoglobigerina,genus,82191,Globigerinidae,288974,Foraminifera,212476,Rhizaria


In [13]:
# Overwrite the draft taxa list again, now including the genus lookup columns.
merged_df.to_csv(taxa_draft_path, index=False)

## create csv to search for taxa

In [14]:
# Load the approved taxa list with every column read as a string.
all_taxa_df = pd.read_csv(approved_taxa_path, dtype=str)

# The second argument appears to limit the preview to 2 rows -- see print_df.
print_df(all_taxa_df, 2)

(4753, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
1,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria


In [15]:
# Remove the taxon_group / notes / comment columns, then drop rows that are
# exact duplicates across the remaining columns.
# Built as a new frame with errors='ignore' so the cell can be re-run safely:
# the previous `del` statements raised KeyError on a second execution because
# the columns were already gone.
all_taxa_df = (
    all_taxa_df
    .drop(
        columns=['taxon_group', 'notes', 'name comment field', 'Comment'],
        errors='ignore',
    )
    .drop_duplicates()
)

print_df(all_taxa_df, 2)

(4606, 28)


Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Pyrite,Pyrite,,,,,,,,,...,,,,,,,,,,
1,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,,Euuvigerina,,,,miozea,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria


In [16]:
# Rebind taxa_df to just the 'verbatim_name' column of the draft taxa list, so
# the merge below joins on that single column.
taxa_df = pd.read_csv(taxa_draft_path, dtype=str, usecols=['verbatim_name'])


In [17]:
# Join the draft names to the approved list on their only shared column.
# NOTE(review): the saved outputs go from 141 draft rows to 144 merged rows,
# which suggests some verbatim_name values occur more than once in the
# approved list -- confirm the extra rows are expected.
merged_df = pd.merge(taxa_df, all_taxa_df, how='inner', on='verbatim_name')
print_df(merged_df)

(144, 28)


Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Beella digitata,Beella digitata,,,,Beella,,,,digitata,...,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
1,Candeina nitida,Candeina nitida,,,,Candeina,,,,nitida,...,422277,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria
2,Dentoglobigerina altispira,Dentoglobigerina altispira,,,,Dentoglobigerina,,,,altispira,...,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
3,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira,,,Dentoglobigerina,,,,altispira,...,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira,,,Dentoglobigerina,,,,altispira,...,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria


In [18]:
# Both calls add a column to merged_df in place (no return value is used).
# The first uses the helper's defaults; the saved output shows the new column
# as 'normalized_name'.
add_normalized_name_column(merged_df)
# The second writes to 'basic_name' with descriptors and modifiers excluded.
add_normalized_name_column(merged_df, include_descriptor=False, 
                           include_modifier=False, col_name='basic_name')

print_df(merged_df)

(144, 30)


Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,normalized_name,basic_name
0,Beella digitata,Beella digitata,,,,Beella,,,,digitata,...,,,,,288974,Foraminifera,212476,Rhizaria,Beella digitata,Beella digitata
1,Candeina nitida,Candeina nitida,,,,Candeina,,,,nitida,...,,,,,288974,Foraminifera,212476,Rhizaria,Candeina nitida,Candeina nitida
2,Dentoglobigerina altispira,Dentoglobigerina altispira,,,,Dentoglobigerina,,,,altispira,...,,,,,288974,Foraminifera,212476,Rhizaria,Dentoglobigerina altispira,Dentoglobigerina altispira
3,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira,,,Dentoglobigerina,,,,altispira,...,,,,,288974,Foraminifera,212476,Rhizaria,Dentoglobigerina altispira,Dentoglobigerina altispira
4,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira,,,Dentoglobigerina,,,,altispira,...,,,,,288974,Foraminifera,212476,Rhizaria,Dentoglobigerina altispira,Dentoglobigerina altispira


In [19]:
# Keep only the three name columns needed for the search file.
taxa_df = merged_df.loc[:, ['verbatim_name', 'normalized_name', 'basic_name']]
taxa_df

Unnamed: 0,verbatim_name,normalized_name,basic_name
0,Beella digitata,Beella digitata,Beella digitata
1,Candeina nitida,Candeina nitida,Candeina nitida
2,Dentoglobigerina altispira,Dentoglobigerina altispira,Dentoglobigerina altispira
3,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira,Dentoglobigerina altispira
4,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira,Dentoglobigerina altispira
...,...,...,...
139,Sphaeroidinellopsis seminulina _T_ _PL4,Sphaeroidinellopsis seminulina,Sphaeroidinellopsis seminulina
140,Sphaeroidinellopsis seminulina _T_ _PL4_,Sphaeroidinellopsis seminulina,Sphaeroidinellopsis seminulina
141,Sphaeroidinellopsis subdehiscens,Sphaeroidinellopsis subdehiscens,Sphaeroidinellopsis subdehiscens
142,Turborotalita humilis,Turborotalita humilis,Turborotalita humilis


In [20]:
# Write the search table (verbatim, normalized and basic names) without the
# row index.
taxa_df.to_csv(taxa_search_path, index=False)