# Process taxa names

In [1]:
import pandas as pd
from datetime import date
from pathlib import Path 
import numpy as np
import time
import requests

import sys
sys.path.append(str(Path.cwd().parent))

from scripts.normalize_taxa import (
    taxon_name_parser, 
    get_parent_taxa, 
    PBDB_TAXA_NAME,
    add_normalized_name_column
)
from scripts.normalize_data import print_df


In [3]:
# Load the draft taxa list.
# NOTE(review): taxa_draft_path is not defined in any cell visible here; it is
# presumably set by an earlier cell -- confirm before a Restart & Run All.
taxa_list_df = pd.read_csv(filepath_or_buffer=taxa_draft_path)
taxa_list_df.head(n=5)

Unnamed: 0,verbatim_name
0,Beella digitata
1,Candeina nitida
2,Dentoglobigerina altispira
3,Dentoglobigerina altispira _T_ _PL5
4,Dentoglobigerina altispira _T_ _PL5_


In [4]:
# Parse each non-missing verbatim name with taxon_name_parser and keep the
# original string next to the parsed parts under the 'verbatim_name' key.
taxa_list = []

for taxon in taxa_list_df['verbatim_name'].values:
    # Missing names cannot be parsed, so they are skipped.
    if pd.isna(taxon):
        continue

    taxon_name_parts = taxon_name_parser(taxon)
    taxon_name_parts['verbatim_name'] = taxon
    taxa_list.append(taxon_name_parts)

# Number of names that were parsed.
len(taxa_list)

141

In [5]:
# One row per parsed name; the keys of the parsed records become the columns.
new_df = pd.DataFrame(data=taxa_list)
print_df(new_df)

(141, 7)


Unnamed: 0,genus name,species name,verbatim_name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier
0,Beella,digitata,Beella digitata,,,,
1,Candeina,nitida,Candeina nitida,,,,
2,Dentoglobigerina,altispira,Dentoglobigerina altispira,,,,
3,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5,_T_,,,
4,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5_,_T_,,,


In [6]:
# Write the parsed names to taxa_draft_path (the same path variable the draft
# list was read from), without the DataFrame index.
new_df.to_csv(path_or_buf=taxa_draft_path, index=False)

## Add PBDB (Paleobiology Database) taxon IDs for genera

In [7]:
# Reload the draft file, reading every column as text (dtype=str).
taxa_df = pd.read_csv(filepath_or_buffer=taxa_draft_path, dtype=str)
print_df(taxa_df)

(141, 7)


Unnamed: 0,genus name,species name,verbatim_name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier
0,Beella,digitata,Beella digitata,,,,
1,Candeina,nitida,Candeina nitida,,,,
2,Dentoglobigerina,altispira,Dentoglobigerina altispira,,,,
3,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5,_T_,,,
4,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5_,_T_,,,


In [8]:
# Single-column frame holding each distinct genus name once, in order of
# first appearance in taxa_df.
genus_df = pd.DataFrame({'genus name': taxa_df['genus name'].unique()})

print_df(genus_df)

(18, 1)


Unnamed: 0,genus name
0,Beella
1,Candeina
2,Dentoglobigerina
3,Dextral:Sinistral
4,Globigerina


In [9]:
# For every genus in genus_df, request PBDB_TAXA_NAME + <genus name> and, when
# exactly one record comes back, store its taxon id / name / rank on that row
# and let get_parent_taxa (project helper) process the parent taxa.
# Genera with a non-200 response, or with zero or several records, keep
# empty PBDB columns.
REQUEST_DELAY_SECONDS = 0.25   # pause between consecutive requests
REQUEST_TIMEOUT_SECONDS = 30   # raise instead of hanging forever on a stalled connection

for index, row in genus_df.iterrows():

    # Lightweight progress indicator: print every 50th row index.
    if index % 50 == 0:
        print(index, end=' ')

    genus_name = row['genus name']

    # A missing genus name cannot be looked up and would break the string
    # concatenation below, so skip it without issuing a request.
    if pd.isna(genus_name):
        continue

    time.sleep(REQUEST_DELAY_SECONDS)

    url = PBDB_TAXA_NAME + genus_name

    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

    if response.status_code == 200:
        data = response.json()["records"]
        # Only an unambiguous (single-record) match is accepted.
        if len(data) == 1:
            genus_df.at[index, 'pbdb_taxon_id'] = str(data[0]["taxon_no"])
            genus_df.at[index, 'pbdb_taxon_name'] = data[0]["taxon_name"]
            genus_df.at[index, 'pbdb_taxon_rank'] = data[0]["taxon_rank"]

            # The fourth argument is the helper's starting round value (0).
            # It is passed as a literal so the built-in round() is not
            # shadowed in the notebook namespace.
            get_parent_taxa(genus_df, data[0]["parent_no"], data[0]["taxon_rank"], 0, index, None)

      

0 

In [10]:
# Inspect genus_df after the PBDB lookup loop (print_df is a project helper).
print_df(genus_df)

(18, 10)


Unnamed: 0,genus name,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Beella,951.0,Beella,genus,82191.0,Globigerinidae,288974.0,Foraminifera,212476.0,Rhizaria
1,Candeina,1053.0,Candeina,genus,422277.0,Candeinidae,288974.0,Foraminifera,212476.0,Rhizaria
2,Dentoglobigerina,1264.0,Dentoglobigerina,genus,82191.0,Globigerinidae,288974.0,Foraminifera,212476.0,Rhizaria
3,Dextral:Sinistral,,,,,,,,,
4,Globigerina,1498.0,Globigerina,genus,82191.0,Globigerinidae,288974.0,Foraminifera,212476.0,Rhizaria


In [11]:
# Save the genus lookup table, without the DataFrame index.
# NOTE(review): genus_path is not defined in any cell visible here -- confirm
# it is set by an earlier cell.
genus_df.to_csv(path_or_buf=genus_path, index=False)

In [12]:
# Attach the per-genus PBDB columns to every taxon row by joining on
# 'genus name' (pandas' default inner join).
merged_df = pd.merge(taxa_df, genus_df, on='genus name')

print_df(merged_df)

(141, 16)


Unnamed: 0,genus name,species name,verbatim_name,subspecies name,species modifier,non-taxa descriptor,subspecies modifier,pbdb_taxon_id,pbdb_taxon_name,pbdb_taxon_rank,family_taxon_id,family_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Beella,digitata,Beella digitata,,,,,951,Beella,genus,82191,Globigerinidae,288974,Foraminifera,212476,Rhizaria
1,Candeina,nitida,Candeina nitida,,,,,1053,Candeina,genus,422277,Candeinidae,288974,Foraminifera,212476,Rhizaria
2,Dentoglobigerina,altispira,Dentoglobigerina altispira,,,,,1264,Dentoglobigerina,genus,82191,Globigerinidae,288974,Foraminifera,212476,Rhizaria
3,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5,_T_,,,,1264,Dentoglobigerina,genus,82191,Globigerinidae,288974,Foraminifera,212476,Rhizaria
4,Dentoglobigerina,altispira,Dentoglobigerina altispira _T_ _PL5_,_T_,,,,1264,Dentoglobigerina,genus,82191,Globigerinidae,288974,Foraminifera,212476,Rhizaria


In [13]:
# Write the merged table to taxa_draft_path, without the DataFrame index.
merged_df.to_csv(path_or_buf=taxa_draft_path, index=False)