In [3]:
import pandas as pd
from datetime import date
from pathlib import Path 
import re

import sys
sys.path.append(str(Path.cwd().parent))

from scripts.normalize_data import (
    ddm2dec,
    print_df
)

from scripts.normalize_taxa import (
    add_normalized_name_column
)

In [8]:
# Input and output file locations, all relative to the parent of the
# directory the notebook is run from.
hole_path = Path('..', 'processed_data', 'Hole Summary_23_2_2021.csv')
approved_taxa_path = Path(
    '..', 'raw_data', 'PI_processed_files',
    'LIMS_Micropal_headers_PBDB_Taxonomy_notes_taxa_list_2022-02-22.csv',
)
taxa_draft_path = Path('..', 'processed_data', 'drafts', 'taxa_list.csv')
normalized_taxa_path = Path('..', 'processed_data', 'normalized_taxa_list.csv')
# Original, misspelled name of the same path; kept as an alias so cells that
# still reference it keep working.
noramalized_taxa_path = normalized_taxa_path


# Clean up hole coordinates

In [40]:
# Strings that read_csv should parse as missing values (NaN): empty cells and
# a lone '*'.
na_values = ['', '*']             
# Load the hole summary table and show its shape and first rows.
hole_df = pd.read_csv(hole_path, na_values=na_values)
print_df(hole_df)

(3821, 22)


Unnamed: 0,Exp,Site,Hole,Latitude,Longitude,Water depth (m),Penetration DSF (m),Cored interval (m),Recovered length (m),Recovery (%),...,Total cores (no.),APC cores (no.),HLAPC cores (no.),XCB cores (no.),RCB cores (no.),Other cores (no.),Date started (UTC),Date finished (UTC),Time on hole (days),Comments
0,1,1,,25 51.498 N,92 10.998 W,,86.3,77.2,58.7,76.04,...,9,0,0,0,8,1,10/23/1997 08:44,10/23/1997 08:44,0.0,
1,1,2,,23 27.3 N,92 35.2002 W,,26.5,26.5,12.1,45.66,...,5,0,0,0,5,0,10/23/1997 08:45,10/23/1997 08:45,0.0,
2,1,3,,23 1.8 N,92 2.598 W,,99.2,99.2,46.6,46.98,...,11,0,0,0,11,0,10/23/1997 08:45,10/23/1997 08:45,0.0,
3,1,4,,24 28.68 N,73 47.52 W,,36.7,36.7,12.9,35.15,...,4,0,0,0,4,0,10/23/1997 08:45,10/23/1997 08:45,0.0,
4,1,4,A,24 28.68 N,73 47.52 W,,18.2,18.2,5.7,31.32,...,2,0,0,0,2,0,10/23/1997 08:45,10/23/1997 08:45,0.0,


Remove bad rows: rows whose `Exp` is `TEST` or `999`, and rows with neither a latitude nor a longitude.

In [41]:
# Remove bad rows, then rows that have no coordinates at all.
#   - Rows whose Exp is 'TEST' or '999' are discarded.
#   - A row is dropped only when BOTH Latitude and Longitude are missing
#     (how='all'); a row with just one of them is kept.
# Exp is compared as text so the filter still matches if pandas parsed the
# column as integers instead of strings.
# The result is built as a fresh copy rather than with inplace=True on a
# filtered frame, so later cells can assign into hole_df without triggering
# pandas' SettingWithCopyWarning.
is_bad_exp = hole_df['Exp'].astype(str).isin(['TEST', '999'])
hole_df = (
    hole_df[~is_bad_exp]
    .dropna(axis='index', how='all', subset=['Latitude', 'Longitude'])
    .copy()
)

Fix typos in two latitude values.

In [42]:
# Corrected latitude strings, keyed by hole_df row label.
# NOTE(review): the labels are the default row numbers assigned when the CSV
# was read, so they only identify the intended holes while the file at
# hole_path keeps its original row order -- confirm before re-running on a
# rewritten file.
latitude_fixes = {
    2935: '38 49.7822 S',
    2964: '38 8.1532 S',
}

# `.loc[label, column] = value` silently appends a new row when the label does
# not exist, so fail loudly instead of adding a phantom hole.
missing_labels = [label for label in latitude_fixes if label not in hole_df.index]
if missing_labels:
    raise KeyError(f'rows to correct are missing from hole_df: {missing_labels}')

for label, latitude in latitude_fixes.items():
    hole_df.loc[label, 'Latitude'] = latitude

Add decimal-degree coordinates.

In [43]:
# Convert each coordinate string with ddm2dec and store the result in a
# companion "*_decimal" column next to the original text.
for source_col, decimal_col in (
    ('Latitude', 'Latitude_decimal'),
    ('Longitude', 'Longitude_decimal'),
):
    hole_df[decimal_col] = hole_df[source_col].apply(ddm2dec)

In [44]:
# Show the frame again to confirm the two *_decimal columns were added.
print_df(hole_df)

(3789, 24)


Unnamed: 0,Exp,Site,Hole,Latitude,Longitude,Water depth (m),Penetration DSF (m),Cored interval (m),Recovered length (m),Recovery (%),...,HLAPC cores (no.),XCB cores (no.),RCB cores (no.),Other cores (no.),Date started (UTC),Date finished (UTC),Time on hole (days),Comments,Latitude_decimal,Longitude_decimal
0,1,1,,25 51.498 N,92 10.998 W,,86.3,77.2,58.7,76.04,...,0,0,8,1,10/23/1997 08:44,10/23/1997 08:44,0.0,,25.8583,-92.1833
1,1,2,,23 27.3 N,92 35.2002 W,,26.5,26.5,12.1,45.66,...,0,0,5,0,10/23/1997 08:45,10/23/1997 08:45,0.0,,23.455,-92.58667
2,1,3,,23 1.8 N,92 2.598 W,,99.2,99.2,46.6,46.98,...,0,0,11,0,10/23/1997 08:45,10/23/1997 08:45,0.0,,23.03,-92.0433
3,1,4,,24 28.68 N,73 47.52 W,,36.7,36.7,12.9,35.15,...,0,0,4,0,10/23/1997 08:45,10/23/1997 08:45,0.0,,24.478,-73.792
4,1,4,A,24 28.68 N,73 47.52 W,,18.2,18.2,5.7,31.32,...,0,0,2,0,10/23/1997 08:45,10/23/1997 08:45,0.0,,24.478,-73.792


In [45]:
# Write the cleaned frame back to hole_path, without the index column.
# NOTE(review): hole_path is also the file this notebook reads at the start,
# so this overwrites the input. A second run would start from the already
# cleaned file, whose rows are renumbered on read; the hard-coded row labels
# used for the latitude typo fixes would then point at different rows.
# Confirm this is intended, or write to a separate output file.
hole_df.to_csv(hole_path, index=False)

## Create normalized taxa list

In [9]:

# Load the taxa list stored at approved_taxa_path; dtype=str reads every
# column as text.
all_taxa_df = pd.read_csv(approved_taxa_path, dtype=str)

# Preview the frame (the second argument appears to limit the rows shown to 2).
print_df(all_taxa_df, 2)

(4753, 32)


Unnamed: 0,taxon_group,verbatim_name,name,"name to use (if different from ""name"")",name comment field,Comment,notes,Any taxon above genus,genus modifier,genus name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,benthic_forams,Pyrite,Pyrite,,,"not a taxa; Going into Macrostrat, not PBDB",Summer 2020: deal with later; should go in as ...,,,,...,,,,,,,,,,
1,benthic_forams,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,group,"Summer 2020: enter ""Euuvigerina miozea"" and re...",,,Euuvigerina,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria


In [10]:
# Remove four columns that are not used further down, then collapse the rows
# that become exact duplicates once those columns are gone. Both steps modify
# all_taxa_df in place.
unused_columns = ['taxon_group', 'notes', 'name comment field', 'Comment']
all_taxa_df.drop(columns=unused_columns, inplace=True)
all_taxa_df.drop_duplicates(inplace=True)

print_df(all_taxa_df, 2)

(4606, 28)


Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Pyrite,Pyrite,,,,,,,,,...,,,,,,,,,,
1,Euuvigerina miozea (group) >100 m,Euuvigerina miozea (group) >100 m,Euuvigerina miozea,,,Euuvigerina,,,,miozea,...,,,,,,,288974.0,Foraminifera,212476.0,Rhizaria


In [11]:
# Read only the verbatim_name column of the draft taxa list, as text.
taxa_df = pd.read_csv(taxa_draft_path, dtype=str, usecols=['verbatim_name'])
print_df(taxa_df)

(114, 1)


Unnamed: 0,verbatim_name
0,Candeina nitida
1,Dentoglobigerina altispira _T_ _PL5
2,Dentoglobigerina altispira _T_ _PL5_
3,Dextral:Sinistral _P. obliquiloculata_
4,Dextral:Sinistral _P. praecursor_


In [12]:
# Inner-join the draft names to the taxa list on verbatim_name, the only
# column the two frames share. A verbatim name can match more than one row
# (e.g. the "Dextral:Sinistral" entries), so the result can have more rows
# than the draft list.
merged_df = pd.merge(taxa_df, all_taxa_df, how='inner', on='verbatim_name')
print_df(merged_df)

(117, 28)


Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,...,family_taxon_id,family_taxon_name,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name
0,Candeina nitida,Candeina nitida,,,,Candeina,,,,nitida,...,422277,Candeinidae,,,,,288974,Foraminifera,212476,Rhizaria
1,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira,,,Dentoglobigerina,,,,altispira,...,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
2,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira,,,Dentoglobigerina,,,,altispira,...,82191,Globigerinidae,,,,,288974,Foraminifera,212476,Rhizaria
3,Dextral:Sinistral _P. obliquiloculata_,Dextral P. obliquiloculata,,,,Pulleniatina,,,,obliquiloculata,...,82192,Globorotaliidae,,,,,288974,Foraminifera,212476,Rhizaria
4,Dextral:Sinistral _P. obliquiloculata_,Sinistral P. obliquiloculata,,,,Pulleniatina,,,,obliquiloculata,...,82192,Globorotaliidae,,,,,288974,Foraminifera,212476,Rhizaria


In [13]:
# First pass: add the helper's default output column to merged_df.
add_normalized_name_column(merged_df)

# Second pass: add 'basic_name', built with descriptors and modifiers turned off.
basic_name_options = {
    'include_descriptor': False,
    'include_modifier': False,
    'col_name': 'basic_name',
}
add_normalized_name_column(merged_df, **basic_name_options)

print_df(merged_df)

(117, 30)


Unnamed: 0,verbatim_name,name,"name to use (if different from ""name"")",Any taxon above genus,genus modifier,genus name,subgenera modifier,subgenera name,species modifier,species name,...,order_taxon_id,order_taxon_name,class_taxon_id,class_taxon_name,phylum_taxon_id,phylum_taxon_name,kingdom_taxon_id,kingdom_taxon_name,normalized_name,basic_name
0,Candeina nitida,Candeina nitida,,,,Candeina,,,,nitida,...,,,,,288974,Foraminifera,212476,Rhizaria,Candeina nitida,Candeina nitida
1,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira,,,Dentoglobigerina,,,,altispira,...,,,,,288974,Foraminifera,212476,Rhizaria,Dentoglobigerina altispira,Dentoglobigerina altispira
2,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira,,,Dentoglobigerina,,,,altispira,...,,,,,288974,Foraminifera,212476,Rhizaria,Dentoglobigerina altispira,Dentoglobigerina altispira
3,Dextral:Sinistral _P. obliquiloculata_,Dextral P. obliquiloculata,,,,Pulleniatina,,,,obliquiloculata,...,,,,,288974,Foraminifera,212476,Rhizaria,Pulleniatina obliquiloculata (dextral),Pulleniatina obliquiloculata
4,Dextral:Sinistral _P. obliquiloculata_,Sinistral P. obliquiloculata,,,,Pulleniatina,,,,obliquiloculata,...,,,,,288974,Foraminifera,212476,Rhizaria,Pulleniatina obliquiloculata (sinistral),Pulleniatina obliquiloculata


In [14]:
# Keep only the original label and the two derived name columns.
output_columns = ['verbatim_name', 'normalized_name', 'basic_name']
taxa_df = merged_df.loc[:, output_columns]
taxa_df

Unnamed: 0,verbatim_name,normalized_name,basic_name
0,Candeina nitida,Candeina nitida,Candeina nitida
1,Dentoglobigerina altispira _T_ _PL5,Dentoglobigerina altispira,Dentoglobigerina altispira
2,Dentoglobigerina altispira _T_ _PL5_,Dentoglobigerina altispira,Dentoglobigerina altispira
3,Dextral:Sinistral _P. obliquiloculata_,Pulleniatina obliquiloculata (dextral),Pulleniatina obliquiloculata
4,Dextral:Sinistral _P. obliquiloculata_,Pulleniatina obliquiloculata (sinistral),Pulleniatina obliquiloculata
...,...,...,...
112,Sphaeroidinella dehiscens sensu lato _B_,Sphaeroidinella s.l. dehiscens,Sphaeroidinella dehiscens
113,Sphaeroidinellopsis kochi _T,Sphaeroidinellopsis kochi,Sphaeroidinellopsis kochi
114,Sphaeroidinellopsis kochi _T_,Sphaeroidinellopsis kochi,Sphaeroidinellopsis kochi
115,Sphaeroidinellopsis seminulina _T_ _PL4,Sphaeroidinellopsis seminulina,Sphaeroidinellopsis seminulina


In [15]:
# Save the verbatim -> normalized/basic name table, without the index column.
# (The "noramalized" spelling matches the variable defined in the paths cell.)
taxa_df.to_csv(noramalized_taxa_path, index=False)