#### Paleo Analysis and Parsing of the Paleontological Database
Data extracted from *The Paleobiology Database*															


In [111]:
# library imports
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib as mpl
import requests as rq
import io

In [112]:
# Download dinosaur occurrences from the Paleobiology Database as CSV.
# Query parameters: base_name=Dinosauria, species-level resolution, certain
# identifications only, ages between 252 and 65 Ma, plus class/coords/loc fields.
pbdb_query = 'https://paleobiodb.org/data1.2/occs/list.csv?base_name=Dinosauria&taxon_reso=species&idqual=certain&pres=regular&max_ma=252&min_ma=65&show=class,coords,loc,acconly'

# A timeout stops the notebook from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# body be parsed as if it were CSV.
response = rq.get(pbdb_query, timeout=60)
response.raise_for_status()

# NOTE: despite its name, `url` holds the raw response bytes (name kept for compatibility).
url = response.content

# Parse the CSV and drop the columns this notebook does not use.
occ = pd.read_csv(io.StringIO(url.decode('utf-8'))).drop(columns=['early_interval', 'late_interval', 'occurrence_no', 'record_type', 'reid_no', 'flags', 'collection_no', 'accepted_rank', 'accepted_no', 'reference_no', 'phylum', 'class', 'county', 'latlng_basis', 'latlng_precision', 'geogscale', 'geogcomments'])

# Positional rename: this relies on exactly ten columns remaining, in this order.
occ.columns = ['Species', 'Max MYA', 'Min MYA', 'Order', 'Family', 'Genus', 'Longitude', 'Latitude', 'Country', 'State'] # County may need to be added later

# USE THIS BLOCK TO CHANGE THE VALUES FOR WHEN FAMILY OR ORDER IS UNKNOWN
#occ.loc[occ['Family'] == 'NO_FAMILY_SPECIFIED', 'Family'] = 'Unknown Family'
#occ.loc[occ['Order'] == 'NO_ORDER_SPECIFIED', 'Order'] = 'Unknown Order'

occ.head()

Unnamed: 0,Species,Max MYA,Min MYA,Order,Family,Genus,Longitude,Latitude,Country,State
0,Chaoyangsaurus youngi,152.2,132.6,NO_ORDER_SPECIFIED,Chaoyangsauridae,Chaoyangsaurus,123.966698,42.9333,CN,Liaoning
1,Protarchaeopteryx robusta,125.77,119.5,Theropoda,NO_FAMILY_SPECIFIED,Protarchaeopteryx,120.73333,41.799999,CN,Liaoning
2,Caudipteryx zoui,125.77,119.5,Theropoda,NO_FAMILY_SPECIFIED,Caudipteryx,120.73333,41.799999,CN,Liaoning
3,Gorgosaurus libratus,83.6,72.1,Theropoda,Tyrannosauridae,Gorgosaurus,-111.528732,50.740726,CA,Alberta
4,Gorgosaurus libratus,83.6,72.1,Theropoda,Tyrannosauridae,Gorgosaurus,-111.549347,50.737015,CA,Alberta


The dataset we have here shows the occurrences of each species of dinosaur, where they were found, and where the fossil sits within the geological time scale. However, rather than showing each individual occurrence, I want to represent each species only once and give a general idea of where and when the dinosaur lived. To do this, we can group all occurrences of a species (and keep a count to maintain the current taxon size) and create a list of the unique locations where the dinosaur was found. Using the same logic, we can also build a more accurate time frame for the species' placement in the geological time scale, based on all occurrences of the fossil.

In [113]:
# Per-species summary: the oldest Max MYA, the youngest Min MYA, and every
# distinct country/state in which the species occurs.
aggs = {'Max MYA': 'max',
        'Min MYA': 'min',
        'Country': 'unique',
        'State': 'unique'
        }

# NOTE: this cell rebinds `occ` with a different schema, so it cannot be re-run
# on its own -- use Run All (or re-run the download cell first).
species_summary = occ.groupby('Species').aggregate(aggs).reset_index()

# The first occurrence row of each species supplies its taxonomy and a single
# coordinate pair. De-duplicating BEFORE the merge keeps the join one-to-one,
# instead of materialising a row per occurrence and discarding all but the first.
first_occurrence = (
    occ.drop(columns=['Max MYA', 'Min MYA', 'Country', 'State'])
       .drop_duplicates(subset='Species')
)
temp_df = species_summary.merge(first_occurrence, how='inner', on='Species', validate='one_to_one')

# .copy() gives `occ` its own data, so columns added to it in later cells are not
# written to a slice of temp_df (which can trigger SettingWithCopyWarning).
occ = temp_df[['Species', 'Genus', 'Order', 'Max MYA', 'Min MYA', 'Country', 'State', 'Longitude', 'Latitude']].copy()


In [114]:

# Subset for a single genus, bound to `testing`; the cell's displayed output is
# still the whole `occ` frame.
testing = occ[occ['Genus'] == 'Tyrannosaurus']
occ

Unnamed: 0,Species,Genus,Order,Max MYA,Min MYA,Country,State,Longitude,Latitude
0,Aardonyx celestae,Aardonyx,NO_ORDER_SPECIFIED,201.4,192.9,[ZA],[Free State],27.824444,-28.466389
1,Abavornis bonaparti,Abavornis,Alexornithiformes,93.9,89.8,[UZ],[Navoi],62.655315,42.117294
2,Abdarainurus barsboldi,Abdarainurus,NO_ORDER_SPECIFIED,86.3,72.1,[MN],[Omngov],103.154999,44.523335
3,Abditosaurus kuehnei,Abditosaurus,NO_ORDER_SPECIFIED,72.1,66.0,[ES],[Cataluña],0.973056,42.159443
4,Abelisaurus comahuensis,Abelisaurus,Theropoda,83.6,72.1,[AR],[Río Negro],-67.982765,-38.761730
...,...,...,...,...,...,...,...,...,...
1581,Zuniceratops christopheri,Zuniceratops,NO_ORDER_SPECIFIED,93.9,89.8,[US],[New Mexico],-108.849998,35.066666
1582,Zuolong salleei,Zuolong,Theropoda,161.5,154.8,[CN],[Xinjiang],88.919884,45.191288
1583,Zuoyunlong huangi,Zuoyunlong,NO_ORDER_SPECIFIED,100.5,83.6,[CN],[Shanxi],112.703056,40.013611
1584,Zupaysaurus rougieri,Zupaysaurus,Theropoda,227.0,208.5,[AR],[La Rioja],-68.066666,-29.850000


In [116]:

# Assign a named geological age to each species from its numeric age range:
# 'Early Interval' comes from Max MYA and 'Late Interval' from Min MYA.
periods = ['Triassic', 'Jurassic', 'Cretaceous']
epochs = ['Early', 'Middle', 'Late']

tri_ages = ['Induan', 'Olenekian', 'Anisian', 'Ladinian', 'Carnian', 'Norian', 'Rhaetian']
jur_ages = ['Hettangian', 'Sinemurian', 'Pliensbachian', 'Toarcian', 'Aalenian', 'Bajocian', 'Bathonian', 'Callovian', 'Oxfordian', 'Kimmeridgian', 'Tithonian']
cre_ages = ['Berriasian', 'Valanginian', 'Hauterivian', 'Barremian', 'Aptian', 'Albian', 'Cenomanian', 'Turonian', 'Coniacian', 'Santonian', 'Campanian', 'Maastrichtian']

# Ordered oldest to youngest.
ages = [*tri_ages, *jur_ages, *cre_ages]

# Age boundaries in millions of years, oldest first. ages[i] covers the range
# age_bounds_mya[i] (inclusive) down to age_bounds_mya[i + 1] (exclusive).
age_bounds_mya = [251.9, 251.2, 247.2, 242, 237, 227, 208.5,                                      # Triassic
                  201.4, 199.5, 192.9, 184.2, 174.7, 170.9, 168.2, 165.3, 161.5, 154.8, 149.2,    # Jurassic
                  145, 139.8, 132.6, 125.77, 121.4, 113, 100.5, 93.9, 89.8, 86.3, 83.6, 72.1,     # Cretaceous
                  66]
assert len(age_bounds_mya) == len(ages) + 1, "every age needs an upper and a lower bound"


def mya_args(x):
    """Build the np.select condition list that places values into `ages`.

    Parameters
    ----------
    x : str or pandas.Series
        A column name of `occ`, or a Series of ages in millions of years.

    Returns
    -------
    list of pandas.Series
        One boolean mask per entry of `ages`, in the same order; mask i is True
        where age_bounds_mya[i + 1] < value <= age_bounds_mya[i].
    """
    values = occ[x] if isinstance(x, str) else x
    return [(values <= older) & (values > younger)
            for older, younger in zip(age_bounds_mya[:-1], age_bounds_mya[1:])]


# Values outside every range get the default. pd.NaT keeps the result an object
# array of strings with a missing marker that fillna() below recognises.
occ['Early Interval'] = np.select(mya_args('Max MYA'), ages, default=pd.NaT)

# A Min MYA sitting exactly on a boundary belongs to the older of the two ages,
# so the value is nudged up slightly before classification. The nudge is applied
# to a temporary Series: adding and then subtracting 0.01 on the column itself is
# not guaranteed to restore the original floating-point values.
boundary_nudge = 0.01
occ['Late Interval'] = np.select(mya_args(occ['Min MYA'] + boundary_nudge), ages, default=pd.NaT)

# Where no late age could be assigned, fall back to the early one.
occ['Late Interval'] = occ['Late Interval'].fillna(occ['Early Interval'])

occ.head()


Unnamed: 0,Species,Genus,Order,Max MYA,Min MYA,Country,State,Longitude,Latitude,Early Interval,Late Interval
0,Aardonyx celestae,Aardonyx,NO_ORDER_SPECIFIED,201.4,192.9,[ZA],[Free State],27.824444,-28.466389,Hettangian,Sinemurian
1,Abavornis bonaparti,Abavornis,Alexornithiformes,93.9,89.8,[UZ],[Navoi],62.655315,42.117294,Turonian,Turonian
2,Abdarainurus barsboldi,Abdarainurus,NO_ORDER_SPECIFIED,86.3,72.1,[MN],[Omngov],103.154999,44.523335,Santonian,Campanian
3,Abditosaurus kuehnei,Abditosaurus,NO_ORDER_SPECIFIED,72.1,66.0,[ES],[Cataluña],0.973056,42.159443,Maastrichtian,Maastrichtian
4,Abelisaurus comahuensis,Abelisaurus,Theropoda,83.6,72.1,[AR],[Río Negro],-67.982765,-38.76173,Campanian,Campanian
