#### Paleo Analysis and Parsing of the Paleontological Database
Data extracted from *The Paleobiology Database*															


In [91]:
# library imports
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib as mpl
import requests as rq
import io

In [92]:
# Scraping data from the Paleobiology Database and cleaning up the Null values
url = rq.get('https://paleobiodb.org/data1.2/occs/list.csv?base_name=Dinosauria&taxon_reso=species&idqual=certain&pres=regular&max_ma=252&min_ma=65&show=class,coords,loc,acconly').content
occ = pd.read_csv(io.StringIO(url.decode('utf-8'))).drop(columns=['occurrence_no', 'record_type', 'reid_no', 'flags', 'collection_no', 'accepted_rank', 'accepted_no', 'reference_no', 'phylum', 'class', 'county', 'latlng_basis', 'latlng_precision', 'geogscale', 'geogcomments'])
occ.columns = ['Name', 'Early Interval', 'Late Interval', 'Max MYA', 'Min MYA', 'Order', 'Family', 'Genus', 'Longitude', 'Latitude', 'Country', 'State'] # County may need to be added later
#occ = occ[['Name', 'Genus', 'Family', 'Order', 'Max MYA', 'Min MYA', 'Early Interval', 'Late Interval', 'Country', 'State']]
occ['Late Interval'] = occ['Late Interval'].fillna(occ['Early Interval'])

# USE THIS BLOCK TO CHANGE THE VALUES FOR WHEN FAMILY OR ORDER IS UNKNOWN
#occ.loc[occ['Family'] == 'NO_FAMILY_SPECIFIED', 'Family'] = 'Unknown Family'
#occ.loc[occ['Order'] == 'NO_ORDER_SPECIFIED', 'Order'] = 'Unknown Order'


# Note that we use the GRE operator because the conditions do not coincide with each other
conditions = [
    (occ['Max MYA'] <= 251.9) & (occ['Min MYA'] >= 201.4), 
    (occ['Max MYA'] <= 201.4) & (occ['Min MYA'] >= 145.0),
    (occ['Max MYA'] <= 145.0) & (occ['Min MYA'] >= 66.0)
]

eras = ['Triassic', 'Jurassic', 'Cretaceous']

occ['Era'] = np.select(conditions, eras, default=pd.NaT) # Still need to address the cases where a species lived within to time periods (Late Jurassic to Early Cretaceous)
occ.head()

Unnamed: 0,Name,Early Interval,Late Interval,Max MYA,Min MYA,Order,Family,Genus,Longitude,Latitude,Country,State,Era
0,Chaoyangsaurus youngi,Late Kimmeridgian,Valanginian,152.2,132.6,NO_ORDER_SPECIFIED,Chaoyangsauridae,Chaoyangsaurus,123.966698,42.9333,CN,Liaoning,NaT
1,Protarchaeopteryx robusta,Late Barremian,Early Aptian,125.77,119.5,Theropoda,NO_FAMILY_SPECIFIED,Protarchaeopteryx,120.73333,41.799999,CN,Liaoning,Cretaceous
2,Caudipteryx zoui,Late Barremian,Early Aptian,125.77,119.5,Theropoda,NO_FAMILY_SPECIFIED,Caudipteryx,120.73333,41.799999,CN,Liaoning,Cretaceous
3,Gorgosaurus libratus,Late Campanian,Late Campanian,83.6,72.1,Theropoda,Tyrannosauridae,Gorgosaurus,-111.528732,50.740726,CA,Alberta,Cretaceous
4,Gorgosaurus libratus,Late Campanian,Late Campanian,83.6,72.1,Theropoda,Tyrannosauridae,Gorgosaurus,-111.549347,50.737015,CA,Alberta,Cretaceous


The dataset we have here shows the occurrences of each species of dinosaur, where they were found, and where the fossil sits within the geological time scale. However, rather than showing each individual occurrence, I want to represent each species only once and give a general idea of where and when the dinosaur lived. To do this, we can grab each occurrence of a fossil (and keep a count to maintain the current taxon size) and create a list of unique locations that where the dinosaur was found. Using this same logic, we can also create a more accurate time frame of the geological time scale placement of the species, based on all occurrences of the fossil.

In [93]:
aggs = {'Max MYA': 'max',
        'Min MYA': 'min',
        'Early Interval': 'unique',
        'Late Interval': 'unique',
        'Country': 'unique',
        'State': 'unique',
        'Era': 'unique'}

# Make sure to Run All since we drop the columns
temp_df = occ.groupby(occ['Name']).aggregate(aggs).reset_index()
occ = occ.drop(columns=['Max MYA', 'Min MYA', 'Early Interval', 'Late Interval', 'Country', 'State', 'Era'])
occ = occ.merge(temp_df, how='inner')
occ.head()

# Looking at the dataframe, we need to clean the early interval and late interval 
# --> Might need to make another np.select() call and manually assign geological ages using a premade list and condition

Unnamed: 0,Name,Order,Family,Genus,Longitude,Latitude,Max MYA,Min MYA,Early Interval,Late Interval,Country,State,Era
0,Chaoyangsaurus youngi,NO_ORDER_SPECIFIED,Chaoyangsauridae,Chaoyangsaurus,123.966698,42.9333,152.2,132.6,[Late Kimmeridgian],[Valanginian],[CN],[Liaoning],[NaT]
1,Protarchaeopteryx robusta,Theropoda,NO_FAMILY_SPECIFIED,Protarchaeopteryx,120.73333,41.799999,125.77,119.5,[Late Barremian],[Early Aptian],[CN],[Liaoning],[Cretaceous]
2,Caudipteryx zoui,Theropoda,NO_FAMILY_SPECIFIED,Caudipteryx,120.73333,41.799999,125.77,119.5,[Late Barremian],[Early Aptian],[CN],[Liaoning],[Cretaceous]
3,Gorgosaurus libratus,Theropoda,Tyrannosauridae,Gorgosaurus,-111.528732,50.740726,83.6,72.1,"[Late Campanian, Campanian, Middle Campanian]","[Late Campanian, Campanian]","[CA, US]","[Alberta, Montana]",[Cretaceous]
4,Gorgosaurus libratus,Theropoda,Tyrannosauridae,Gorgosaurus,-111.549347,50.737015,83.6,72.1,"[Late Campanian, Campanian, Middle Campanian]","[Late Campanian, Campanian]","[CA, US]","[Alberta, Montana]",[Cretaceous]
