In [1]:
# Data processing of the "Endangered Species" dataset from the U.S. Fish and Wildlife Service (FWS)
# https://www.fws.gov/endangered/species/index.html

# importing packages
import pandas as pd
import lxml

pd.__version__

'1.3.4'

In [2]:
# merging animal and plant data .csv-files
df = pd.concat(map(pd.read_csv, ['../Data/species-listings-by-tax-group-report_plants.csv', '../Data/species-listings-by-tax-group-report_animals.csv']), ignore_index=True)


# add ID column based on dataset identifier(ES <- Endangered Species) and index
df.insert(0, 'ID', 'ES' + df.index.astype(str))


# change column names (spaces to underscore, lower case, "group" to "category"
df = df.rename(columns={"Group": "category"})
df.columns = [c.replace(' ', '_') for c in df.columns]
df.columns = df.columns.str.lower()


# add a column with region names based on "region" number (https://www.fws.gov/endangered/regions/index.html)
dicRegionNames = {1: "Pacific", 2: "Southwest", 3: "Midwest", 4: "Southeast", 5: "Northeast", 6: "Mountain-Prairie", 7: "Alaska", 8: "Pacific Southwest"}
df.insert(4, 'region_name', df['region'].map(dicRegionNames))


# drop all values with special "ESA Listing Status" (72 entries in total)
df = df[df.esa_listing_status != 'Experimental Population, Non-Essential'] #(64 entries)
df = df[df.esa_listing_status != 'Similarity of Appearance (Threatened)'] #(8 entries)


# drop "Where Listed" column from DataFrame
#df = df.drop('WhereListed', 1)

df.head()

Unnamed: 0,id,scientific_name,common_name,where_listed,region_name,region,esa_listing_status,category
0,ES0,Pediocactus knowltonii,Knowlton's cactus,Wherever found,Southwest,2,Endangered,Flowering Plants
1,ES1,Schiedea sarmentosa,No common name,Wherever found,Pacific,1,Endangered,Flowering Plants
2,ES2,Deinandra increscens ssp. villosa,Gaviota Tarplant,Wherever found,Pacific Southwest,8,Endangered,Flowering Plants
3,ES3,Silene lanceolata,No common name,Wherever found,Pacific,1,Endangered,Flowering Plants
4,ES4,Bidens campylotheca ssp. waihoiensis,Ko`oko`olau,Wherever found,Pacific,1,Endangered,Flowering Plants


In [4]:
# define function to collaps duplicates in list 
#lambda df: lang_pred_with_pub(df)
def collaps_to_set(dataFrame):
    
    # build in check if group contains more thans one entry
    if len(dataFrame) <= 1:
        return dataFrame
    
    # collaps where_listed
    # should return a list of the series
    where_listed_list = list(dataFrame['where_listed'].unique())
    # set all rows to the list name
    dataFrame['where_listed'] = '|'.join(where_listed_list)
    
    
    # collaps region
    # should return a list of the series
    region_list = list(dataFrame['region'].unique())
    # convert integers in list to strings
    region_list = [str(i) for i in range(len(region_list))]
    # set all rows to the list name
    dataFrame['region'] = '|'.join(region_list)
    
    
    # collaps region_name
    # should return a list of the series
    region_name_list = list(dataFrame['region_name'].unique())
    # set all rows to the list name
    dataFrame['region_name'] = '|'.join(region_name_list)
    
    
    # collaps esa_listing_status
    # should return a list of the series
    esa_listing_status_list = list(dataFrame['esa_listing_status'].unique())
    # set all rows to the list name
    dataFrame['esa_listing_status'] = '|'.join(esa_listing_status_list)
    
    # return only one row from the data frame
    return_row = dataFrame.iloc[[0],:]
    
    return return_row

In [10]:
# use defined function to collaps duplicates
df = df.groupby('scientific_name').apply(lambda x: collaps_to_set(x)).reset_index(drop=True)

# print an example
df[df['scientific_name'] == 'Chelonia mydas']

Unnamed: 0,id,scientific_name,common_name,where_listed,region_name,region,esa_listing_status,category
235,ES1005,Chelonia mydas,Green sea turtle,North Atlantic DPS|South Atlantic DPS|Central ...,Southeast|Pacific|Pacific Southwest,0|1|2,Threatened|Endangered,Reptile


In [6]:
# create dictionary for category mapping to general categories rooted from the "Biodiversity" dataset
mapping_dic = {'Mammals':'Mammal',
                'Birds':'Bird',
                'Insects':'Insect',
                'Fishes':'Fish',
                'Reptiles':'Reptile',
                'Clams':'Invertebrate',
                'Snails':'Slug/Snail',
                'Arachnids':'Spider/Scorpion',
                'Amphibians':'Amphibian',
                'Crustaceans':'Crab/Lobster/Shrimp',
                'Lichens':'Fungi',
                'Ferns and Allies':'Vascular Plant',
                'Flowering Plants':'Vascular Plant',
                'Conifers and Cycads':'Vascular Plant'        
}

df['category'] = df['category'].map(mapping_dic)

In [7]:
# save DataFrame to .csv
df.to_csv('../Data/species-listings-by-tax-group-report_animals_and_plants_optimized.csv', index = False)

In [8]:
def convert_to_xml_species(row, output_file):
    
    # add species opening tag to xml string
    xml_species = ['<Species>']
    
    # get ID from the row
    ident = row['id']
    #create ID string with attribute value by substituting {} with ident
    xml_species.append('  <ID>{}</ID>'.format(ident))
    
    #create Provenance string with attribute value by substituting {} with ident
    xml_species.append('  <Provenance>{}</Provenance>'.format('FWS-EndangeredSpecies'))

    
    # get scientific name from the row
    scientific_name = row['scientific_name']
    #create scientific_name string with attribute value by substituting {} with scientific_name
    xml_species.append('  <Scientific_Name>{}</Scientific_Name>'.format(scientific_name))
    
    # get common name from the row
    common_name = row['common_name']
    #create common_name string with attribute value by substituting {} with common_name
    xml_species.append('  <Common_Names>')
    xml_species.append('    <Common_Name>{}</Common_Name>'.format(common_name))
    xml_species.append('  </Common_Names>')

    
    # cast list string back to a list object
    # and check if attribute is of type string, if not cast to string
    if not isinstance(row['where_listed'], str):
        where_listed_list = str(row['where_listed']).split('|')
    else:
        where_listed_list = row['where_listed'].split('|')
    ## here the conversion to XML of the list attributes
    xml_species.append('  <Where_Listeds>')
    for i in range(len(where_listed_list)):
        # get where_listed from the list element
        where_listed = where_listed_list[i]
        #create common_name string with attribute value by substituting {} with common_name
        xml_species.append('    <Where_Listed>{}</Where_Listed>'.format(where_listed))
    xml_species.append('  </Where_Listeds>')
        
    # cast list string back to a list object
    # and check if attribute is of type string, if not cast to string
    if not isinstance(row['region'], str):
        region_list = str(row['region']).split('|')
    else:
        region_list = row['region'].split('|')
    ## here the conversion to XML of the list attributes
    xml_species.append('  <Regions>')
    for i in range(len(region_list)):
        # get where_listed from the list element
        region = region_list[i]
        #create common_name string with attribute value by substituting {} with region
        xml_species.append('    <Region>{}</Region>'.format(region))
    xml_species.append('  </Regions>')
      
    # cast list string back to a list object
    region_name_list = row['region_name'].split('|')
    ## here the conversion to XML of the list attributes
    xml_species.append('  <Region_Names>')
    for i in range(len(region_name_list)):
        # get where_listed from the list element
        region_name = region_name_list[i]
        #create common_name string with attribute value by substituting {} with region_name
        xml_species.append('    <Region_Name>{}</Region_Name>'.format(region_name))
    xml_species.append('  </Region_Names>')
   
    # cast list string back to a list object
    esa_listing_status_list = row['esa_listing_status'].split('|')
    ## here the conversion to XML of the list attributes
    xml_species.append('  <Listing_Statuses>')
    for i in range(len(esa_listing_status_list)):
        # get where_listed from the list element
        esa_listing_status = esa_listing_status_list[i]
        #create common_name string with attribute value by substituting {} with region_name
        xml_species.append('    <Listing_Status>{}</Listing_Status>'.format(esa_listing_status))
    xml_species.append('  </Listing_Statuses>')
    
    # get category from the row
    category = row['category']
    #create category string with attribute value by substituting {} with category
    xml_species.append('<Category>{}</Category>'.format(category))
    
    # add species closing tag to xml string
    xml_species.append('</Species>')
  
    species_element = '\n'.join(xml_species)
    #print(species_element)
    
    # open output file
    f = open(output_file, "a")
    # write to output file
    f.write(species_element + "\n")
    
    return row

In [11]:
#df.apply(lambda row: convert_to_xml_species(row, output_file), axis =1)

In [9]:
# define the .xml destination file
species_xml = "../xml/endangered_species.xml"

# open file with writing, thus the current file is overwritten
f = open(species_xml, "w")
# write the start of the XML document
f.write("<?xml version='1.0' encoding='utf-8'?>\n<Animals_And_Plants>\n")
f.close()

# run the function and give it the destination file as attribute
# it will append any species elements to the file, without overwriting the existing information
df.apply(lambda row: convert_to_xml_species(row, species_xml), axis =1)

# append the closing tag of the XML to the document
f = open(species_xml, "a")
f.write("</Animals_And_Plants>")
f.close()