In [1]:
# Data processing of the "FWS-Listed U.S. Species by Taxonomic Group" dataset
# https://www.fws.gov/endangered/species/index.html

# importing packages
import pandas as pd
import lxml

pd.__version__

'1.1.3'

In [9]:
# merging .csv files
df = pd.concat(map(pd.read_csv, ['../Data/species-listings-by-tax-group-report_plants.csv', '../Data/species-listings-by-tax-group-report_animals.csv']), ignore_index=True)
#df.index.name='ID'

# change column names (spaces to underscore, lower case, "group" to "category"
df = df.rename(columns={"Group": "category"})
df.columns = [c.replace(' ', '_') for c in df.columns]
df.columns = df.columns.str.lower()

# add a column with region names based on "region" number
dicRegionNames = {1: "Pacific", 2: "Southwest", 3: "Midwest", 4: "Southeast", 5: "Northeast", 6: "Mountain-Prairie", 7: "Alaska", 8: "Pacific Southwest"}
df.insert(4, 'region_name', '')
df['region_name'] = df['region'].map(dicRegionNames)

# dropping all values with special "ESA Listing Status" (72 entries in total)
df = df[df.esa_listing_status != 'Experimental Population, Non-Essential'] #(64 entries)
df = df[df.esa_listing_status != 'Similarity of Appearance (Threatened)'] #(8 entries)

# merge duplicates based on "scientific_name"
#df = df.groupby([df.scientific_name])['where_listed'].apply(', '.join).reset_index()
print(df.head())

# drop duplicates based on "scientific_name", but keep the first entry
#df.drop_duplicates(subset = ['scientific_name'], keep = 'first')

# drop "Where Listed" column from DataFrame
#df = df.drop('WhereListed', 1)



                        scientific_name        common_name    where_listed  \
0                Pediocactus knowltonii  Knowlton's cactus  Wherever found   
1                   Schiedea sarmentosa     No common name  Wherever found   
2     Deinandra increscens ssp. villosa   Gaviota Tarplant  Wherever found   
3                     Silene lanceolata     No common name  Wherever found   
4  Bidens campylotheca ssp. waihoiensis        Ko`oko`olau  Wherever found   

   region        region_name esa_listing_status          category  
0       2          Southwest         Endangered  Flowering Plants  
1       1            Pacific         Endangered  Flowering Plants  
2       8  Pacific Southwest         Endangered  Flowering Plants  
3       1            Pacific         Endangered  Flowering Plants  
4       1            Pacific         Endangered  Flowering Plants  


In [10]:
# define function to collaps duplicates in list 
#lambda df: lang_pred_with_pub(df)
def collaps_to_set(dataFrame):
    
    # build in check if group contains more thans one entry
    if len(dataFrame) <= 1:
        return dataFrame
    
    # collaps where_listed
    # should return a list of the series
    where_listed_list = list(dataFrame['where_listed'].unique())
    
    # set all rows to the list name
    dataFrame['where_listed'] = str(where_listed_set)
    
    
    # collaps region
    # should return a list of the series
    where_listed_list = list(dataFrame['region'].unique())
    
    # set all rows to the list name
    dataFrame['region'] = str(where_listed_set)
    
    
    # collaps region_name
    # should return a list of the series
    where_listed_list = list(dataFrame['region_name'].unique())
    
    # set all rows to the list name
    dataFrame['region_name'] = str(where_listed_set)
    
    
    # collaps esa_listing_status
    # should return a list of the series
    where_listed_list = list(dataFrame['esa_listing_status'].unique())
    
    # set all rows to the list name
    dataFrame['esa_listing_status'] = str(where_listed_set)
    
    
    # return only one row from the data frame
    return_row = dataFrame.iloc[[0],:]
    
    return return_row


In [11]:
df = df.groupby('scientific_name').apply(lambda x: collaps_to_set(x))

In [12]:
df[df['scientific_name'] == 'Chelonia mydas']

Unnamed: 0_level_0,Unnamed: 1_level_0,scientific_name,common_name,where_listed,region,region_name,esa_listing_status,category
scientific_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Chelonia mydas,1005,Chelonia mydas,Green sea turtle,"['North Atlantic DPS', 'South Atlantic DPS', '...","[4, 1, 8]","['Southeast', 'Pacific', 'Pacific Southwest']","['Threatened', 'Endangered']",Reptiles


In [17]:
# save DataFrame to .csv
df.to_csv('../Data/species-listings-by-tax-group-report_animals_and_plants_optimized.csv', index = False)

In [18]:
# save DataFrame to .xml
#df.to_xml('../xml/endangered-species.xml', index=False, root_name='animals_and_plants', row_name='species')
print(df.to_xml(index=False, root_name='animals_and_plants', row_name='species'))

AttributeError: 'DataFrame' object has no attribute 'to_xml'