# Create aggregated checklist

In [1]:
import pandas as pd
from collections import OrderedDict

In [2]:
# Load the verified checklist; dtype=object disables type inference for every column
data = pd.read_table(
    '../data/interim/verified-checklist.tsv',
    dtype=object,
)

In [3]:
# Preview the first rows of the verified checklist
data.head()

Unnamed: 0,index,nameMatchValidation,taxonID,datasetName,taxonRank,scientificName,verbatimScientificName,synonym,kingdom,phylum,...,notes,manual_acceptedKey,gbifapi_usageKey,gbifapi_scientificName,gbifapi_canonicalName,gbifapi_status,gbifapi_rank,gbifapi_matchType,gbifapi_acceptedKey,gbifapi_acceptedScientificName
0,0,ok,,fishes,species,Acipenser baerii,Acipenser baeri,,Animalia,,...,,,4287131,"Acipenser baerii Brandt, 1869",Acipenser baerii,ACCEPTED,SPECIES,EXACT,4287131,"Acipenser baerii Brandt, 1869"
1,1,ok,,fishes,species,Acipenser gueldenstaedtii,Acipenser guldenstaedti,,Animalia,,...,,,2402129,"Acipenser gueldenstaedtii Brandt & Ratzeburg, ...",Acipenser gueldenstaedtii,ACCEPTED,SPECIES,EXACT,2402129,"Acipenser gueldenstaedtii Brandt & Ratzeburg, ..."
2,2,ok,,fishes,species,Acipenser ruthenus,Acipenser ruthenus,,Animalia,,...,,,2402168,"Acipenser ruthenus Linnaeus, 1758",Acipenser ruthenus,ACCEPTED,SPECIES,EXACT,2402168,"Acipenser ruthenus Linnaeus, 1758"
3,3,ok,,fishes,species,Ameiurus nebulosus,Ameiurus nebulosus,,Animalia,,...,,,2340989,"Ameiurus nebulosus (Lesueur, 1819)",Ameiurus nebulosus,ACCEPTED,SPECIES,EXACT,2340989,"Ameiurus nebulosus (Lesueur, 1819)"
4,4,ok: SYNONYM confirmed,,fishes,species,Aspius aspius,Aspius aspius,,Animalia,,...,,,2360181,"Aspius aspius (Linnaeus, 1758)",Aspius aspius,SYNONYM,SPECIES,EXACT,5851603,"Leuciscus aspius (Linnaeus, 1758)"


## Retrieve valid records only

Keep only the records whose `nameMatchValidation` starts with `ok`. This excludes wrong matches, names without a match and unverified synonyms.

In [4]:
# Flag the rows whose validation status starts with 'ok'; a missing status counts as not ok (na=False)
is_validated = data['nameMatchValidation'].str.contains('^ok', regex=True, na=False)
valid_records = data[is_validated]

In [5]:
# Number of valid records that have a non-missing 'index' value
valid_records['index'].count()

2646

## Aggregate and sort on gbifapi_acceptedScientificName & collect unique values for some columns

In [6]:
def split_delimited_values(series, delimiter='|'):
    """Split delimited string values of a series into one value per row.

    Values are lowercased, split on ``delimiter`` and stripped of surrounding
    whitespace. Every resulting value keeps the index label of the row it came
    from, so a label is repeated when its row held several values. Missing
    values are dropped from the result.

    Parameters
    ----------
    series : pandas.Series
        Series of strings; may contain NaN values.
    delimiter : str, default '|'
        Separator between the values within a single string.

    Returns
    -------
    pandas.Series
        The split values, or ``series`` itself (unchanged) when it contains
        nothing but NaN values.
    """
    # Only split if the series contains more than NaN values, otherwise just hand it back
    if len(series.dropna()) == 0:
        return series

    # explode() turns every list produced by str.split() into separate rows that
    # share the original index label, without building a DataFrame row by row
    pieces = series.str.lower().str.split(delimiter).explode()

    # Rows that were NaN before splitting are still NaN here: drop them
    return pieces.dropna().str.strip()

In [7]:
def get_unique_values(series):
    """Return the distinct non-missing values of a series as a set.

    NaN values are removed first; the remaining values are collected into an
    (unordered) set, so duplicates collapse to a single entry.

    A set is returned rather than a list: according to the original author's
    note, returning a list from a groupby aggregation fails with
    'Function does not reduce' (http://stackoverflow.com/a/37955931).
    Sorting is therefore left to a later step.
    """
    non_missing = series.dropna()
    return {value for value in non_missing}

In [8]:
# The default dict syntax for aggregation does not preserve column order, hence the OrderedDict.
# Columns listed here are split on | before their unique values are collected.
pipe_delimited_columns = {'introductionPathway'}

unique_values_per_column = OrderedDict(
    (
        column,
        (lambda x: get_unique_values(split_delimited_values(x)))
        if column in pipe_delimited_columns
        else (lambda x: get_unique_values(x)),
    )
    for column in [
        'kingdom',
        'datasetName',
        'firstObservationYearBE',
        'firstObservationYearFL',
        'invasionStage',
        'habitat',
        'nativeRange',
        'introductionPathway',
        'presenceBE',
        'presenceFL',
        'presenceWA',
        'presenceBR',
        'gbifapi_scientificName',
        'index',
    ]
)

In [9]:
# One row per accepted name + key; every other column is reduced with its function from unique_values_per_column
grouping_columns = ['gbifapi_acceptedScientificName', 'gbifapi_acceptedKey']
aggregated_records = (
    valid_records
    .groupby(grouping_columns)
    .agg(unique_values_per_column)
    .reset_index()
)

In [10]:
# Sort the aggregated rows on the accepted scientific name
aggregated_records = aggregated_records.sort_values('gbifapi_acceptedScientificName')

In [11]:
# Preview the aggregated records; the aggregated columns hold sets at this point
aggregated_records.head()

Unnamed: 0,gbifapi_acceptedScientificName,gbifapi_acceptedKey,kingdom,datasetName,firstObservationYearBE,firstObservationYearFL,invasionStage,habitat,nativeRange,introductionPathway,presenceBE,presenceFL,presenceWA,presenceBR,gbifapi_scientificName,index
0,Abies alba Mill.,2685484,{Plantae},{plants},{2008},{},{introduced},{to be determined by experts},{Europe},{escape > horticulture},{present},{present},{absent},{absent},{Abies alba Mill.},{1778}
1,Abies grandis (Douglas ex D. Don) Lindl.,2685361,{Plantae},{plants},{2009},{},{introduced},{to be determined by experts},{N. America},{escape > horticulture},{present},{present},{absent},{absent},{Abies grandis (Douglas ex D. Don) Lindl.},{1779}
2,Abronia fragrans Nutt. ex Hook.,5384889,{Plantae},{plants},{1951},{},{introduced},{to be determined by experts},{N. America},{escape > horticulture},{present},{present},{absent},{absent},{Abronia fragrans Nutt.},{1659}
3,Abutilon theophrasti Medik.,3152614,{Plantae},{plants},{<1929},{},{unknown},{to be determined by experts},{Europe | Asia},{contaminant > seed contaminant},{present},{present},{present},{present},{Abutilon theophrasti Medik.},{1610}
4,Acalypha indica L.,3056259,{Plantae},{plants},{2003},{},{introduced},{to be determined by experts},{Africa | Asia},{contaminant > contaminant nursery material},{present},{present},{absent},{absent},{Acalypha indica L.},{1221}


## Sort and concatenate unique values per column

In [12]:
def sort_and_concatenate(aSet, sortAs=str):
    """Sort the values of a collection and join them into one ' | ' separated string.

    ``sortAs`` is used as the sort key, which allows a more specific ordering.
    For example {9, 200, 12} becomes:

    - '12 | 200 | 9' with the default ``str`` (text order)
    - '9 | 12 | 200' with ``int`` (numeric order)

    Returns a string; an empty collection gives an empty string.
    """
    ordered_values = list(aSet)
    ordered_values.sort(key=sortAs)
    return ' | '.join(map(str, ordered_values))

In [13]:
# Sort key per aggregated column: text order for everything except 'index',
# which holds row numbers and is therefore sorted numerically
sort_key_per_column = [
    ('kingdom', str),
    ('datasetName', str),
    ('firstObservationYearBE', str),
    ('firstObservationYearFL', str),
    ('invasionStage', str),
    ('habitat', str),
    ('nativeRange', str),
    ('introductionPathway', str),
    ('presenceBE', str),
    ('presenceFL', str),
    ('presenceWA', str),
    ('presenceBR', str),
    ('gbifapi_scientificName', str),
    ('index', int),
]

for column, sort_key in sort_key_per_column:
    # Only sets are converted. This keeps the cell safe to re-run: an already
    # concatenated string would otherwise be sorted character by character.
    aggregated_records[column] = aggregated_records[column].apply(
        lambda values, sort_key=sort_key: (
            sort_and_concatenate(values, sort_key)
            if isinstance(values, (set, frozenset))
            else values
        )
    )

In [14]:
# Preview the aggregated records after the sets were turned into ' | ' separated strings
aggregated_records.head()

Unnamed: 0,gbifapi_acceptedScientificName,gbifapi_acceptedKey,kingdom,datasetName,firstObservationYearBE,firstObservationYearFL,invasionStage,habitat,nativeRange,introductionPathway,presenceBE,presenceFL,presenceWA,presenceBR,gbifapi_scientificName,index
0,Abies alba Mill.,2685484,Plantae,plants,2008,,introduced,to be determined by experts,Europe,escape > horticulture,present,present,absent,absent,Abies alba Mill.,1778
1,Abies grandis (Douglas ex D. Don) Lindl.,2685361,Plantae,plants,2009,,introduced,to be determined by experts,N. America,escape > horticulture,present,present,absent,absent,Abies grandis (Douglas ex D. Don) Lindl.,1779
2,Abronia fragrans Nutt. ex Hook.,5384889,Plantae,plants,1951,,introduced,to be determined by experts,N. America,escape > horticulture,present,present,absent,absent,Abronia fragrans Nutt.,1659
3,Abutilon theophrasti Medik.,3152614,Plantae,plants,<1929,,unknown,to be determined by experts,Europe | Asia,contaminant > seed contaminant,present,present,present,present,Abutilon theophrasti Medik.,1610
4,Acalypha indica L.,3056259,Plantae,plants,2003,,introduced,to be determined by experts,Africa | Asia,contaminant > contaminant nursery material,present,present,absent,absent,Acalypha indica L.,1221


## Write aggregated records to file

In [15]:
# Write the aggregated checklist as a tab-separated file, leaving out the DataFrame index
aggregated_records.to_csv(
    '../data/processed/aggregated-checklist.tsv',
    sep='\t',
    index=False,
)