In [1]:
#import statements
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import datetime as dt
import math
import json

# All data paths are rooted at the current working directory. Each directory
# string ends with "/" so that a file name can be appended to it as is.
here = os.getcwd()
flora_h = here + "/data/florahelvetica/"
flora_h_ws = flora_h + "atlasws/"
flora_h_55 = flora_h + "atlas5x5/"
data_2020 = here + "/data/2020/"

## Harmonize taxonomy and aggregate results from different data sources

The data for the observations is collected by different groups and aggregated at different geographic scales. Here all the data is categorized by *genus-species*, the subspecies are accounted for and the survey results from 2020 are harmonized with the prior observations.

In [2]:
# reader callables keyed by file type
my_data_methods = {"csv": pd.read_csv}

# organize the data sources into dicts, one for each directory:
# label -> file name
d_files = {
    "surveys": "2020datasimp.csv",
    "map_keys": "map-keys-2020.csv",
}

w_lists = {
    "list_2014": "BL_WL_2014_modified.csv",
    "under_sampled": "taxa_sous_echantillonnes.csv",
    "red_list": "CH-RLreg_Tracheophyta_2019.csv",
    "gbif": "gbif_species_list.csv",
}

# (label, number used in the file name, place name used in the file name)
ws_lists = {
    label: F"AtlasWS_{number}_{place}.csv"
    for label, number, place in [
        ("biel", 151, "Biel"),
        ("bielersee", 253, "Bielersee"),
        ("aarberg", 300, "Aarberg"),
        ("buren", 301, "Bueren"),
        ("grenchen", 154, "Grenchen"),
        ("beatenberg", 572, "Beatenberg"),
        ("interlaken", 573, "Interlaken"),
    ]
}

# the label is the two numbers of the file name written without a separator
fx_lists = {
    F"{first}{second}": F"Atlas5x5_{first}_{second}.csv"
    for first, second in [
        ("585", "215"),
        ("585", "220"),
        ("580", "220"),
        ("580", "215"),
    ]
}

# convenience method to gather up all these files:
def get_the_data(file_exts, a_dir, methods, this_method="csv", myencoding=None):
    """Read every file named in ``file_exts`` from ``a_dir``.

    Parameters
    ----------
    file_exts : dict
        Maps a label to a file name inside ``a_dir``.
    a_dir : str
        Directory prefix. The file name is appended to it as is, so it must
        already end with a path separator.
    methods : dict
        Maps a method name to a reader callable that takes the file path.
    this_method : str, default "csv"
        Key of the reader in ``methods`` to use.
    myencoding : str or None, default None
        When given, the reader is also called with ``sep=";"`` and this
        encoding. When ``None`` the reader is called with the path only.

    Returns
    -------
    dict
        Maps each label of ``file_exts`` to what the reader returned for it.
    """
    reader = methods[this_method]
    # the ";" separator is only applied together with an explicit encoding
    reader_kwargs = {} if myencoding is None else {"sep": ";", "encoding": myencoding}
    return {
        label: reader(F"{a_dir}{file_name}", **reader_kwargs)
        for label, file_name in file_exts.items()
    }

# convenience method to make slugs
def change_species(x):
    """Return a ``genus-species`` slug for a species name.

    The first two whitespace-separated words of ``x`` are joined with "-"
    and lower-cased, e.g. "Trifolium incarnatum L." -> "trifolium-incarnatum".
    A value that is not a string (for example the NaN of a missing entry)
    yields the placeholder "none".
    """
    # explicit type check instead of a bare except, which would also hide
    # unrelated errors and keyboard interrupts
    if not isinstance(x, str):
        return "none"
    return "-".join(x.split()[:2]).lower()


data_and_keys = get_the_data(d_files, data_2020, my_data_methods, this_method="csv")
watch_lists = get_the_data(w_lists, flora_h, my_data_methods, this_method="csv")
welt_sut = get_the_data(ws_lists, flora_h_ws, my_data_methods, this_method="csv", myencoding="utf-16")
fivex = get_the_data(fx_lists, flora_h_55, my_data_methods, this_method="csv", myencoding="utf-16")

# housekeeping: make sure that each data set has the column "species"
# these two files carry the species name in the column "art"
for ws_label in ("aarberg", "buren"):
    welt_sut[ws_label]["species"] = welt_sut[ws_label]["art"]

# each of these watch lists carries it under its own column name
for wl_label, name_column in (
    ("list_2014", "Latin"),
    ("under_sampled", "taxon"),
    ("red_list", "scientific_name"),
):
    watch_lists[wl_label]["species"] = watch_lists[wl_label][name_column]



# housekeeping: the frame fivex['585220'] (file key '585220' of fx_lists) is
# repaired below by splitting the string held in its first column on commas;
# work on a copy of it
adf = fivex['585220'].copy()

# get the string value, split it by the comma and make a dict
def make_this_a_dict(x, these_cols):
    """Split ``x`` on "," and key the n-th piece by the n-th entry of ``these_cols``."""
    record = {}
    for position, piece in enumerate(x.split(',')):
        record[these_cols[position]] = piece
    return record

# parse the string in the first column of every row into a dict keyed by
# the columns of fivex['585215']
adf['a_dict'] = adf.iloc[:, 0].map(lambda raw: make_this_a_dict(raw, fivex['585215'].columns))

# rebuild the data frame from those dicts and replace the old data frame:
fivex['585220'] = pd.DataFrame(adf['a_dict'].tolist())

In [3]:
# make the species slug for every data set
# (no placeholder column is created first: the assignment below creates it)
for element in [fivex, welt_sut, watch_lists]:
    for the_data in element:
        element[the_data]['species_slug'] = element[the_data]['species'].map(change_species)

# add a column to identify the map source:
for element in [fivex, welt_sut]:
    for the_data in element:
        element[the_data]['map'] = the_data
        # ['map'] rather than .map: attribute access resolves to the
        # DataFrame.map method on pandas versions that define it, not to the column
        element[the_data]['spec_map'] = list(
            zip(element[the_data]['species_slug'], element[the_data]['map'])
        )

# add a column to identify watch list:
for the_data in watch_lists:
    if the_data == "list_2014":
        # this list takes its label per row from its own column "list_2014"
        watch_lists[the_data]['watch_list'] = watch_lists[the_data][the_data]
    else:
        watch_lists[the_data]['watch_list'] = the_data
        

### Species name and observations: harmonizing taxonomy

The genus-species nomenclature will be used to group observations.

All observations will be classified according to that standard. As a result some subspecies will be folded in with the parent genus-species. This is a reflection of the survey method and the expectation of reasonable results, not a prioritization of importance. The complete list of genus species and subspecies is included at the end of this document.

In [4]:
def account_for_subspecies(an_array, a_dict):
    """Collect the second item of every row under the row's first item.

    Parameters
    ----------
    an_array : iterable of indexable rows
        For each row, ``row[0]`` is the key and ``row[1]`` the value to collect.
    a_dict : dict
        Maps a key to the list of values collected so far. It is updated in
        place, so repeated calls accumulate into the same dict.

    Returns
    -------
    dict
        The same ``a_dict`` object.
    """
    for element in an_array:
        # setdefault starts the list on first sight of a key; no bare except
        # is needed to tell "missing key" apart from other failures
        a_dict.setdefault(element[0], []).append(element[1])
    return a_dict
a_dict = {}

for element in [fivex, welt_sut, watch_lists]:
    for label in element:
        # use this data frame
        som_data = element[label].copy()

        # number of distinct species values behind each species slug
        c_s_p_s = som_data.groupby('species_slug', as_index=False)['species'].nunique()
        # just the slugs with more than one species value
        m_t_one = c_s_p_s.loc[c_s_p_s['species'] > 1, 'species_slug']
        # pair the species_slug to the species name:
        mto = som_data.loc[som_data['species_slug'].isin(m_t_one), ['species_slug', 'species']].to_numpy()

        # update the dict
        account_for_subspecies(mto, a_dict)

# the species_slugs that account for more than one sub species,
# each with its distinct species names
sub_species_accounted = {slug: list(set(names)) for slug, names in a_dict.items()}

# the species_slugs
gs_parent = list(sub_species_accounted)

# the number of species_slugs
number_of_gs = len(gs_parent)

# the number of sub species accounted for
number_of_ss = sum(len(names) for names in sub_species_accounted.values())

In [5]:
# report how many slugs group several species names and show the names
# grouped under one of them
print(F"\nThere are {number_of_gs} genus-species that account for {number_of_ss} sub-species. For example records with 'trifolium-incarnatum' include:\n\n{sub_species_accounted['trifolium-incarnatum']}\n")
print("In this specific example all records with the genus-species of 'trifolium-incarnatum' will be counted as the same, indifferent of the sub-species value.")


There are 323 genus-species that account for 1156 sub-species. For example records with 'trifolium-incarnatum' include:

['Trifolium incarnatum L.', 'Trifolium incarnatum L. subsp. incarnatum']

In this specific example all records with the genus-species of 'trifolium-incarnatum' will be counted as the same, indifferent of the sub-species value.


### Collect genus-species observations from each map set.

In [6]:
# collect all the observations into one df: first the frames of fivex,
# then the frames of welt_sut, keeping only the three identifying columns
fx = pd.concat([frame[['species_slug', 'map', 'spec_map']] for frame in fivex.values()])
wsx = pd.concat([frame[['species_slug', 'map', 'spec_map']] for frame in welt_sut.values()])
f_w_obs = pd.concat([fx, wsx])

In [7]:
# gather up the map names
# (['map'] rather than .map: attribute access resolves to the DataFrame.map
# method on pandas versions that define it, not to the column)
m_ap_columns = f_w_obs['map'].unique()

# create a column for each map, True where the record comes from that map
for col in m_ap_columns:
    f_w_obs[col] = f_w_obs['map'] == col

# per slug, count the records on each map. numeric_only keeps the boolean
# indicator columns and leaves out the text columns 'map' and 'spec_map',
# which newer pandas would otherwise try to add up as well
obs_map = f_w_obs.groupby(['species_slug']).sum(numeric_only=True)

# per slug, the maps it was recorded on, in order of first appearance;
# one grouped pass instead of filtering the whole frame once per slug
obs_map['maps'] = f_w_obs.groupby('species_slug')['map'].unique()

#### The genus-species and the maps it was identified on:

In [8]:
# display the per-slug map counts together with the 'maps' column
obs_map

Unnamed: 0_level_0,585215,585220,580220,580215,biel,bielersee,aarberg,buren,grenchen,beatenberg,interlaken,maps
species_slug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
abies-alba,1,1,1,1,1,0,1,1,1,1,1,"[585215, 585220, 580220, 580215, biel, aarberg..."
abies-nordmanniana,0,0,0,0,0,0,0,0,1,0,0,[grenchen]
abutilon-theophrasti,1,1,0,0,1,0,1,1,0,0,0,"[585215, 585220, biel, aarberg, buren]"
acanthus-mollis,0,0,0,0,1,0,0,0,0,0,0,[biel]
acer-campestre,1,1,1,1,1,1,1,1,1,1,1,"[585215, 585220, 580220, 580215, biel, bielers..."
...,...,...,...,...,...,...,...,...,...,...,...,...
xanthium-spinosum,0,0,1,0,1,0,0,0,0,0,0,"[580220, biel]"
xanthium-strumarium,2,2,2,0,2,0,2,0,2,0,0,"[585215, 585220, 580220, biel, aarberg, grenchen]"
yucca-filamentosa,0,0,0,0,0,0,0,0,1,0,0,[grenchen]
zannichellia-palustris,0,0,1,1,0,1,1,1,0,1,0,"[580220, 580215, bielersee, aarberg, buren, be..."


In [9]:
# show the 'maps' entry of a single slug
print(F"\nFor example 'abies-alba' was identified in the following maps:\n\n{obs_map.loc['abies-alba']['maps']}\n")


For example 'abies-alba' was identified in the following maps:

['585215' '585220' '580220' '580215' 'biel' 'aarberg' 'buren' 'grenchen'
 'beatenberg' 'interlaken']



In [10]:
# plain strings: neither message has a placeholder to format
print("\nBack to the trifolium example:\n")
print("Not all subspecies were idenitfied on all the maps\n")
# the row of counts per map for this slug
obs_map.loc['trifolium-incarnatum']


Back to the trifolium example:

Not all subspecies were idenitfied on all the maps



585215                                                        2
585220                                                        2
580220                                                        0
580215                                                        1
biel                                                          2
bielersee                                                     0
aarberg                                                       2
buren                                                         0
grenchen                                                      2
beatenberg                                                    0
interlaken                                                    0
maps          [585215, 585220, 580215, biel, aarberg, grenchen]
Name: trifolium-incarnatum, dtype: object

In [11]:
# export, per species slug, the maps it was recorded on to a .json file
nt = obs_map['maps'].to_dict()

# turn the value stored for each slug into a plain list before dumping it
ntx = {k: list(v) for k, v in nt.items()}

# make sure the output directory exists: open() does not create it
os.makedirs(F"{here}/output", exist_ok=True)
with open(F"{here}/output/ws_list.json", "w") as afile:
    json.dump(ntx, afile)

print(F"\nWhich maps 'trifolium-incarnatum' were found in? indifferent of subspecies?:\n\n{ntx['trifolium-incarnatum']}\n")


Which maps 'trifolium-incarnatum' were found in? indifferent of subspecies?:

['585215', '585220', '580215', 'biel', 'aarberg', 'grenchen']



### Key the watch lists to genus-species

Identify which species are on which watch list. Produce a boolean matrix with species_slug as index and watch list name for the columns.

In [12]:
the_lists = list(watch_lists.keys())

# one (species_slug, watch_list) frame per watch list, stacked into one frame
keep = [watch_lists[list_name][['species_slug', 'watch_list']] for list_name in the_lists]
wl_species = pd.concat(keep)

# the distinct watch list labels; a missing label shows up here as NaN and
# gets an all-False column below, because NaN never compares equal
m_ap_columns = wl_species['watch_list'].unique()

for col in m_ap_columns:
    wl_species[col] = wl_species['watch_list'] == col

# per slug, count the records on each watch list. numeric_only keeps the
# boolean indicator columns and leaves out the text column 'watch_list',
# which newer pandas would otherwise try to add up as well
wl_sp_map = wl_species.groupby(['species_slug']).sum(numeric_only=True)

# per slug, the watch list labels it appears under, in order of first
# appearance; one grouped pass instead of filtering the frame once per slug
wl_sp_map['lists'] = wl_species.groupby('species_slug')['watch_list'].unique()

In [13]:
# first rows of the per-slug watch list counts
wl_sp_map.head()

Unnamed: 0_level_0,BL,WL,NaN,under_sampled,red_list,gbif,lists
species_slug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abies-alba,0,0,0,0,1,1,"[red_list, gbif]"
abies-nordmanniana,0,0,0,0,0,1,[gbif]
abutilon-theophrasti,1,0,0,0,0,1,"[BL, gbif]"
acacia-dealbata,0,1,0,0,0,1,"[WL, gbif]"
acalypha-australis,0,0,0,0,0,1,[gbif]


In [14]:
print("\nThe list of genus-species that appear in the black list and gbif:\n\n")

# rows counted at least once under "BL" and at least once under "gbif"
wl_sp_map.loc[(wl_sp_map['BL'] > 0) & (wl_sp_map['gbif'] > 0)]


The list of genus-species that appear in the black list and gbif:




Unnamed: 0_level_0,BL,WL,NaN,under_sampled,red_list,gbif,lists
species_slug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abutilon-theophrasti,1,0,0,0,0,1,"[BL, gbif]"
ailanthus-altissima,1,0,0,0,0,1,"[BL, gbif]"
ambrosia-artemisiifolia,1,0,0,0,0,1,"[BL, gbif]"
amorpha-fruticosa,1,0,0,0,0,1,"[BL, gbif]"
artemisia-verlotiorum,1,0,0,0,0,1,"[BL, gbif]"
asclepias-syriaca,1,0,0,0,0,1,"[BL, gbif]"
buddleja-davidii,1,0,0,0,0,1,"[BL, gbif]"
bunias-orientalis,1,0,0,0,0,1,"[BL, gbif]"
cabomba-caroliniana,1,0,0,0,0,1,"[BL, gbif]"
crassula-helmsii,1,0,0,0,0,1,"[BL, gbif]"


In [15]:
# export, per species slug, the watch lists it appears on to a .json file
nt = wl_sp_map['lists'].to_dict()

# turn the value stored for each slug into a plain list before dumping it
# NOTE(review): the displayed wl_sp_map has a column labelled NaN, so some
# lists may hold a NaN label; json.dump writes that as the token NaN, which
# strict JSON parsers reject - confirm whether such rows should be relabelled
ntx = {k: list(v) for k, v in nt.items()}

# make sure the output directory exists: open() does not create it
os.makedirs(F"{here}/output", exist_ok=True)
with open(F"{here}/output/flora_list.json", "w") as afile:
    json.dump(ntx, afile)

### Taxonomy of 2020 hd samples

In [16]:
# correct wrong species name inputs in the samples data
# work on a copy of the loaded survey records
samples = data_and_keys['surveys'].copy()
# recorded name -> name to use instead
# NOTE(review): a lookup has to match the key exactly, so the trailing spaces
# and the hyphens in some keys are significant - confirm that the hyphenated
# keys really occur in this form in the survey data
replacedict = {
    "verbanum bonariensis ":  "verbena bonariensis",
    "medicago varia":         "medicago sativa",
    "oenothera":              "oenothera biennis",
    "geranium pratens":       "geranium pratense",
    "senecio jacobaea":       "jacobaea vulgaris",
    "oenothera biennis ":     "oenothera biennis",
    "oenothera biennis agg.": "oenothera biennis",
    "solidalgo canadensis":   "solidago canadensis",
    "verbascum lynchitis":    "verbascum lychnitis",
    "verbascum negris":       "verbascum nigrum",
    "securigea varia":        "securigera varia",
    "melilotus officianalis": "melilotus officinalis",
    "knautia maxima":         "knautia dipsacifolia",
    "hieracium aurantiacum":  "pilosella aurantiaca",
    "sysimbrium officinale":  "sisymbrium officinale",
    "geranium robertanium":   "geranium robertianum",
    "mycelis muralis":        "lactuca muralis",
    "calamintha-nepeta":      "clinopodium nepeta",
    "polygonum-persicaria":   "persicaria maculosa",
    "sorbus-aria":            "aria edulis",
    "taraxacum":              "taraxacum officinale",
}
def new_func(x, keys):
    """Return ``keys[x]`` when ``x`` is a key of ``keys``, otherwise ``x`` itself.

    Parameters
    ----------
    x : object
        The value to look up.
    keys : dict
        Maps a value to its replacement.
    """
    try:
        return keys.get(x, x)
    except TypeError:
        # x cannot be hashed, so it cannot be a key: leave it unchanged
        return x
# apply the corrections, then build the slug from the corrected name
samples["species2"] = samples["sci"].map(lambda raw_name: new_func(raw_name, replacedict))
samples["species_slug"] = samples["species2"].map(change_species)

# store the corrected records under the same key
data_and_keys["surveys"] = samples

In [26]:
#correct incorrectly formatted date inputs in the sample dataset
def change_string(x):
    """Reverse the "."-separated parts of ``x`` and join them with "-".

    "16.06.2020" becomes "2020-06-16"; a string without "." is returned
    unchanged. A value that is not a string is reported by printing
    "no luck" and returned as it is.
    """
    # explicit type check instead of a bare except, which would also hide
    # unrelated errors and keyboard interrupts
    if not isinstance(x, str):
        print("no luck")
        return x
    return "-".join(x.split('.')[::-1])
# the function is passed as is, no lambda wrapper is needed
samples['new_date'] = samples['date'].map(change_string)


def make_timestamp(x):
    """Parse a "%Y-%m-%d" date string.

    Returns the parsed ``datetime``, or the string 'no luck' when ``x`` is
    not a string or does not match the format.
    """
    try:
        return dt.datetime.strptime(x, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: x is not a string; ValueError: x does not match the format
        return 'no luck'
# parse every converted date; a value that cannot be parsed becomes 'no luck'.
# Without this column the check below has nothing to find, because
# change_string never produces 'no luck'.
samples['check_date'] = samples.new_date.map(make_timestamp)
samples['date'] = samples.new_date

# compare as text so the test also works when every date parsed and the
# column holds datetimes only
print(F"\nThese are the records in a with no valid date:\n\n{samples[samples.check_date.astype(str) == 'no luck']}\n")


These are the records in a with no valid date:

Empty DataFrame
Columns: [place, date, time, sci, name, species2, species_slug, new_date, check_date, place1, waterbody, wsnum]
Index: []



In [27]:
d = data_and_keys['surveys'].copy()
# one (place, date) pair per record
d['loc_date'] = list(zip(d['place'], d['date']))
slugs_2020 = d['species_slug'].unique()

# use the slug at position 12 as a worked example
a_name = slugs_2020[12]
one_slug = d.loc[d['species_slug'] == a_name].copy()

# its row of map counts followed by its row of watch list counts
maps_and_lists = pd.concat([obs_map.loc[a_name], wl_sp_map.loc[a_name]])
maps_and_lists

585215                                                           2
585220                                                           2
580220                                                           3
580215                                                           2
biel                                                             3
bielersee                                                        1
aarberg                                                          2
buren                                                            3
grenchen                                                         3
beatenberg                                                       2
interlaken                                                       3
maps             [585215, 585220, 580220, 580215, biel, bielers...
BL                                                               0
WL                                                               0
NaN                                                           

In [28]:
# add up every entry of the row except the last one
obs_map.loc[a_name].iloc[:-1].sum()

26

In [29]:
# the survey records of the example slug, shown as (place, date) pairs
dname = d.loc[d['species_slug'] == a_name]
dname['loc_date']

15              (alleestrasse 3, 2020-06-16)
25              (alleestrasse 4, 2020-06-16)
208        (chemin de la course, 2020-08-30)
329                  (energie 1, 2020-06-16)
411                (gottstatt 3, 2020-08-24)
487                 (rondchatel, 2020-09-22)
507              (rue de leau 1, 2020-08-23)
539              (rue de leau 4, 2020-08-17)
551              (rue de leau 5, 2020-08-18)
605           (schlosslistrasse, 2020-06-18)
708     (sundgraben bridge left, 2020-09-05)
729    (sundgraben forest right, 2020-09-13)
762                    (sureaux, 2020-08-09)
Name: loc_date, dtype: object

In [30]:
# share of all survey records that belong to the example slug
dname.shape[0] / d.shape[0]

0.013903743315508022

In [31]:
# share of all map records that belong to the example slug
f_w_obs.loc[f_w_obs['species_slug'] == a_name].shape[0] / f_w_obs.shape[0]

0.002009894867037724

In [32]:
#sample place columns

#makes place slug in new column "place1" for the locations with more than one word.
def change_place(x):
    """Return ``x`` with every single space replaced by "-"."""
    return x.replace(" ", "-")
# the function is passed as is, no lambda wrapper is needed
samples["place1"] = samples["place"].map(change_place)

#adds identifying closest waterbody to the list of place names
#note that suze_b = suze in biel, suze_u = suze upstream from biel.
def add_waterbody(x):
    """Return the waterbody label for the place name ``x``.

    The groups below are tried in order and the first one with a fragment
    contained in ``x`` decides the label. A place that matches no fragment
    gets "suze_b".
    """
    fragments_by_waterbody = (
        ("thunersee", ("sund", "weisse")),
        ("suze_u", ("frinv", "tauben", "rondc")),
        ("orvine", ("orvine",)),
        ("neuenburgersee", ("cheyre",)),
        ("bielersee", ("ligerz", "twann", "lucherz")),
    )
    for waterbody, fragments in fragments_by_waterbody:
        if any(fragment in x for fragment in fragments):
            return waterbody
    return "suze_b"
# the function is passed as is, no lambda wrapper is needed
samples["waterbody"] = samples["place"].map(add_waterbody)

#add welten-sutter number to sample location
def addwsnum(x):
    """Return the "ws..." label for the place name ``x``.

    The fragments below are tried in order and the first one contained in
    ``x`` decides the label. A place that matches no fragment gets "ws151".
    """
    label_by_fragment = (
        ("sund", "ws572"),
        ("cheyre", "ws226"),
        ("lucherz", "ws252"),
        ("weisse", "ws573"),
        ("falls", "not-relevant"),
    )
    for fragment, label in label_by_fragment:
        if fragment in x:
            return label
    return "ws151"

# the function is passed as is, no lambda wrapper is needed
samples["wsnum"] = samples["place"].map(addwsnum)
# last rows, to check the new columns
samples.tail()

Unnamed: 0,place,date,time,sci,name,species2,species_slug,new_date,check_date,place1,waterbody,wsnum
930,weissenau,2020-08-10,09:14:53,allium carinatum,ail caréné,allium carinatum,allium-carinatum,2020-08-10,2020-08-10,weissenau,thunersee,ws573
931,weissenau,2020-08-10,09:09:48,viburnum lantana,"mancienne, viorne lantane",viburnum lantana,viburnum-lantana,2020-08-10,2020-08-10,weissenau,thunersee,ws573
932,weissenau,2020-08-10,09:07:35,filipendula ulmaria,"reine des prés, spirée",filipendula ulmaria,filipendula-ulmaria,2020-08-10,2020-08-10,weissenau,thunersee,ws573
933,weissenau,2020-08-10,09:05:26,lythrum salicaria,salicaire commune,lythrum salicaria,lythrum-salicaria,2020-08-10,2020-08-10,weissenau,thunersee,ws573
934,weissenau,2020-08-10,09:04:12,lysimachia vulgaris,lysimaque commune,lysimachia vulgaris,lysimachia-vulgaris,2020-08-10,2020-08-10,weissenau,thunersee,ws573


In [None]:
# first rows of the survey records
samples.head()