# Preprocessing

In [5]:
# math and data packages
import pandas as pd
import numpy as np
import math

# charting and graphics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# os and file types
import os
import sys
import datetime as dt
import json
import csv

# images and display
import base64, io, IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display, Math, Latex


# base directories used to locate the input files (relative to the working directory)
here = os.getcwd()
resources = "{}/resources/".format(here)
flora_h_ws = resources + "atlasws/"
flora_h_55 = resources + "atlas5x5/"

## Import data

Fix any known formatting problems here

In [7]:
# load the raw survey observations from the resources folder
survey_data = pd.read_csv("resources/surveys.csv")
# summarise column names, non-null counts and dtypes
survey_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   place   935 non-null    object
 1   date    935 non-null    object
 2   time    756 non-null    object
 3   sci     935 non-null    object
 4   name    671 non-null    object
dtypes: object(5)
memory usage: 36.6+ KB


In [9]:
survey_data.head()

Unnamed: 0,place,date,time,sci,name
0,alleestrasse 1,2020-09-02,11:09:04 AM,plantago lanceolata,plantain lancéolé
1,alleestrasse 1,2020-09-02,11:08:19 AM,centaurea nigra,centaurée noire
2,alleestrasse 1,2020-09-02,11:06:59 AM,plantago media,plantain moyen
3,alleestrasse 1,2020-09-02,11:06:05 AM,chenopodium album agg.,chénopode blanc
4,alleestrasse 1,2020-09-02,11:05:18 AM,centaurea jacea agg.,centaurée jacée


In [15]:
# identify each survey by its (place, date) pair
survey_data['loc_date'] = list(zip(survey_data['place'], survey_data['date']))
# distinct locations, and the number of distinct (place, date) surveys
places = survey_data['place'].unique()
nsurveys = survey_data['loc_date'].nunique()

In [16]:
# distinct scientific names and common names recorded in the surveys
scis = survey_data['sci'].unique()
names = survey_data['name'].unique()

In [17]:
# match place names to map coordinates
map_keys = pd.read_csv("resources/map-keys.csv")
mplaces = map_keys['place'].unique()
# compare the number of distinct places in the surveys and in the map keys
print(len(places), len(mplaces))

111 110


In [18]:
# find the difference

# places named in the surveys that have no entry in the map keys
in_survey_data = list(filter(lambda place: place not in mplaces, places))
# places named in the map keys that never occur in the surveys
in_map_data = list(filter(lambda place: place not in places, mplaces))

# these need to be slugged
print(in_survey_data)
print(in_map_data)

['alleestrasse 1', 'alleestrasse 2', 'alleestrasse 3', 'alleestrasse 4', 'bluets 1', 'bluets 2', 'bluets 3', 'boujean 1', 'boujean 2', 'boujean 3', 'buren 1', 'buren 2', 'buren 3', 'cff lot 1', 'cff lot 2', 'cff path 1', 'cff path 2', 'cff path 3', 'chemin de la course', 'chemin des voies', 'cheyres path', 'cheyres reserve', 'cygnes lot 1', 'cygnes lot 2', 'cygnes lot 3', 'cygnes lot 4', 'cygnes lot 5', 'cygnes lot 6', 'cygnes lot 7', 'cygnes lot 8', 'energie 1', 'energie 2', 'energie 3', 'football 1', 'football 2', 'football 3', 'football 4', 'football 5', 'football 6', 'frinvillier fabrique 1', 'frinvillier fabrique 2', 'frinvillier fabrique 3', 'frinvillier fabrique 4', 'frinvillier ramp', 'gottstatt 1', 'gottstatt 2', 'gottstatt 3', 'hayek 1', 'jura 1', 'jura 2', 'lezard 1', 'ligerz favorite', 'lucherz bureli', 'lucherz seestrasse', 'orvin petit moulin', 'pery taubenlochweg', 'pieterlen stockweg', 'rue alfred-aebi', 'rue centrale', 'rue de leau 1', 'rue de leau 2', 'rue de leau 3',

In [19]:
# plant names
# read invasives.csv from the in-process folder and list its column names
invsvs = pd.read_csv("resources/inprocess/invasives.csv")
invsvs.columns

Index(['Latin', 'French', 'Jura', 'Plateau', 'Versant Nord des Alpes',
       'Alpes centrales Ouest', 'Alpes centrales Est', 'Versant Sud des Alpes',
       'non établi en Suisse', '1 Eau1 libres', '2 Rivages et lieu humides',
       '3 Glaciers, rochers, éboulis et moraines', '4 Pelouses et prairies',
       '5 Landes, lisières et mégaphorbiaies', '6 Forêts',
       '7 Végétations pionnières des endroits perturbés',
       '8 Plantations, champs, cultures', '9 Milieu1 construits',
       'Potentiel d'e1pansion', 'santé', 'écologie, biodiversité', 'économie',
       'Präventionscharakter (hoch=1, mittel=2, gering=3)',
       'Regionale Wichtigkeit (hoch=1, mittel=2, gering=3)',
       'Lack List / Watch List "old"', 'list_2014',
       'Ordonnonce sur la dissémination des organismes (ODE)'],
      dtype='object')

In [21]:
invsvs.Latin.head()

0            Abutilon theophrasti Medik.
1    Ailanthus altissima (Mill.) Swingle
2             Ambrosia artemisiifolia L.
3                   Amorpha fruticosa L.
4          Artemisia verlotiorum Lamotte
Name: Latin, dtype: object

In [22]:
# read priority.csv from the in-process folder and list its column names
priority = pd.read_csv("resources/inprocess/priority.csv")
priority.columns

Index([' ID taxon', 'species', 'nom allemand', 'nom francais', 'nom italien',
       'priorite', 'menace', 'responsabilite',
       'necessite de prendre des mesures',
       'Necessite de surveiller les populations', 'connaissances suffisantes?',
       'techniques connues?', 'Jura', 'Plateau'],
      dtype='object')

In [23]:
priority.species.head()

0                                   Achillea atrata L.
1                                 Achillea clavenae L.
2                               Achillea collina Rchb.
3    Achillea erba-rotta subsp. moschata (Wulfen) V...
4                              Achillea macrophylla L.
Name: species, dtype: object

In [25]:
# read redlist.csv from the in-process folder and list its column names
redlist = pd.read_csv("resources/inprocess/redlist.csv")
redlist.columns

Index(['FAMILY', 'GENUS', 'species', 'Deutscher Name', 'Nom en francais', 'CH',
       'crit_CH', 'JU', 'crit_JU', 'MP', 'crit_MP'],
      dtype='object')

In [26]:
redlist.species.head()

0             Abies alba
1         Acer campestre
2            Acer opalus
3       Acer platanoides
4    Acer pseudoplatanus
Name: species, dtype: object

In [2]:
# start by organizing them into dictionaries.
# reader callable for each supported file type
my_data_methods = dict(csv=pd.read_csv)

# import the survey data files
# label -> file name of each survey data / location key file
survey_files = dict(
    surveys_20="surveys.csv",
    surveys_21a="data-2021.csv",
    surveys_21b="surveys-21-vf.csv",
    map_keys_20="map-keys.csv",
    map_keys_21="2021-survey-key.csv",
)


In [3]:
# Import the survey data files into panda data frames.

# surveys_20 = 2020 survey data
# surveys_21a = Feb - May 2021 survey data
# surveys_21b = Jun - Oct 2021 survey data
# keys_20 = locations keys for 2020 surveys
# keys_21a = location keys for Feb-May 2021 surveys

# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# 'resources/data-2021.csv' - confirm the file name and location before re-running
surveys_20 = pd.read_csv("resources/surveys.csv")
surveys_21a = pd.read_csv("resources/data-2021.csv")
# the only survey file in this cell that is read with an explicit utf-16 encoding
surveys_21b= pd.read_csv("resources/surveys-21-vf.csv", encoding = "utf-16")
keys_20 = pd.read_csv("resources/map-keys.csv")
keys_21a = pd.read_csv("resources/2021-survey-key.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'resources/data-2021.csv'

In [None]:
# dict for the reference files that are not Welten Sutter or Flora Helvetica 5X5 lists
# (label -> file name inside the resources folder)

# watch_list = list of invasive and potentially invasive species
# red_list = conservation status of indigenous species
# priority_list = priority species for conservation in Switzerland
# under_sampled = taxa_sous_echantillonnes.csv (presumably under-sampled taxa, going by the file name)

off_lists = {
    "list_2014":"BL_WL_2014_modified.csv",
    "under_sampled":"taxa_sous_echantillonnes.csv",
    "red_list":"redlist2019.csv",
    "cert_list": "Certification_specieslist_2021.csv",
    "ch_priority":"ch_priority_species.csv"
}

# read the reference lists one by one; the cert_list file is named in off_lists but is not read here
watch_list = pd.read_csv("resources/BL_WL_2014_modified.csv")
red_list = pd.read_csv("resources/redlist2019.csv")
under_sampled = pd.read_csv("resources/taxa_sous_echantillonnes.csv")
priority_list = pd.read_csv("resources/ch_priority_species.csv")



In [None]:
# import lists from Equipe Volo
# (label -> file name inside the resources folder)

volo_lists = {
    "imp_seeds":"important-seeds-2021.csv",
    "off_list":"all-seeds-2021.csv",
    "inventory":"volo-inventory-de.csv",
    "germ_exp":"germination-experiment.csv"
}

# germ_exp = data from 2021 seed germination experiment
# imp_seeds = seeds that should have been collected in 2021
# off_list = all plant species in the volo catalogues for 2021 (read from all-seeds-2021.csv)
# inventory = equipe volo seed inventory

imp_seeds = pd.read_csv("resources/important-seeds-2021.csv")
off_list = pd.read_csv("resources/all-seeds-2021.csv")
inventory = pd.read_csv("resources/volo-inventory-de.csv")
germ_exp = pd.read_csv("resources/germination-experiment.csv")


In [None]:
# dict for the Welten-Sutter map reference files, downloaded from here: www.infoflora.ch
# all observations included in this report were conducted within one of these geographic boundaries (true for 2020 surveys)
# need to include some 2021 surveys.
# each key is the number that also appears in the matching AtlasWS file name
ws_lists ={
    "151":"AtlasWS_151_Biel.csv",
    "252":"AtlasWS_252_Erlach.csv",
    "300":"AtlasWS_300_Aarberg.csv",
    "301":"AtlasWS_301_Bueren.csv",
    "154":"AtlasWS_154_Grenchen.csv",
    "572":"AtlasWS_572_Beatenberg.csv",
    "573":"AtlasWS_573_Interlaken.csv",
    "226":"AtlasWS_226_Estavayer.csv",
    "251":"AtlasWS_251_BernWest.csv",
    "145":"AtlasWS_145_LesRangiers.csv"
}


In [None]:
# dict for the 5X5 Flora Helvetica lists downloaded from www.infoflora.ch

# housekeeping: 585220 is separated by "," not ";" like the rest of the data sources
# re-save it with ";" under a new file name (suffix _1) so every 5x5 file can be read the same way
df = pd.read_csv("resources/atlas5x5/Atlas5x5_585_220.csv", sep = ",", encoding="utf-16")
df.to_csv('resources/atlas5x5/Atlas5x5_585_220_1.csv', sep=';', encoding = "utf-16", index = False)

# each key is the pair of numbers from the file name joined together;
# "585220" points at the re-saved ";" separated copy written above
fx_lists = {
    "585215":"Atlas5x5_585_215.csv", # Ipsach, Bielersee
    "585220":"Atlas5x5_585_220_1.csv", # Biel Stadt, Suze / Bielersee
    "580220":"Atlas5x5_580_220.csv", # Biel Mett, Suze
    "580215":"Atlas5x5_580_215.csv", # Port, Nidau-Bueren Kanal
    "625165":"Atlas5x5_625_165.csv", # Untersee, Thunersee
    "625170":"Atlas5x5_625_170.csv", # Sundlauenen, Thunersee
    "550185":"Atlas5x5_550_185.csv", # Estavayer, Lac de Neuchatel
    "575210":"Atlas5x5_575_210.csv", # Leuecherz, Bielersee
    "600200":"Atlas5x5_600_200.csv", # Bern west, Aare
    "575245":"Atlas5x5_575_245.csv", # Saint-Ursanne, Aare
    "545180":"Atlas5x5_545_180.csv",
    "575215":"Atlas5x5_575_215.csv"
}

In [None]:
# convenience method to gather up all the files:
def get_the_data(file_exts, methods, this_method="csv", myencoding=None):
    """Read every file listed in ``file_exts`` from the ``resources/`` folder.

    Parameters
    ----------
    file_exts : dict
        Maps a label to a file name located under ``resources/``.
    methods : dict
        Maps a method name (for example ``"csv"``) to a reader callable.
    this_method : str, default "csv"
        Key of the reader in ``methods`` that is used for every file.
    myencoding : str or None, default None
        When given, each file is read with ``sep=";"`` and this encoding;
        otherwise the reader is called with the path only.

    Returns
    -------
    dict
        Maps each label of ``file_exts`` to the value the reader returned.

    Raises
    ------
    KeyError
        If a file has to be read and ``this_method`` is not a key of ``methods``.
    """
    # "is None" is an identity test; "== None" can be redefined by the operand
    if myencoding is None:
        reader_kwargs = {}
    else:
        reader_kwargs = {"sep": ";", "encoding": myencoding}

    return {
        label: methods[this_method](F"resources/{file_name}", **reader_kwargs)
        for label, file_name in file_exts.items()
    }



In [None]:
# use the get_the_data method to collect these files
data_and_keys = get_the_data(survey_files, my_data_methods, this_method="csv")
# the reference list file names are held in off_lists; w_lists is only created in a later cell
watch_lists = get_the_data(off_lists, my_data_methods, this_method="csv")

In [None]:
# importing the data files
# create variables for convenience method and grouping

# start by organizing them into dictionaries.
# reader callable for each supported file type
my_data_methods = {"csv":pd.read_csv}

# dict for the survey data and location key files (label -> file name);
# it holds the 2020 file as well as the 2021 files
d_files = {
    "surveys_20":"surveys.csv",
    "surveys_21a":"data-2021.csv",
    "surveys_21b":"obs_export_2021-10-26_20h46.csv",
    "map_keys_20":"map-keys.csv",
    "map_keys_21":"2021-survey-key.csv"
}
# match map_keys_21 to surveys_21A
# date and time format correction
# species slug
# format to 2020

# dict for the reference files that are not Welten Sutter or Flora Helvetica 5X5 lists
w_lists = {
    "list_2014":"BL_WL_2014_modified.csv",
    "under_sampled":"taxa_sous_echantillonnes.csv",
    "red_list":"redlist2019.csv",
    "cert_list": "Certification_specieslist_2021.csv",
    "ch_priority":"ch_priority_species.csv"
}

# format and import ch_priority_species.csv to use with 2020 & 2021 survey results
# evaluate the survey results with respect to the columns: priorité, menace, responsabilité,
# nécessité de prendre des mesures, nécessité de surveiller les populations,
# connaissances suffisantes?, techniques connues?
# repeat 2020 data with redlist with 2021A

# dict for the seed list, inventory and germination experiment files (label -> file name)
v_lists = {
    "imp_seeds":"important-seeds-2021.csv",
    "all_seeds":"all-seeds-2021.csv",
    "inv_cave":"volo-inventory-de.csv",
    "germ_exp":"germination-experiment.csv"
    
}
# add dict for the volo files
# add dict for the volo files

# dict for the Welten-Sutter map reference files, downloaded from here: https://www.infoflora.ch/de/daten/artenliste-welten-sutter.html
# all observations included in this report were conducted within one of these geographic boundaries
# each key is the number that also appears in the matching AtlasWS file name
ws_lists ={
    "151":"AtlasWS_151_Biel.csv",
    "252":"AtlasWS_252_Erlach.csv",
    "300":"AtlasWS_300_Aarberg.csv",
    "301":"AtlasWS_301_Bueren.csv",
    "154":"AtlasWS_154_Grenchen.csv",
    "572":"AtlasWS_572_Beatenberg.csv",
    "573":"AtlasWS_573_Interlaken.csv",
    "226":"AtlasWS_226_Estavayer.csv",
    "251":"AtlasWS_251_BernWest.csv",
    "145":"AtlasWS_145_LesRangiers.csv"
}

# housekeeping: 585220 is separated by "," not ";" like the rest of the data sources
# re-save it with ";" under a new file name (suffix _1) so every 5x5 file can be read the same way
df = pd.read_csv("resources/atlas5x5/Atlas5x5_585_220.csv", sep = ",", encoding="utf-16")
df.to_csv('resources/atlas5x5/Atlas5x5_585_220_1.csv', sep=';', encoding = "utf-16", index = False)

# 5X5 Flora Helvetica lists
# "585220" points at the re-saved ";" separated copy written above
fx_lists = {
    "585215":"Atlas5x5_585_215.csv", # Ipsach, Bielersee
    "585220":"Atlas5x5_585_220_1.csv", # Biel Stadt, Suze / Bielersee
    "580220":"Atlas5x5_580_220.csv", # Biel Mett, Suze
    "580215":"Atlas5x5_580_215.csv", # Port, Nidau-Bueren Kanal
    "625165":"Atlas5x5_625_165.csv", # Untersee, Thunersee
    "625170":"Atlas5x5_625_170.csv", # Sundlauenen, Thunersee
    "550185":"Atlas5x5_550_185.csv", # Estavayer, Lac de Neuchatel
    "575210":"Atlas5x5_575_210.csv", # Leuecherz, Bielersee
    "600200":"Atlas5x5_600_200.csv", # Bern west, Aare
    "575245":"Atlas5x5_575_245.csv", # Saint-Ursanne, Aare
    "545180":"Atlas5x5_545_180.csv",
    "575215":"Atlas5x5_575_215.csv"
}

# convenience method to gather up all the files:
def get_the_data(file_exts, a_dir, methods, this_method="csv", myencoding=None):
    """Read every file listed in ``file_exts`` from the directory ``a_dir``.

    Parameters
    ----------
    file_exts : dict
        Maps a label to a file name.
    a_dir : str
        Prefix placed directly in front of each file name, so it must end
        with the path separator.
    methods : dict
        Maps a method name (for example ``"csv"``) to a reader callable.
    this_method : str, default "csv"
        Key of the reader in ``methods`` that is used for every file.
    myencoding : str or None, default None
        When given, each file is read with ``sep=";"`` and this encoding;
        otherwise the reader is called with the path only.

    Returns
    -------
    dict
        Maps each label of ``file_exts`` to the value the reader returned.

    Raises
    ------
    KeyError
        If a file has to be read and ``this_method`` is not a key of ``methods``.
    """
    # "is None" is an identity test; "== None" can be redefined by the operand
    if myencoding is None:
        reader_kwargs = {}
    else:
        reader_kwargs = {"sep": ";", "encoding": myencoding}

    return {
        label: methods[this_method](F"{a_dir}{file_name}", **reader_kwargs)
        for label, file_name in file_exts.items()
    }

# use the get_the_data method to collect these files
# data_2020 and flora_h are not defined in the visible cells; the survey and reference list
# files are read straight from the resources folder elsewhere in this notebook, so the
# `resources` directory from the configuration cell is used for both
data_and_keys = get_the_data(d_files, resources, my_data_methods, this_method="csv")
watch_lists = get_the_data(w_lists, resources, my_data_methods, this_method="csv")

# Why are we using utf-16 here? Don't we lose some options later on?
# passing an encoding also makes get_the_data read these files with sep=";"
welt_sut =  get_the_data(ws_lists, flora_h_ws, my_data_methods, this_method="csv", myencoding = "utf-16" )
fivex =  get_the_data(fx_lists, flora_h_55, my_data_methods, this_method="csv", myencoding = "utf-16")

## The Species columns

In [None]:
# make sure that each data set has the column "species", with the value species:
# reference list label -> column that holds the scientific name in that list
species_source = {
    "list_2014": "Latin",
    "under_sampled": "taxon",
    "red_list": "scientific_name",
    "cert_list": "Short Name",
}
for list_name, source_column in species_source.items():
    watch_lists[list_name]["species"] = watch_lists[list_name][source_column]

In [None]:
# make a species slug (genus-species) to link data from across the survey and reference files. 
# This is necessary as some species columns have only "Genus species", some include subspecies, and some include the taxonomic reference.

# function to make the species slugs
def to_species_slug(x):
    """Return a lower-case ``genus-species`` slug for a scientific name.

    Only the first two whitespace-separated words are kept, so anything after
    them (subspecies, taxonomic reference) is dropped.  A value that cannot be
    split and joined as text (for example NaN from an empty cell) gives the
    placeholder ``"none"``.
    """
    try:
        return "-".join(x.split()[:2]).lower()
    except (AttributeError, TypeError):
        # not text: keep the placeholder instead of hiding every possible error
        return "none"

# make the species slug for all reference files
# assigning the mapped values creates the column, so no separate placeholder pass is needed
for element in [fivex, welt_sut, watch_lists]:
    for the_data in element:
        element[the_data]['species_slug'] = element[the_data].species.map(to_species_slug)

## The _map_ column and the _spec\_map_ columns

In [None]:
# Add identifying columns to the reference datasets

# add a column to identify the map source for the geographic data:
for element in [fivex, welt_sut]:
    for the_data in element:
        # every row of a frame gets the key of that frame as its map identifier
        element[the_data]['map'] = the_data
        # index with ['map']: on pandas >= 2.1 the attribute .map is the DataFrame.map
        # method, not the column, and zip() cannot iterate over a method
        element[the_data]['spec_map'] = list(zip(element[the_data]['species_slug'], element[the_data]['map']))
        


In [None]:
fivex.keys()

In [None]:
fivex['585215'].head()

## The _watch\_list_ column

In [None]:
# label every row with the watch list it belongs to; list_2014 carries its own
# per-row label in the column of the same name, the other lists use their dict key
for list_name, frame in watch_lists.items():
    if list_name == "list_2014":
        frame['watch_list'] = frame[list_name]
    else:
        frame['watch_list'] = list_name

# housekeeping: fill in nan values in the watchlist and certification list reference files.
# rows whose watch_list value became 0 (it was missing) are dropped
for list_name in ("list_2014", "cert_list"):
    fill_nans = watch_lists[list_name].copy().fillna(0)
    watch_lists[list_name] = fill_nans[fill_nans.watch_list != 0]

In [None]:
watch_lists.keys()

In [None]:
watch_lists['red_list'].head()

### Species name and observations: harmonizing taxonomy

The genus-species nomenclature will be used to group observations.

All observations will be classified according to that standard. As a result subspecies will be folded in with the parent species. This is a reflection of the survey method and the expectation of reasonable results, not a prioritization of importance.

In [None]:
def account_for_subspecies(an_array, a_dict):
    """Group the second item of each pair under the first item.

    Parameters
    ----------
    an_array : iterable
        Each element is read as ``element[0]`` (the key) and ``element[1]``
        (the value filed under that key).
    a_dict : dict
        Updated in place; each key maps to the list of values seen for it.

    Returns
    -------
    dict
        The same ``a_dict`` object that was passed in.
    """
    for element in an_array:
        # setdefault starts the list the first time a key is seen, so no
        # catch-all except is needed to detect a missing key
        a_dict.setdefault(element[0], []).append(element[1])
    return a_dict
# accumulator: species_slug -> every species name filed under that slug
a_dict = {}

for element in [fivex, welt_sut, watch_lists]:
    for label, som_data in element.items():
        # number of distinct species names recorded for each slug
        names_per_slug = som_data.groupby('species_slug', as_index=False)['species'].nunique()

        # slugs that stand for more than one species name
        shared_slugs = names_per_slug.loc[names_per_slug['species'] > 1, 'species_slug']

        # (species_slug, species) pairs of the rows that use one of those slugs
        uses_shared_slug = som_data['species_slug'].isin(shared_slugs)
        slug_name_pairs = som_data.loc[uses_shared_slug, ['species_slug', 'species']].to_numpy()

        account_for_subspecies(slug_name_pairs, a_dict)

# the species_slugs that account for more than one sub species (names de-duplicated)
sub_species_accounted = {slug: list(set(found)) for slug, found in a_dict.items()}

# the species_slugs
gs_parent = list(sub_species_accounted)

# the number of species_slugs
number_of_gs = len(gs_parent)

# the number of sub species accounted for
number_of_ss = sum(len(found) for found in sub_species_accounted.values())

## Determine whether or not a species was detected within a geographic limit

The territory is divided into different segments. The Flora Helvetica 5x5 maps and the Welten-Sutter (WS) maps have different geographic bounds. Here the presence or absence of a species within each of those boundaries is determined.

In [None]:
# collect all the observations from flora helvitaca and the WS into one df
fx = pd.concat([v[['species_slug', 'map', 'spec_map']] for k,v in fivex.items()])
wsx = pd.concat([v[['species_slug', 'map', 'spec_map']] for k,v in welt_sut.items()])
f_w_obs = pd.concat([fx, wsx])

In [None]:
f_w_obs.head()

In [None]:
# replacedict = {
#     'verbanum bonariensis ':'verbena bonariensis',
#     'medicago varia':'medicago sativa',
#     "oenothera":"oenothera biennis",
#     "geranium pratens":"geranium pratense",
#     "oenothera biennis ": "oenothera biennis",
#     "oenothera biennis agg.": "oenothera biennis",
#     "solidalgo canadensis": "solidago canadensis",
#     "verbascum lynchitis":"verbascum lychnitis",
#     "verbascum negris":"verbascum nigrum",
#     "securigea varia": "securigera varia",
#     "melilotus officianalis": "melilotus officinalis",
#     "knautia maxima": "knautia dipsacifolia",
#     "hieracium aurantiacum":"pilosella aurantiaca",
#     "sysimbrium officinale":"sisymbrium officinale",
#     "geranium robertanium":"geranium robertianum",
#     "mycelis muralis": "lactuca muralis",
#     "calamintha-nepeta":"clinopodium nepeta",
#     "polygonum-persicaria":"persicaria maculosa",
#     "sorbus-aria":"aria edulis",
#     "taraxacum": "taraxacum officinale",
#     "jacobaea vulgaris" : "senecio jacobaea",
#     "erigeron canadensis" : "conyza canadensis",
#     "rorippa islandica" : "rorippa palustris",
#     "malus sylvestris" : "malus domestica",
#     "hylotelephium telephium" : "sedum telephium",
#     "lactuca muralis": "mycelis muralis",
#     "chaenorhinum minus": "chaenorrhinum minus",
#     "erigeron canadensis": "conzya canadensis",
#     "erigeron canadensis": "conzya canadensis",
#     "borkhausenia intermedia": "scandosorbus intermedia",
#     "centaurea nigra" : "centaurea jacea"
# }

In [None]:
# f_w_obs['ns'] = f_w_obs.species_slug
# def replace_this(x,a_dict):
#     if x in a_dict.keys():
#         data=a_dict[x]
#     else:
#         data = x
#     return data
# f_w_obs['ns'] = f_w_obs.ns.map(lambda x: replace_this(x, replacedict))
# f_w_obs['species_slug'] = f_w_obs.ns

In [None]:
# report how many distinct map identifiers occur in the combined observations
# (index with ['map']: on pandas >= 2.1 f_w_obs.map is the DataFrame.map method, not the column)
astring = F"""
There are {len(f_w_obs['map'].unique())} different map boundaries in this study
"""
md(astring)

In [None]:
# a = f_w_obs.set_index('species_slug')
# a.loc['bryonia-dioica']

In [None]:
# gather up the map names
# (index with ['map']: on pandas >= 2.1 f_w_obs.map is the DataFrame.map method, not the column)
m_ap_columns = f_w_obs['map'].unique()

# create one boolean column per map: True where the row comes from that map
for col in m_ap_columns:
    f_w_obs[col] = f_w_obs['map'] == col

# one row per species_slug; summing the boolean columns counts the rows per map
obs_map = f_w_obs.groupby(['species_slug']).sum()

# human readable column names need to be introduced here or a dict to rename
obs_map.head()

In [None]:
obs_map.loc['bryonia-dioica']

In [None]:
# write the species-by-map table to the survey-data folder; index=True keeps the
# species_slug index as the first column of the file
a_file_name = "species_map_located.csv"
obs_map.to_csv(F"resources/survey-data/{a_file_name}", index=True)

### Key the species to the different maps it was identified in


In [None]:
# #exports the dictionary to a .json file
# nt = obs_map['maps'].to_dict()

# ntx = {k:list(v) for k,v in nt.items()}

# with open(F"{here}/output/ws_list.json","w") as afile:
#     json.dump(ntx,afile)
    
# print(F"\nWhich maps 'trifolium-incarnatum' were found in? indifferent of subspecies?:\n\n{ntx['trifolium-incarnatum']}\n")

## The _species\_slug_ column

A coder friendly way to find the species but still maintain the proper nomenclature

In [None]:
# format scientific name in the sample dataset
# surveys.csv is loaded under the key "surveys_20"; no file is loaded under "surveys"
samples = data_and_keys['surveys_20'].copy()

samples.head()

In [None]:
# dictionary of replacement values: recorded name -> name to use instead
# "erigeron canadensis" used to be listed three times; the last entry won and carried
# the misspelling "conzya", so only the correctly spelled replacement is kept
# NOTE(review): "mycelis muralis" and "lactuca muralis" map to each other - confirm
# which of the two names the reference lists use
# NOTE(review): the keys written with a hyphen look like slugs rather than recorded
# names - confirm that they can match values of the sci column
replacedict = {
    'verbanum bonariensis ':'verbena bonariensis',
    'medicago varia':'medicago sativa',
    "oenothera":"oenothera biennis",
    "geranium pratens":"geranium pratense",
    "oenothera biennis ": "oenothera biennis",
    "oenothera biennis agg.": "oenothera biennis",
    "solidalgo canadensis": "solidago canadensis",
    "verbascum lynchitis":"verbascum lychnitis",
    "verbascum negris":"verbascum nigrum",
    "securigea varia": "securigera varia",
    "melilotus officianalis": "melilotus officinalis",
    "knautia maxima": "knautia dipsacifolia",
    "hieracium aurantiacum":"pilosella aurantiaca",
    "sysimbrium officinale":"sisymbrium officinale",
    "geranium robertanium":"geranium robertianum",
    "mycelis muralis": "lactuca muralis",
    "calamintha-nepeta":"clinopodium nepeta",
    "polygonum-persicaria":"persicaria maculosa",
    "sorbus-aria":"aria edulis",
    "taraxacum": "taraxacum officinale",
    "jacobaea vulgaris" : "senecio jacobaea",
    "erigeron canadensis" : "conyza canadensis",
    "rorippa islandica" : "rorippa palustris",
    "malus sylvestris" : "malus domestica",
    "hylotelephium telephium" : "sedum telephium",
    "lactuca muralis": "mycelis muralis",
    "chaenorhinum minus": "chaenorrhinum minus",
    "borkhausenia intermedia": "scandosorbus intermedia",
    "centaurea nigra" : "centaurea jacea"
}

# function to look up the corrected value when the recorded value is a key of the dictionary.
def new_func(x,keys):
    """Return ``keys[x]`` when ``x`` has a replacement, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : object
        The value to look up.
    keys : dict
        Maps values that need correcting to their replacement.
    """
    try:
        return keys[x]
    except (KeyError, TypeError):
        # KeyError: no replacement is listed; TypeError: x cannot be used as a dict key
        return x

# apply the function to a copy of the surveys data set:
# first the corrected scientific name, then its genus-species slug
samples["species2"] = samples["sci"].map(lambda recorded: new_func(recorded, replacedict))
samples["species_slug"] = samples["species2"].map(to_species_slug)

# update the surveys dataset.
data_and_keys['surveys'] = samples

In [None]:
data_and_keys["surveys"]

## Format date column to ISO standard

In [None]:
# format dates in the sample dataset

# function converts DD.MM.YYYY format to YYYY-MM-DD format, ignores if already in YYYY-MM-DD format
# Do we even need this any more ? Looking at the cell above it appears the dates have been fixed at the source
def change_string(x):
    """Convert a ``DD.MM.YYYY`` date string to ``YYYY-MM-DD``.

    The string is split on ".", the parts are reversed and joined with "-".
    A string without "." (for example one already written as ``YYYY-MM-DD``)
    comes back unchanged.  A value that cannot be split as text is reported
    with "no luck" and returned as it is.
    """
    try:
        return "-".join(x.split('.')[::-1])
    except (AttributeError, TypeError):
        # not text (e.g. a missing date): report it and leave the value untouched
        print("no luck")
        return x

# run change_string over every value of the date column and keep the result as new_date
samples['new_date'] = samples['date'].map(change_string)

# function makes a timestamp out of the YYYY-MM-DD string.
# def make_timestamp(x):
#     try:        
#         data = dt.datetime.strptime(x, "%Y-%m-%d")        
#     except:        
#         data = 'no luck'
    
#     return data

# run the make_timestamp function and store the results in the samples dataframe.
# samples['stamp_date'] = samples.new_date.map(lambda x: make_timestamp(x))
# samples['date'] = samples.stamp_date

## The _place\_slug_ column

A coder-friendly way to refer to a survey location: the spaces in the place name are replaced with hyphens.

In [None]:
# make place name slugs in the sample dataset
#  change_place function turns "place names 1" into place-names-1
def change_place(x):
    """Return the place name with every space replaced by a hyphen."""
    return x.replace(" ", "-")

# hyphenated version of every place name
samples["place_slug"] = samples["place"].map(change_place)

## Housekeeping and export of processed survey data

In [None]:
# clean up redundant and unused column names
# rename the place_slug column
# NOTE(review): the slug column is already created under the name 'place_slug', so the
# 'place1' entry presumably matches nothing and only 'new_date' -> 'str_date' applies - confirm

samples.rename(columns= {'place1':'place_slug', 'new_date':'str_date'}, inplace=True)

In [None]:
samples.columns

In [None]:
# remove the unnecessary columns for this analysis
unused_columns = ['time',  'sci', 'name', 'place', 'species2']
samples.drop(columns=unused_columns, inplace=True)

# identify each survey by its (place slug, date) pair
samples['loc_date'] = list(zip(samples['place_slug'], samples['date']))

In [None]:
# export samples to a .csv for later use (the row index is not written)
samples.to_csv("resources/preprocessed/hd_samples_2020.csv", index=False)

In [None]:
samples.head()

Subspecies

In [None]:
def account_for_subspecies(an_array, a_dict):
    """Group the second item of each pair under the first item.

    Parameters
    ----------
    an_array : iterable
        Each element is read as ``element[0]`` (the key) and ``element[1]``
        (the value filed under that key).
    a_dict : dict
        Updated in place; each key maps to the list of values seen for it.

    Returns
    -------
    dict
        The same ``a_dict`` object that was passed in.
    """
    for element in an_array:
        # setdefault starts the list the first time a key is seen, so no
        # catch-all except is needed to detect a missing key
        a_dict.setdefault(element[0], []).append(element[1])
    return a_dict
# accumulator: species_slug -> every species name filed under that slug
a_dict = {}

for element in [fivex, welt_sut, watch_lists]:
    for label, som_data in element.items():
        # number of distinct species names recorded for each slug
        names_per_slug = som_data.groupby('species_slug', as_index=False)['species'].nunique()

        # slugs that stand for more than one species name
        shared_slugs = names_per_slug.loc[names_per_slug['species'] > 1, 'species_slug']

        # (species_slug, species) pairs of the rows that use one of those slugs
        uses_shared_slug = som_data['species_slug'].isin(shared_slugs)
        slug_name_pairs = som_data.loc[uses_shared_slug, ['species_slug', 'species']].to_numpy()

        account_for_subspecies(slug_name_pairs, a_dict)

# the species_slugs that account for more than one sub species (names de-duplicated)
sub_species_accounted = {slug: list(set(found)) for slug, found in a_dict.items()}

# the species_slugs
gs_parent = list(sub_species_accounted)

# the number of species_slugs
number_of_gs = len(gs_parent)

# the number of sub species accounted for
number_of_ss = sum(len(found) for found in sub_species_accounted.values())

### Key the watch lists to genus-species

Identify which species are on which watch list. Produce a boolean matrix with species_slug as index and watch list name for the columns.

In [None]:
# labels of the reference lists
the_lists = list(watch_lists.keys())

# stack the (species_slug, watch_list) columns of every reference list into one frame
wl_species = pd.concat([watch_lists[list_name][['species_slug', 'watch_list']] for list_name in the_lists])
m_ap_columns = wl_species['watch_list'].unique()

# one boolean column per watch list label: True where the row carries that label
for col in m_ap_columns:
    wl_species[col] = wl_species['watch_list'] == col

# one row per species_slug; summing the boolean columns counts the rows per label
wl_sp_map = wl_species.groupby(['species_slug']).sum()

# labels of the lists each species appears on, gathered in a single grouped pass
# instead of filtering the whole frame once per species; the result is aligned
# to wl_sp_map on the species_slug index
wl_sp_map['lists'] = wl_species.groupby('species_slug')['watch_list'].unique()

a_filename = "species_keyed_to_watch_list.csv"
wl_sp_map.reset_index().to_csv(F"resources/{a_filename}", index=False)

In [None]:
wl_sp_map

In [None]:
# nt = wl_sp_map['lists'].to_dict()

# ntx = {k:list(v) for k,v in nt.items()}

# with open(F"{here}/output/flora_list.json","w") as afile:
#     json.dump(ntx,afile)
# wl_sp_map.to_csv(F"{here}/resources/preprocessed/aggregated_list.csv",index=True)