This preprocessing file cleans up the survey and reference data files and creates reference files in the resources folder for the analysis carried out in the 2020 descriptive notebook.

In [1]:
# import statements

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import datetime as dt
import math
import json
import csv


# locations of the input files, all relative to the current working directory

here = os.getcwd()

# reference data folder and its two atlas sub-folders
flora_h = "{}/resources/reference-data/".format(here)
flora_h_ws = flora_h + "atlasws/"
flora_h_55 = flora_h + "atlas5x5/"

# 2020 survey data folder
data_2020 = "{}/resources/survey-data/".format(here)

In [2]:
# importing the data files

# The file names of each data source are grouped into dictionaries; the key is
# the name the loaded data is stored under later on.

# reader function for each supported file type
my_data_methods = dict(csv=pd.read_csv)

# file names of the 2020 survey data
d_files = dict(
    surveys="2020datasimp.csv",
    map_keys="map-keys-2020.csv",
)

# file names of the reference lists that are not Welten-Sutter or Flora Helvetica 5x5 lists

w_lists = dict(
    list_2014="BL_WL_2014_modified.csv",
    under_sampled="taxa_sous_echantillonnes.csv",
    red_list="CH-RLreg_Tracheophyta_2019.csv",
    cert_list="Certification_specieslist_2021.csv",
)

# Welten-Sutter map reference files, downloaded from here: https://www.infoflora.ch/de/daten/artenliste-welten-sutter.html
# all observations included in this report were conducted within one of these geographic boundaries

# (map number, label used in the file name), kept in the original order
_ws_areas = [
    ("151", "Biel"),
    ("252", "Erlach"),
    ("300", "Aarberg"),
    ("301", "Bueren"),
    ("154", "Grenchen"),
    ("572", "Beatenberg"),
    ("573", "Interlaken"),
    ("226", "Estavayer"),
    ("251", "BernWest"),
    ("145", "LesRangiers"),
]

# map number -> file name, e.g. "151" -> "AtlasWS_151_Biel.csv"
ws_lists = {number: f"AtlasWS_{number}_{label}.csv" for number, label in _ws_areas}


# dict for the Flora Helvetica 5x5 map reference files, downloaded from here: https://www.infoflora.ch/de/daten/artenliste-5x5-km.html
# all observations included in this report were conducted within one of these geographic boundaries

# housekeeping: Atlas5x5_585_220.csv is separated by "," not ";" like the rest of the data sources.
# Re-save it with ";" as Atlas5x5_585_220_1.csv (the name used in fx_lists) so that all of the
# 5x5 files can be read with the same settings. The paths are built from flora_h_55, like the
# other reference files, instead of repeating the folder as a literal.
df = pd.read_csv(f"{flora_h_55}Atlas5x5_585_220.csv", sep=",", encoding="utf-16")
df.to_csv(f"{flora_h_55}Atlas5x5_585_220_1.csv", sep=";", encoding="utf-16", index=False)

# (first three digits, last three digits, file-name suffix) of each 5x5 map key,
# kept in the original order
_fx_squares = [
    ("585", "215", ""),    # Ipsach, Bielersee
    ("585", "220", "_1"),  # Biel Stadt, Suze / Bielersee
    ("580", "220", ""),    # Biel Mett, Suze
    ("580", "215", ""),    # Port, Nidau-Bueren Kanal
    ("625", "165", ""),    # Untersee, Thunersee
    ("625", "170", ""),    # Sundlauenen, Thunersee
    ("550", "185", ""),    # Estavayer, Lac de Neuchatel
    ("575", "210", ""),    # Leuecherz, Bielersee
    ("600", "200", ""),    # Bern west, Aare
    ("575", "245", ""),    # Saint-Ursanne, Aare
]

# map key -> file name, e.g. "585215" -> "Atlas5x5_585_215.csv"
fx_lists = {
    f"{head}{tail}": f"Atlas5x5_{head}_{tail}{suffix}.csv"
    for head, tail, suffix in _fx_squares
}

# convenience method to gather up all the files:

def get_the_data(file_exts, a_dir, methods, this_method="csv", myencoding=None):
    """Read a group of files and return the results keyed like ``file_exts``.

    Parameters
    ----------
    file_exts : dict
        Maps a result key to a file name located in ``a_dir``.
    a_dir : str
        Folder prefix; it is concatenated directly with each file name, so it
        must end with a path separator.
    methods : dict
        Maps a method name to a reader callable that takes the file path.
    this_method : str, default "csv"
        Key of the reader in ``methods`` to use.
    myencoding : str or None, default None
        When None the reader is called with the path only. Otherwise the
        reader is also given ``sep=";"`` and ``encoding=myencoding``.

    Returns
    -------
    dict
        ``{key: reader(path)}`` for every entry of ``file_exts``.

    Raises
    ------
    KeyError
        If ``this_method`` is not a key of ``methods``.
    """
    reader = methods[this_method]
    # identity check: None is a singleton and should not be compared with ==
    read_options = {} if myencoding is None else {"sep": ";", "encoding": myencoding}
    return {
        key: reader(F"{a_dir}{file_name}", **read_options)
        for key, file_name in file_exts.items()
    }

# load each group of files; every result is a dictionary keyed like its file dictionary

# survey data and watch lists: read with the reader's default settings
data_and_keys = get_the_data(d_files, data_2020, my_data_methods)
watch_lists = get_the_data(w_lists, flora_h, my_data_methods)

# atlas species lists: read as utf-16 (get_the_data then also uses ";" as separator)
atlas_encoding = "utf-16"
welt_sut = get_the_data(ws_lists, flora_h_ws, my_data_methods, myencoding=atlas_encoding)
fivex = get_the_data(fx_lists, flora_h_55, my_data_methods, myencoding=atlas_encoding)

Organization of the reference files

In [3]:
# housekeeping

# give every watch list a common "species" column, copied from the column
# that holds the species name in that particular file
species_source = {
    "list_2014": "Latin",
    "under_sampled": "taxon",
    "red_list": "scientific_name",
    "cert_list": "Short Name",
}

for list_name, source_column in species_source.items():
    watch_lists[list_name]["species"] = watch_lists[list_name][source_column]


In [4]:
# make a species slug (genus-species) to link data from across the survey and reference files. 
# This is necessary as some species columns have only "Genus species", some include subspecies, and some include the taxonomic reference.

# function to make the species slugs
def to_species_slug(x):
    """Return the lower-case "genus-species" slug of a species name.

    Only the first two whitespace-separated words are kept, so a subspecies
    or taxonomic reference after the species is dropped:
    "Abies alba Mill." -> "abies-alba". A single word gives just that word.

    Parameters
    ----------
    x : object
        The species name. Anything that is not a string (for example the NaN
        pandas uses for an empty cell) is treated as missing.

    Returns
    -------
    str
        The slug, or "none" when ``x`` is not a string.
    """
    # explicit type check instead of a bare except, which would also hide
    # unrelated errors
    if not isinstance(x, str):
        return "none"
    return "-".join(x.split()[:2]).lower()

# make the species slug column for all reference files

# The column is created and filled in a single pass. No placeholder value has to
# be written first: every row is assigned here, and to_species_slug returns
# "none" for values it cannot convert.
for element in [fivex, welt_sut, watch_lists]:
    for the_data in element:
        element[the_data]["species_slug"] = element[the_data]["species"].map(to_species_slug)

In [13]:
# Add identifying columns to the reference datasets

# add a column to identify the map source for the geographic data:
# "map" holds the dictionary key of the file, "spec_map" the (species_slug, map) pair

for element in [fivex, welt_sut]:
    for the_data in element:
        frame = element[the_data]
        frame["map"] = the_data
        # Columns are read with [...] here. Attribute access (frame.map) returns the
        # DataFrame.map method on pandas >= 2.1 rather than the "map" column, and
        # zip() then fails because a method is not iterable.
        frame["spec_map"] = list(zip(frame["species_slug"], frame["map"]))
        
# add a column to identify watch list:
# list_2014 takes the per-row value of its own "list_2014" column,
# every other list is labelled with its dictionary key

for list_name, frame in watch_lists.items():
    frame["watch_list"] = frame[list_name] if list_name == "list_2014" else list_name

# housekeeping: fill in nan values in the watchlist and certification list reference files,
# then keep only the rows that have a watch_list value (0 marks a value that was missing)

for list_name in ("list_2014", "cert_list"):
    fill_nans = watch_lists[list_name].fillna(0)
    watch_lists[list_name] = fill_nans[fill_nans["watch_list"] != 0]


# replace asterisk with 1
# DataFrame.replace returns a new frame, so the result is stored back in
# watch_lists; without the assignment the replacement is only displayed.

watch_lists["cert_list"] = watch_lists["cert_list"].replace("*", 1)
watch_lists["cert_list"]

Unnamed: 0,No ISFS,Full Name,Short Name,Family,200 Bellis,400 Iris,600 Dryas,Indigenat,F value,R value,N value,species,species_slug,watch_list
0,100,Abies alba Mill.,Abies alba,Pinaceae,1,1,1,I,4,3,3,Abies alba,abies-alba,cert_list
1,300,Acer campestre L.,Acer campestre,Sapindaceae,1,1,1,I,2.5,3,3,Acer campestre,acer-campestre,cert_list
2,600,Acer opalus Mill.,Acer opalus,Sapindaceae,0,0,1,I,2,4,3,Acer opalus,acer-opalus,cert_list
3,700,Acer platanoides L.,Acer platanoides,Sapindaceae,1,1,1,I,3,4,3,Acer platanoides,acer-platanoides,cert_list
4,800,Acer pseudoplatanus L.,Acer pseudoplatanus,Sapindaceae,1,1,1,I,3.5,3,3,Acer pseudoplatanus,acer-pseudoplatanus,cert_list
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,449200,Viola calcarata L.,Viola calcarata,Violaceae,0,0,1,I,3,3,2,Viola calcarata,viola-calcarata,cert_list
596,450300,Viola hirta L.,Viola hirta,Violaceae,0,1,1,I,2.5,4,2,Viola hirta,viola-hirta,cert_list
597,451600,Viola reichenbachiana Boreau,Viola reichenbachiana,Violaceae,1,1,1,I,3,3,3,Viola reichenbachiana,viola-reichenbachiana,cert_list
598,452500,Viola tricolor L.,Viola tricolor,Violaceae,1,1,1,I,3,2,3,Viola tricolor,viola-tricolor,cert_list


In [14]:
# display the 2014 list after the housekeeping above (NaN filled with 0,
# rows without a watch_list value removed)
watch_lists["list_2014"]

Unnamed: 0,Latin,French,Jura,Plateau,Versant Nord des Alpes,Alpes centrales Ouest,Alpes centrales Est,Versant Sud des Alpes,non établi en Suisse,1 Eau1 libres,...,"écologie, biodiversité",économie,"Präventionscharakter (hoch=1, mittel=2, gering=3)","Regionale Wichtigkeit (hoch=1, mittel=2, gering=3)","Lack List / Watch List ""old""",list_2014,Ordonnonce sur la dissémination des organismes (ODE),species,species_slug,watch_list
0,Abutilon theophrasti Medik.,Abutilon de Théophraste,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1,3.0,2,2,-,BL,-,Abutilon theophrasti Medik.,abutilon-theophrasti,BL
1,Ailanthus altissima (Mill.) Swingle,Ailante,2.0,3.0,1.0,2.0,1.0,3.0,0.0,0.0,...,2,2.0,3,3,BL,BL,-,Ailanthus altissima (Mill.) Swingle,ailanthus-altissima,BL
2,Ambrosia artemisiifolia L.,Ambroisie à feuilles d'armoise,3.0,3.0,1.0,2.0,1.0,3.0,0.0,0.0,...,-1,3.0,3,2,BL,BL,ODE,Ambrosia artemisiifolia L.,ambrosia-artemisiifolia,BL
3,Amorpha fruticosa L.,Amorphe buissonnante,1.0,-1.0,0.0,0.0,0.0,2.0,0.0,0.0,...,3,2.0,1,1,WL,BL,-,Amorpha fruticosa L.,amorpha-fruticosa,BL
4,Artemisia verlotiorum Lamotte,Armoise des frères Verlot,2.0,3.0,2.0,2.0,1.0,3.0,0.0,0.0,...,2,2.0,3,3,BL,BL,-,Artemisia verlotiorum Lamotte,artemisia-verlotiorum,BL
5,Asclepias syriaca L.,Asclépiade de Syrie,1.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,...,3,3.0,1,1,WL,BL,-,Asclepias syriaca L.,asclepias-syriaca,BL
6,Buddleja davidii Franch.,Buddléia de David,3.0,3.0,3.0,2.0,2.0,3.0,0.0,0.0,...,3,1.0,3,3,BL,BL,-,Buddleja davidii Franch.,buddleja-davidii,BL
7,Bunias orientalis L.,Bunias d'Orient,3.0,2.0,0.0,3.0,2.0,1.0,0.0,0.0,...,2,3.0,2,2,WL,BL,-,Bunias orientalis L.,bunias-orientalis,BL
8,Cabomba caroliniana A. Gray,"Cabomba, Evantail de Caroline",0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,3,3.0,1,0,-,BL,-,Cabomba caroliniana A. Gray,cabomba-caroliniana,BL
9,Crassula helmsii (Kirk) Cockayne,Crassule de Helm,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,3,3.0,1,0,-,BL,ODE,Crassula helmsii (Kirk) Cockayne,crassula-helmsii,BL


Organization of the survey data

In [6]:
# clean up input errors in the sample data

# work on a copy so that the frame loaded into data_and_keys is not changed by the clean-up
samples = data_and_keys["surveys"].copy(deep=True)

# replacement values for species names that were entered incorrectly:
# (name as entered, name to use instead). The entered names are matched
# exactly, including any trailing space.

_replacements = [
    ("verbanum bonariensis ", "verbena bonariensis"),
    ("medicago varia", "medicago sativa"),
    ("oenothera", "oenothera biennis"),
    ("geranium pratens", "geranium pratense"),
    ("senecio jacobaea", "jacobaea vulgaris"),
    ("oenothera biennis ", "oenothera biennis"),
    ("oenothera biennis agg.", "oenothera biennis"),
    ("solidalgo canadensis", "solidago canadensis"),
    ("verbascum lynchitis", "verbascum lychnitis"),
    ("verbascum negris", "verbascum nigrum"),
    ("securigea varia", "securigera varia"),
    ("melilotus officianalis", "melilotus officinalis"),
    ("knautia maxima", "knautia dipsacifolia"),
    ("hieracium aurantiacum", "pilosella aurantiaca"),
    ("sysimbrium officinale", "sisymbrium officinale"),
    ("geranium robertanium", "geranium robertianum"),
    ("mycelis muralis", "lactuca muralis"),
    ("calamintha-nepeta", "clinopodium nepeta"),
    ("polygonum-persicaria", "persicaria maculosa"),
    ("sorbus-aria", "aria edulis"),
    ("taraxacum", "taraxacum officinale"),
]

replacedict = dict(_replacements)

# function to return the corrected value if the name is a key in the replacement dictionary; otherwise the name is returned unchanged.

def new_func(x, keys):
    """Return the replacement stored for ``x`` in ``keys``, or ``x`` itself.

    Parameters
    ----------
    x : object
        The value to look up.
    keys : dict
        Maps an incorrect value to its replacement.

    Returns
    -------
    object
        ``keys[x]`` when ``x`` is a key of ``keys``; otherwise ``x`` unchanged.
    """
    try:
        return keys[x]
    except (KeyError, TypeError):
        # KeyError: there is no replacement for x.
        # TypeError: x is unhashable and therefore cannot be a key at all.
        return x

# apply the function to the copy of the surveys data set:
# "species2" is the corrected name, "species_slug" the slug made from it

samples["species2"] = samples["sci"].map(lambda name: new_func(name, replacedict))
samples["species_slug"] = samples["species2"].map(to_species_slug)

# store the corrected copy as the surveys dataset

data_and_keys["surveys"] = samples

In [None]:
# format dates in the sample dataset

# function converts DD.MM.YYYY format to YYYY-MM-DD format, ignores if already in YYYY-MM-DD format

def change_string(x):
    """Reverse the "."-separated parts of a date string and join them with "-".

    "01.07.2020" becomes "2020-07-01". A string without "." (for example one
    already in YYYY-MM-DD form) has a single part and is returned unchanged.

    Parameters
    ----------
    x : object
        The date as text. A value that is not a string cannot be split; in
        that case "no luck" is printed and the value is returned as it is.

    Returns
    -------
    object
        The converted string, or ``x`` unchanged when it is not a string.
    """
    # explicit type check instead of a bare except, which would also hide
    # unrelated errors
    if not isinstance(x, str):
        print("no luck")
        return x
    return "-".join(x.split(".")[::-1])

# "new_date" holds the date column converted with change_string

samples["new_date"] = samples["date"].map(change_string)

# function makes a timestamp out of the YYYY-MM-DD string.

def make_timestamp(x):
    """Parse a "YYYY-MM-DD" string into a ``datetime.datetime``.

    Parameters
    ----------
    x : object
        The date as text in "%Y-%m-%d" format.

    Returns
    -------
    datetime.datetime or str
        The parsed timestamp, or the string 'no luck' when ``x`` is not a
        string or does not match the format.
    """
    try:
        return dt.datetime.strptime(x, "%Y-%m-%d")
    except (TypeError, ValueError):
        # TypeError: x is not a string; ValueError: x does not match the format.
        # NOTE(review): the string sentinel is kept for backward compatibility;
        # it leaves a column with mixed datetime and string values.
        return 'no luck'

# parse the converted strings into timestamps ("stamp_date") and
# overwrite the original "date" column with the result

samples["stamp_date"] = samples["new_date"].map(make_timestamp)
samples["date"] = samples["stamp_date"]

In [None]:
# make place name slugs

#  change_place function turns "place names 1" into place-names-1
def change_place(x):
    """Return ``x`` with every single space replaced by "-".

    Each space becomes one hyphen, so consecutive spaces give consecutive
    hyphens. ``x`` must be a string.
    """
    return x.replace(" ", "-")
# "place1" is the slug version of the "place" column
samples["place1"] = samples["place"].map(change_place)


In [None]:
# preview the first rows of samples with the columns added above
# (species2, species_slug, new_date, stamp_date, place1)
samples.head()

In [None]:
# tidy the columns: give the place slug its final name and drop what this analysis does not need

# "place1" (the slug made from "place") becomes "place_slug"

samples.rename(columns={"place1": "place_slug"}, inplace=True)

# helper columns created during the clean-up and the survey columns that are not used;
# done in place so that data_and_keys['surveys'], which is this same frame, changes too

unused_columns = ["species2", "new_date", "stamp_date", "place", "sci", "time", "name"]
samples.drop(columns=unused_columns, inplace=True)

In [None]:
# export samples to a .csv file for later use
# (only the .csv is written by this cell; no .json file is produced)
out_dir = "resources/preprocessed"
# to_csv does not create missing folders, so make sure the target folder exists
os.makedirs(out_dir, exist_ok=True)
samples.to_csv(f"{out_dir}/hd_samples_2020.csv", index=False)