# Preprocessing

In [1]:
# math and data packages
import pandas as pd
import numpy as np
import math

# charting and graphics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# os and file types
import os
import sys
import datetime as dt
import json
import csv

# images and display
import base64, io, IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display, Math, Latex



## Import data

Fix any known formatting problems here

In [2]:
# make slugs for species and places

# function to make the species slugs
def to_slug(x):
    """Build a lowercase slug from the first two words of a name.

    The value is split on whitespace, the first two tokens are joined with
    a hyphen and the result is lower-cased, e.g.
    ``"Abies alba Mill."`` -> ``"abies-alba"``.

    Parameters
    ----------
    x : object
        The name to convert. Only ``str`` values are converted.

    Returns
    -------
    str
        The slug, or the sentinel ``"none"`` when ``x`` is not a string
        (for example ``None`` or a float NaN from a missing cell).
    """
    # An explicit type guard replaces the former bare ``except:``, which also
    # swallowed KeyboardInterrupt/SystemExit and hid unrelated errors.
    if not isinstance(x, str):
        return "none"
    return "-".join(x.split()[:2]).lower()

# Read the four in-process CSV files; the order of the paths matches the
# order of the names on the left-hand side.
invasives, priority, redlist, surveys = map(
    pd.read_csv,
    (
        "resources/inprocess/invasives.csv",
        "resources/inprocess/priority.csv",
        "resources/inprocess/redlist.csv",
        "resources/inprocess/combined_survey_data.csv",
    ),
)
invasives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 27 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Latin                                                 58 non-null     object 
 1   French                                                58 non-null     object 
 2   Jura                                                  41 non-null     float64
 3   Plateau                                               47 non-null     float64
 4   Versant Nord des Alpes                                20 non-null     float64
 5   Alpes centrales Ouest                                 28 non-null     float64
 6   Alpes centrales Est                                   20 non-null     float64
 7   Versant Sud des Alpes                                 43 non-null     float64
 8   non établi en Suisse                                  10 non-n

In [3]:
def make_copy(df, col, to_slug):
    """Keep the original name column and derive a slug column from it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to annotate. It is modified in place.
    col : str
        Name of the column holding the original names.
    to_slug : callable
        Function applied to every value of the copied column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``acopy`` holding the values of ``col``
        and ``species`` holding ``to_slug`` applied to each of them.
    """
    # Copy first: when ``col`` is "species" the original text would otherwise
    # be lost once the slug overwrites that column.
    df["acopy"] = df[col]
    df["species"] = df["acopy"].map(to_slug)
    return df
# Add `acopy` (the original name text) and `species` (its slug) to each
# reference table. The invasives names are read from the "Latin" column,
# the other two from "species".
invasives = make_copy(invasives, "Latin", to_slug)
priority = make_copy(priority, "species", to_slug)
redlist = make_copy(redlist, "species", to_slug)

In [4]:
# Columns kept from the invasives table:
#   species   - species slug
#   list_2014 - BL = black list, WL = watch list (BL is the greater threat)
#   ode       - "ODE" means officially regulated; the source uses "-" otherwise
#   acopy     - the original name text
invcolumns = [
    "species",
    "list_2014",
    "Ordonnonce sur la dissémination des organismes (ODE)",
    "acopy",
]
invasives = invasives.rename(columns={invcolumns[2]: "ode"})

# Every value that is not exactly "ODE" is recoded to "X".
invasives["ode"] = invasives["ode"].where(invasives["ode"] == "ODE", "X")

invdata = invasives.loc[:, ["species", "list_2014", "ode", "acopy"]].copy()

In [5]:
# Columns kept from the priority table: species, jura, plateau,
# priorite, responsabilite and acopy.
#   jura / plateau  - regional codes; 0 where the species is not listed
#                     (the table shown further down also contains letter codes)
#   priorite        - original note: 4 = high, 1 = low
#   responsabilite  - original note: 4 = high, 0 = none
# NOTE(review): confirm the direction of the priorite scale against the
# Info Flora page below.
# https://www.infoflora.ch/fr/conservation-des-especes/liste-rouge.html#especes-prioritaires

pricolumns = ["species", "Jura", "Plateau", "responsabilite", "priorite", "acopy"]
new_names = {x: x.lower() for x in pricolumns}

# Rebind instead of mutating in place so the cell has no hidden side effects.
priority = priority.rename(columns=new_names)

# DataFrame.info() prints its report and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
priority.info()

# Index with a real list rather than a dict_values view.
pdata = priority[list(new_names.values())].copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 842 entries, 0 to 841
Data columns (total 15 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0    ID taxon                                842 non-null    int64 
 1   species                                  842 non-null    object
 2   nom allemand                             842 non-null    object
 3   nom francais                             842 non-null    object
 4   nom italien                              842 non-null    object
 5   priorite                                 842 non-null    int64 
 6   menace                                   842 non-null    object
 7   responsabilite                           842 non-null    int64 
 8   necessite de prendre des mesures         842 non-null    int64 
 9   Necessite de surveiller les populations  842 non-null    int64 
 10  connaissances suffisantes?               842 non-null    int64

In [6]:
redlist.info()

# Columns kept from the red list: family, genus, species,
# CH (national status), JU (status in Jura), MP (status in central plateau)
# and acopy. The kept columns are renamed to lower case.

pricolumns = ["FAMILY", "GENUS", "species", "CH", "JU", "MP", "acopy"]
new_names = dict(zip(pricolumns, map(str.lower, pricolumns)))
redlist.rename(columns=new_names, inplace=True)
reddata = redlist.loc[:, list(new_names.values())].copy()
reddata.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2915 entries, 0 to 2914
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   FAMILY           2915 non-null   object
 1   GENUS            2915 non-null   object
 2   species          2915 non-null   object
 3   Deutscher Name   2915 non-null   object
 4   Nom en francais  2915 non-null   object
 5   CH               2915 non-null   object
 6   crit_CH          1120 non-null   object
 7   JU               2856 non-null   object
 8   crit_JU          917 non-null    object
 9   MP               2700 non-null   object
 10  crit_MP          1052 non-null   object
 11  acopy            2915 non-null   object
dtypes: object(12)
memory usage: 273.4+ KB


Unnamed: 0,family,genus,species,ch,ju,mp,acopy
0,Pinaceae,Abies,abies-alba,LC,LC,LC,Abies alba
1,Sapindaceae,Acer,acer-campestre,LC,LC,LC,Acer campestre
2,Sapindaceae,Acer,acer-opalus,LC,LC,NT,Acer opalus
3,Sapindaceae,Acer,acer-platanoides,LC,LC,LC,Acer platanoides
4,Sapindaceae,Acer,acer-pseudoplatanus,LC,LC,LC,Acer pseudoplatanus


In [7]:
# Column overview of the combined survey records.
surveys.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7943 entries, 0 to 7942
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  7943 non-null   int64 
 1   date        7943 non-null   object
 2   place       7943 non-null   object
 3   species     7943 non-null   object
 4   year        7943 non-null   int64 
 5   loc_date    7943 non-null   object
dtypes: int64(2), object(4)
memory usage: 372.5+ KB


In [8]:
# Display the reduced priority table.
pdata

Unnamed: 0,species,jura,plateau,responsabilite,priorite,acopy
0,achillea-atrata,0,0,3,4,Achillea atrata L.
1,achillea-clavenae,0,0,2,3,Achillea clavenae L.
2,achillea-collina,0,0,1,4,Achillea collina Rchb.
3,achillea-erba-rotta,0,0,3,4,Achillea erba-rotta subsp. moschata (Wulfen) V...
4,achillea-macrophylla,0,0,3,4,Achillea macrophylla L.
...,...,...,...,...,...,...
837,woodsia-ilvensis,0,0,3,2,Woodsia ilvensis (L.) R. Br.
838,woodsia-pulchella,0,0,2,2,Woodsia pulchella Bertol.
839,xanthium-strumarium,A,A,1,3,Xanthium strumarium L.
840,xeranthemum-inapertum,0,C,2,1,Xeranthemum inapertum (L.) Mill.


In [9]:
# Rebuild the species slug of each reduced table from its `acopy` column.
invdata["species"] = invdata["acopy"].map(to_slug)
pdata["species"] = pdata["acopy"].map(to_slug)
reddata["species"] = reddata["acopy"].map(to_slug)

In [10]:
# Distinct species values recorded in the surveys.
s_locs = surveys.species.unique()

In [11]:
# Rows of the invasives subset whose species slug was recorded in the surveys.
invdata[invdata.species.isin(s_locs)]

Unnamed: 0,species,list_2014,ode,acopy
4,artemisia-verlotiorum,BL,X,Artemisia verlotiorum Lamotte
6,buddleja-davidii,BL,X,Buddleja davidii Franch.
14,erigeron-annuus,BL,X,Erigeron annuus (L.) Desf. s.l.
15,heracleum-mantegazzianum,BL,ODE,Heracleum mantegazzianum Sommier & Levier
17,impatiens-glandulifera,BL,ODE,Impatiens glandulifera Royle
25,prunus-laurocerasus,BL,X,Prunus laurocerasus L.
26,prunus-serotina,BL,X,Prunus serotina Ehrh.
28,reynoutria-japonica,BL,ODE,Reynoutria japonica Houtt.
32,robinia-pseudoacacia,BL,X,Robinia pseudoacacia L.
33,rubus-armeniacus,BL,X,Rubus armeniacus Focke


In [12]:
# For every distinct species slug in the surveys, find the reference list it
# matches. `count` accumulates the number of matching reference ROWS (a slug
# may match several rows), `names` collects the matching slugs.
count = {"priority": 0, "invasive": 0, "redlist": 0}
names = {"priority": [], "invasive": [], "redlist": []}

# Count rows per slug once per reference table instead of re-filtering all
# three frames for every surveyed species. Insertion order fixes the
# precedence used below: priority, then invasive, then redlist.
rows_per_slug = {
    "priority": priority.species.value_counts().to_dict(),
    "invasive": invasives.species.value_counts().to_dict(),
    "redlist": redlist.species.value_counts().to_dict(),
}

for plant in surveys.species.unique():
    # NOTE(review): first match wins, so a species present in several lists
    # is credited only to the first one. Confirm that this is intended.
    for list_name, slug_counts in rows_per_slug.items():
        n_rows = int(slug_counts.get(plant, 0))
        if n_rows > 0:
            count[list_name] += n_rows
            names[list_name].append(plant)
            break
    
    
   

In [13]:
# Mapping from a species name as entered (misspelling or synonym) to the name
# that should replace it. Keys are kept verbatim, including trailing spaces
# and hyphens. How the mapping is applied is not shown in this section.
# NOTE(review): "mycelis muralis" and "lactuca muralis" map to each other;
# decide which name the reference lists use and drop the opposite entry.
replacedict = {
    'verbanum bonariensis ':'verbena bonariensis',
    'medicago varia':'medicago sativa',
    "oenothera":"oenothera biennis",
    "geranium pratens":"geranium pratense",
    "oenothera biennis ": "oenothera biennis",
    "oenothera biennis agg.": "oenothera biennis",
    "solidalgo canadensis": "solidago canadensis",
    "verbascum lynchitis":"verbascum lychnitis",
    "verbascum negris":"verbascum nigrum",
    "securigea varia": "securigera varia",
    "melilotus officianalis": "melilotus officinalis",
    "knautia maxima": "knautia dipsacifolia",
    "hieracium aurantiacum":"pilosella aurantiaca",
    "sysimbrium officinale":"sisymbrium officinale",
    "geranium robertanium":"geranium robertianum",
    "mycelis muralis": "lactuca muralis",
    "calamintha-nepeta":"clinopodium nepeta",
    "polygonum-persicaria":"persicaria maculosa",
    "sorbus-aria":"aria edulis",
    "taraxacum": "taraxacum officinale",
    "jacobaea vulgaris" : "senecio jacobaea",
    # This key used to be listed three times. The two later duplicates carried
    # the misspelling "conzya canadensis" and, being last, overrode this value.
    "erigeron canadensis" : "conyza canadensis",
    "rorippa islandica" : "rorippa palustris",
    "malus sylvestris" : "malus domestica",
    "hylotelephium telephium" : "sedum telephium",
    "lactuca muralis": "mycelis muralis",
    "chaenorhinum minus": "chaenorrhinum minus",
    "borkhausenia intermedia": "scandosorbus intermedia",
    "centaurea nigra" : "centaurea jacea"
}

In [14]:
# Show the totals stored in `count` for each reference list.
count

{'priority': 23, 'invasive': 15, 'redlist': 499}

## Determine whether or not a species was detected within a geographic limit

The territory is divided into different segments. Flora Helvetica and WS maps have different geographic bounds. Here, the presence or absence of a species within each of these boundaries is determined.

### Key the species to the different maps it was identified in


## Format date column to ISO standard