# Preprocessing

In [1]:
# math and data packages
import pandas as pd
import numpy as np
import math

# charting and graphics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# os and file types
import os
import sys
import datetime as dt
import json
import csv

# images and display
import base64, io, IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display, Math, Latex



## Import data

Fix any known formatting problems here

In [2]:
# make slugs for species and places

# function to make the species slugs
def to_slug(x):
    """Build a lower-case, hyphen-joined slug from the first two words of a name.

    Parameters
    ----------
    x : str
        A whitespace-separated name, e.g. ``"Anthyllis vulneraria L."``.

    Returns
    -------
    str
        The first two words joined by ``"-"`` and lower-cased
        (``"anthyllis-vulneraria"``). A one-word name gives that word alone.
        Input that cannot be split and joined as text (for example the float
        NaN that pandas uses for an empty cell, or ``None``) gives ``"none"``.
    """
    try:
        return "-".join(x.split()[:2]).lower()
    except (AttributeError, TypeError):
        # Only the errors produced by non-text input are turned into the
        # "none" placeholder; anything else (e.g. KeyboardInterrupt) propagates
        # instead of being silently swallowed by a bare ``except``.
        return "none"


def asplit(x):
    """Split a hyphenated key into its parts while keeping the key itself.

    Returns a tuple ``(parts, x)`` where ``parts`` is ``x`` with surrounding
    whitespace removed and then split on ``"-"``, and ``x`` is the untouched
    input.
    """
    return x.strip().split("-"), x

def check_length(x):
    """Prefix a ``(parts, key)`` pair with the number of parts.

    Returns ``(len(parts), parts, key)`` where ``parts`` is ``x[0]`` and
    ``key`` is ``x[1]``.
    """
    parts = x[0]
    return len(parts), parts, x[1]

def new_data(data, suffixes, threecharacters, twocharacters):
    """Strip a trailing qualifier from a split name and map the key to the result.

    Parameters
    ----------
    data : sequence
        ``(number_of_parts, parts, key)`` as produced by ``check_length``.
    suffixes : str or tuple of str
        Qualifiers tested first. On a one-part name a match removes the last
        four characters, so these are presumably four characters long
        (TODO confirm against the caller).
    threecharacters : str or tuple of str
        Qualifiers whose match removes the last three characters of a
        one-part name.
    twocharacters : str or tuple of str
        Qualifiers that are only tested against the second part of a
        multi-part name.

    Returns
    -------
    dict
        ``{key: cleaned_name}``.
    """
    n_parts, parts, key = data[0], data[1], data[2]
    first = parts[0]

    if n_parts == 1:
        # The qualifier is glued to the only part: cut it off the STRING.
        # (Slicing ``parts`` itself, as was done before, always produced an
        # empty list instead of the trimmed name.)
        if first.endswith(suffixes):
            new_x = first[:-4]
        elif first.endswith(threecharacters):
            new_x = first[:-3]
        else:
            new_x = first
    else:
        # The qualifier, if any, is the second part: keep only the first part.
        second = parts[1]
        if (
            second in suffixes
            or second.endswith(threecharacters)
            or second.endswith(twocharacters)
        ):
            new_x = first
        else:
            new_x = "-".join(parts)

    return {key: new_x}

def change_a_column_value(df, coltocopy, coltorecieve, this_function=to_slug):
    """Back up a column, then fill a column with a transformed version of it.

    The values of ``coltocopy`` are first saved in a new column named
    ``"<coltocopy>_copy"``. ``this_function`` is then applied to each saved
    value and the results are written to ``coltorecieve``, which may be the
    same column as ``coltocopy``.

    The frame is modified in place and also returned.
    """
    backup_column = f"{coltocopy}_copy"
    df[backup_column] = df[coltocopy]
    transformed = df[backup_column].map(lambda value: this_function(value))
    df[coltorecieve] = transformed
    return df

def columns_to_lower(df, oldnames):
    """Rename the given columns of ``df`` to their lower-case form.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rename; it is modified in place.
    oldnames : iterable of str
        Column names to lower-case. Columns not listed are left unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, so the call can be used in an assignment.
        (``DataFrame.rename(..., inplace=True)`` itself returns ``None``,
        which is what this function used to hand back.)
    """
    df.rename(columns={name: name.lower() for name in oldnames}, inplace=True)
    return df

# Load the three reference lists and the combined survey records.
invasives, priority, redlist, surveys = (
    pd.read_csv(csv_path)
    for csv_path in (
        "resources/inprocess/invasives.csv",
        "resources/inprocess/priority.csv",
        "resources/inprocess/redlist.csv",
        "resources/inprocess/combined_survey_data.csv",
    )
)
invasives.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 27 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Latin                                                 58 non-null     object 
 1   French                                                58 non-null     object 
 2   Jura                                                  41 non-null     float64
 3   Plateau                                               47 non-null     float64
 4   Versant Nord des Alpes                                20 non-null     float64
 5   Alpes centrales Ouest                                 28 non-null     float64
 6   Alpes centrales Est                                   20 non-null     float64
 7   Versant Sud des Alpes                                 43 non-null     float64
 8   non établi en Suisse                                  10 non-n

In [18]:
# Give every reference list a slug "species" column and keep only the columns
# used later. The raw frames are overwritten, so this cell cannot be run twice
# without reloading the CSV files (the "Latin" column is gone after the first run).
invasives = change_a_column_value(invasives, "Latin", "species", to_slug)[
    ["species", "list_2014", "Ordonnonce sur la dissémination des organismes (ODE)"]
]
priority = change_a_column_value(priority, "species", "species", to_slug)[
    ["species", "Jura", "Plateau", "responsabilite", "priorite"]
]
redlist = change_a_column_value(redlist, "species", "species", to_slug)[
    ["FAMILY", "GENUS", "species", "CH", "JU", "MP"]
]

# the slug column of each list, in the order described by list_index
the_lists = [invasives["species"], priority["species"], redlist["species"]]

# every distinct slug that appears on at least one list
uniques = list(set(np.concatenate([series.unique() for series in the_lists])))

# position in the_lists -> name of the list
list_index = dict(enumerate(["invasives", "priority", "redlist"]))

# the binomial: sorted distinct survey slugs, and the same paired with a running index
s_sp = surveys["species"].sort_values().unique()
sp_k = list(zip(np.arange(len(s_sp)), s_sp))

# the genus: the text before the first hyphen of each slug
genus_s = np.unique(np.array([slug.partition("-")[0] for slug in s_sp]))
genus_u = np.unique(np.array([slug.partition("-")[0] for slug in uniques]))

In [112]:
def the_list_value(x, alist, the_xcoord):
    """Return the rows of ``alist`` whose ``the_xcoord`` column equals ``x``.

    The matching rows are returned as a 2-D NumPy array with one row per match.
    The array has zero rows when nothing matches.
    """
    is_match = alist[the_xcoord].isin([x])
    return alist[is_match].to_numpy()
    


def define_list_membership(alist, others, contains=True):
    """Count how often each name occurs in each of several string Series.

    Parameters
    ----------
    alist : iterable of str
        Names to look for.
    others : sequence of pandas.Series
        Series to search; each is identified in the result by its position.
    contains : bool, default True
        When True a value counts if it contains the name as a literal,
        case-insensitive substring. When False only exact equality counts.

    Returns
    -------
    dict
        ``{name: {position: number_of_matching_values}}``.
    """
    if contains:
        def count_hits(series, name):
            return series.str.contains(name, regex=False, case=False).sum()
    else:
        def count_hits(series, name):
            return (series == name).sum()

    return {
        name: {position: count_hits(series, name) for position, series in enumerate(others)}
        for name in alist
    }

# substring matches of each genus found in the surveys (genus_s) ...
species_mem = define_list_membership(alist=genus_s, others=the_lists)
# ... and of each genus found on the reference lists themselves (genus_u)
genus_mem = define_list_membership(alist=genus_u, others=the_lists)

# exact slug matches of every surveyed species
species_match = define_list_membership(alist=s_sp, others=the_lists, contains=False)

In [115]:
# One row per surveyed species, one column per list, values = exact-match counts.
species_to_list = pd.DataFrame.from_dict(species_match, orient="index").rename(columns=list_index)
# species that have more than one red-list row
species_to_list.loc[species_to_list["redlist"] > 1]

Unnamed: 0,invasives,priority,redlist
achillea-millefolium,0,0,2
allium-carinatum,0,1,3
anthriscus-sylvestris,0,1,3
anthyllis-vulneraria,0,2,5
arabis-hirsuta,0,0,2
...,...,...,...
verbascum-thapsus,0,0,3
veronica-hederifolia,0,0,4
veronica-serpyllifolia,0,0,3
vicia-cracca,0,0,4


In [161]:
# Scratch values for looking up a single species; the functions defined in the
# next cell take their own arguments and do not read these names.
my_species = ["anthyllis-vulneraria"]
this_column = "species"
columns = ["ch", "ju", "mp"]
# Replace every missing value in redlist with the placeholder "X".
# This modifies the frame in place.
redlist.fillna("X", inplace=True)

In [180]:

def the_list_value_for_each_species(aseries, search_column="species", my_list="redlist", my_species=["anthyllis-vulneraria"], this_column=["CH"]):
    """Collect the most common rating of every requested species.

    ``get_the_most_common_value`` is called once per entry of ``my_species``
    with the same frame, search column and rating column. The resulting
    ``{"species": ..., "rated": ...}`` records are returned in input order
    under the single key ``my_list``.
    """
    ratings = [
        get_the_most_common_value(
            aseries,
            search_column=search_column,
            my_species=[name],
            this_column=this_column,
        )
        for name in my_species
    ]
    return {my_list: ratings}



def get_the_most_common_value(aseries, search_column="species", my_species=["anthyllis-vulneraria"], this_column=["CH"]):
    """Return the most frequent ``this_column`` value among the matching rows.

    Rows of ``aseries`` are kept when their ``search_column`` value is in
    ``my_species``. The result is ``{"species": my_species[0], "rated": value}``,
    where ``value`` is the first entry of the value counts of ``this_column``
    for those rows, or ``"X"`` when no row matches.
    """
    selected = aseries.loc[aseries[search_column].isin(my_species)][this_column]

    if len(selected.index) == 0:
        rating = "X"
    else:
        # value_counts puts the most frequent value first
        rating = selected.value_counts(this_column).index[0][0]

    return {"species": my_species[0], "rated": rating}


# Most common red-list rating of every surveyed species:
# v from the CH column, w from the JU column, x from the MP column.
v, w, x = (
    the_list_value_for_each_species(
        redlist,
        search_column="species",
        my_species=s_sp,
        this_column=[status_column],
    )
    for status_column in ("CH", "JU", "MP")
)

In [181]:
# Show the MP ratings collected in x as a table with the columns species / rated.
# (The previous expression, `pd.concat[DataFrame...`, had an unclosed bracket and
# an undefined name, so it could not run.)
pd.DataFrame.from_records(x["redlist"])

Unnamed: 0,species,rated
0,abies-alba,LC
1,acer,X
2,acer-campestre,LC
3,acer-negundo,X
4,acer-platanoides,LC
...,...,...
654,vincetoxicum-hirundinaria,NT
655,viola-alba,LC
656,viola-hirta,LC
657,viola-odorata,LC


In [25]:
# priority-list rows for one species
priority.loc[
    priority["species"] == "anthyllis-vulneraria",
    ["species", "Jura", "Plateau", "responsabilite", "priorite"],
]

Unnamed: 0,species,Jura,Plateau,responsabilite,priorite
58,anthyllis-vulneraria,C,A,1,4
59,anthyllis-vulneraria,0,0,3,4


In [23]:
# invasive-list rows for the same species that is looked up in priority.
# The slug was misspelled ("anthyllis-vulneria"), so the lookup could never
# match and an empty result said nothing about the species.
invasives.loc[
    invasives["species"] == "anthyllis-vulneraria",
    ["species", "list_2014", "Ordonnonce sur la dissémination des organismes (ODE)"],
]

Unnamed: 0,species,list_2014,Ordonnonce sur la dissémination des organismes (ODE)


In [10]:
# Same table layout as for the exact matches, but built from the genus-level
# substring counts in species_mem; note that species_to_list is overwritten.
species_to_list = pd.DataFrame.from_dict(species_mem, orient="index").rename(columns=list_index)
# total number of matches per list
species_to_list.sum()

invasives      32
priority      445
redlist      1941
dtype: int64

In [11]:
# Load the 2021 survey export and list its column names.
surveys2021 = pd.read_csv("resources/inprocess/surveys21-new.csv")
surveys2021.columns
# presumably the columns of interest are 'Name', 'description', 'lon', 'lat' - confirm

Index(['Name', 'description', 'timestamp', 'begin', 'end', 'altitudeMode',
       'tessellate', 'extrude', 'visibility', 'drawOrder', 'icon',
       'description_1', 'type', 'htmlpopup', 'layerId', 'reg', 'lon', 'lat'],
      dtype='object')

In [14]:
# Scratch inputs for the array experiment that is commented out below.
alist = list(range(1, 5))
colnames = list(range(4))
# new_array = np.zeros(( len(alist)+1,len(colnames)))
# new_array[0] = np.arange(len(colnames))
# new_array[1:,0] = alist
# new_array

In [None]:
# Invasive species - columns to keep:
#   species
#   list_2014 : BL = black list / WL = watch list (BL is the greater threat)
#   ode       : "ODE" means officially regulated, "-" means it is not
# NOTE(review): no cell shown here creates an "acopy" column
# (change_a_column_value names its backup "<column>_copy") - confirm it exists
# before running.
invcolumns = ["species", "list_2014", "Ordonnonce sur la dissémination des organismes (ODE)", "acopy"]
invasives = invasives.rename(columns={invcolumns[2]: "ode"})

# every value other than the literal "ODE" (missing values included) becomes "X"
invasives["ode"] = invasives["ode"].where(invasives["ode"].eq("ODE"), "X")

invdata = invasives.loc[:, ["species", "list_2014", "ode", "acopy"]].copy()

In [None]:
# Priority species - columns to keep:
#   species
#   jura           : JU if present, 0 otherwise
#   plateau        : MP if present, 0 otherwise
#   priorite       : 4 = high, 1 = low
#   responsabilite : 4 = high, 0 = none
# https://www.infoflora.ch/fr/conservation-des-especes/liste-rouge.html#especes-prioritaires
# NOTE(review): no cell shown here creates an "acopy" column - confirm it exists
# before running.

pricolumns = ["species", "Jura", "Plateau", "responsabilite", "priorite", "acopy"]
new_names = {x: x.lower() for x in pricolumns}
priority.rename(columns=new_names, inplace=True)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
priority.info()
# select with an explicit list of the lower-cased names
pdata = priority[list(new_names.values())].copy()

In [None]:
redlist.info()

# Red list - columns to keep:
#   family, genus, species
#   ch : national status
#   ju : status in Jura
#   mp : status in central plateau
# NOTE(review): no cell shown here creates an "acopy" column - confirm it exists
# before running. pricolumns and new_names from the priority cell are overwritten here.

pricolumns = ["FAMILY", "GENUS", "species", "CH", "JU", "MP", "acopy"]
new_names = dict(zip(pricolumns, map(str.lower, pricolumns)))
redlist.rename(columns=new_names, inplace=True)
reddata = redlist.loc[:, list(new_names.values())].copy()
reddata.head()

In [None]:
# Print the column names, non-null counts and dtypes of the survey records.
surveys.info()

In [None]:
# Display the trimmed priority-species table built above.
pdata

In [None]:
# Rebuild the slug column of each trimmed table from its "acopy" column.
invdata["species"] = invdata["acopy"].map(to_slug)
pdata["species"] = pdata["acopy"].map(to_slug)
reddata["species"] = reddata["acopy"].map(to_slug)

In [None]:
# Distinct values of the species column of the survey records.
s_locs = surveys.species.unique()

In [None]:
# invasive-list rows whose species was recorded in the surveys
invdata.loc[invdata["species"].isin(s_locs)]

In [None]:
# For every surveyed species, count its rows on each reference list and record
# the species under every list it appears on.
#
# The lists are checked independently of each other. With the earlier
# if/elif chain, a species found on the priority list was never checked
# against the invasive list or the red list, so those two were undercounted.
# The row counts are also no longer stored in notebook-level x, y, z, which
# overwrote the x produced by the red-list rating cell.
count = {"priority": 0, "invasive": 0, "redlist": 0}
names = {"priority": [], "invasive": [], "redlist": []}
reference_lists = {"priority": priority, "invasive": invasives, "redlist": redlist}

for plant in surveys.species.unique():
    for list_name, frame in reference_lists.items():
        n_rows = int((frame.species == plant).sum())
        if n_rows > 0:
            count[list_name] += n_rows
            names[list_name].append(plant)
    
    
   

In [None]:
# Maps a recorded name (misspelling, synonym or bare genus) to the name that
# should replace it.
#
# "erigeron canadensis" used to be listed three times; Python keeps only the
# last value of a repeated key, which was the misspelling "conzya canadensis".
# It is now listed once with the correctly spelled "conyza canadensis".
#
# NOTE(review): "mycelis muralis" -> "lactuca muralis" and
# "lactuca muralis" -> "mycelis muralis" point at each other - decide which
# name is the target and drop the other entry.
# NOTE(review): three keys are hyphenated slugs ("calamintha-nepeta",
# "polygonum-persicaria", "sorbus-aria") while all values are space-separated -
# confirm which form the data uses when this mapping is applied.
replacedict = {
    'verbanum bonariensis ':'verbena bonariensis',
    'medicago varia':'medicago sativa',
    "oenothera":"oenothera biennis",
    "geranium pratens":"geranium pratense",
    "oenothera biennis ": "oenothera biennis",
    "oenothera biennis agg.": "oenothera biennis",
    "solidalgo canadensis": "solidago canadensis",
    "verbascum lynchitis":"verbascum lychnitis",
    "verbascum negris":"verbascum nigrum",
    "securigea varia": "securigera varia",
    "melilotus officianalis": "melilotus officinalis",
    "knautia maxima": "knautia dipsacifolia",
    "hieracium aurantiacum":"pilosella aurantiaca",
    "sysimbrium officinale":"sisymbrium officinale",
    "geranium robertanium":"geranium robertianum",
    "mycelis muralis": "lactuca muralis",
    "calamintha-nepeta":"clinopodium nepeta",
    "polygonum-persicaria":"persicaria maculosa",
    "sorbus-aria":"aria edulis",
    "taraxacum": "taraxacum officinale",
    "jacobaea vulgaris" : "senecio jacobaea",
    "erigeron canadensis" : "conyza canadensis",
    "rorippa islandica" : "rorippa palustris",
    "malus sylvestris" : "malus domestica",
    "hylotelephium telephium" : "sedum telephium",
    "lactuca muralis": "mycelis muralis",
    "chaenorhinum minus": "chaenorrhinum minus",
    "borkhausenia intermedia": "scandosorbus intermedia",
    "centaurea nigra" : "centaurea jacea"
}

In [None]:
# Display the per-list totals accumulated in the count dictionary.
count

## Determine whether or not a species was detected within a geographic limit

The territory is divided into different segments. The Flora Helvetica and WS maps have different geographic bounds. This section determines whether or not a species was detected within each of those boundaries.

### Key the species to the different maps it was identified in


## Format date column to ISO standard