In [1]:
# math and data packages
import pandas as pd
import numpy as np
import math

# charting and graphics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# os and file types
import os
import sys
import datetime as dt
import json
import csv

# images and display
import base64, io, IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display, Math, Latex

# import the master data
spd=pd.read_csv("resources/inprocess/master.csv")
# drop unneeded columns
spd = spd.drop(axis = 1,columns = ["Unnamed: 0","mattenstrasse-area-1"])

# remove duplicates from merging of survey databases
spd = spd[spd["locname"] != "ignore"]

In [2]:
spd.columns

Index(['date', 'species', 'year', 'location', 'biogeo', 'canton', 'river',
       'lot', 'forest', 'project', 'city', 'rch', 'rmp', 'rju', 'fam', 'res',
       'pri', 'inv', 'ode', 'ns', 'ongen', 'ni', 'cdf', 'biel', 'locname'],
      dtype='object')

In [93]:
# functions that will be used in each subgroup, part of analysis and in the whole data

# number of surveys per year per location (needs loc_date column)
    
def support_genus(x):
    try: 
        int_data = x.split("-")
        data = int_data[:1]
    except:
        data = "none"
    return data

def nSamplesPerYearPlace(data):
    return data.groupby(['year', 'place'], as_index=False).loc_date.nunique()

def top_y_species(x,y):
    "returns list of top y number of unique species in passed df x"
    return x['species'].value_counts()[:y].index.tolist()

def top_y_genus(x,y):
    "returns list of top y number of unique genus in passed df x"
    x["genus"] = x.species.map(lambda x: support_genus(x))
    return x['genus'].value_counts()[:y].index.tolist()

def top_y_group(x,y,col):
    "returns list of top y number of unique elements in column col in passed df x"
    return x[col].value_counts()[:y].index.tolist()

def red_list(x,y):
    """returns dictionary of species lists keyed on their redlist codes with a list of the species in passed df x in biogeo region y"""
    """e.g.: {"nt": [carduus-crispus, alcea-rosea], vu = ["cardamine-hirsuta"], LC = ....}"""
    return x

def not_indy(x):
    """returns a dictionary of species keyed on their non-native status in the passed df"""
    return x

def extract_status(x):
    """returns a dictionary of species keyed to the following status: ni, WL, BL, ode, """


In [94]:
# summary statistics on the key categories - to discuss !
spdg = top_y_genus(spd,10)
spdg

[['veronica'],
 ['trifolium'],
 ['ranunculus'],
 ['taraxacum'],
 ['geranium'],
 ['hedera'],
 ['medicago'],
 ['senecio'],
 ['galium'],
 ['plantago']]

In [95]:
# surveys per year per location:


# species per location/date combination (uniquely defines a sample)
def nSpeciesPerSample(data):
    return data.groupby(["date", "place"], as_index=False).species.count()

def aSpeciesPerSample(data, species=[]):
    """The per sample data for a species
    """
    new_data = nSpeciesPerSample(data[data["species"].isin(species)])
    new_data.rename(columns={"species":species[0]}, inplace=True)
    return new_data

# species per location
def speciesPlaceNsamps(data):
    return data.groupby(["species", "place"], as_index=False).loc_date.nunique()

def pivot_this(data, colnames, index, column, value):
    return data[colnames].pivot(index=index, columns=column, values=value).fillna(0)

def speciesPerLocation(data,colnames, index="species",columns="place",values="loc_date"):
    """Uses speciesPlaceNsamps to return a matrix of species and locations with value of
    number of times identified at that location
    """
    return pivot_this(data, colnames, index, columns, values)

def speciesPlace(species_data, soi):
    """Uses speciesPerlocation to report on one species"""
    return species_data.loc[soi, (species_data.loc[soi] > 0)]

In [96]:
# Date summary, year 1 and year 2 basic differences

Year 2 saw around 5x as many plants identified as in Year 1 due to the extra experience of the surveyor in identifying plants. In Year 3, given the extra experience of the surveyor, the expected number of observations will triple to around 18'000 - 22'000 observations through the year.

Not only has experience improved plant recognition significantly, but the survey method and data pipeline are now much smoother. On top of this the processes to organize and analyze the data are now in place so the survey team is much freer to focus on identifying plants and entering them in the infoflora app.

In [97]:
# number of unique species identified and changes in top 10 genus, species, identified, red_list species, not_indigenous species
# number of instances and number of surveys
# numbers per survey
# series of monthly data, then series of quarterly data (feb-mar), (apr-may-jun),  (july-august-september-october)



As we can see, the absolute and relative number of species identified went up as the year went on with a peak in early summer. Since the population diversity of herbaceous non grass flowering plants grows through the year and then peaks before going down again this is exactly to be expected. Notice that population of identified plants changes significantly through time as one might expect - each of the ten most common species and genus changes over time.

The second reason this is true is that the surveyor was not comfortable identifying most plants through leaves/rosettes and so species blooming later in the year would not be identified until then. This contribution to the skew in plants identified will go down in year three now that the surveyor has a strong basis in the local flora and can recognize future flowers based on leaf patterns for a significant number of the local species. If species cannot be identified, then genera can be also be more accurately noted.

An additional bias is that the surveyor collected a number of "empty" surveys of different areas with only some of the city trees in flower (such as corylus avellana) during February. This is not really an accurate represenation of the survey areas, even necessarily through the month of february and early march. As such, this data has been left out aside from a few early survey sites with some populations of flower plants. In other words, lots of empty values are left out. This will be addressed by including at least family level taxonomic identification (e.g. poaceae 1, poaceae 2, salix sp) even in places without the herbaceous vascual

The literature is clear that both late blooming and early blooming species are critically important for overwintering for small animals and especially insects. This will be taken into account to more accurately 

In [98]:
### Comparing lots, describing lots, there are four pulled out of the data here

lot_list = ["schwanen", "port", "taubenloch"]

sch = spd.loc[spd['lot'] == "schwanen"]

por = spd.loc[spd['lot'] == "port"]

tau = spd.loc[spd['lot'] == "taubenloch"]

fla = spd.loc[spd["lot"] == "flaess"]


# Compare numbers in key categories (here, not indigenous, watch list, black list, near threatened in MP)
nisch = sch.loc[sch['ni'] =='ni']
wlsch = sch.loc[sch['inv'] =='WL']
blsch = sch.loc[sch['inv'] =='BL']
odesch = sch.loc[sch['ode'] == "ode"]
ntmpsch =  sch.loc[sch['rmp'] =='NT']
vumpsch =  sch.loc[sch['rmp'] =='VU']

print(F"there is {ntmpsch.species.nunique()} species on the NT list")
print(F"there is {wlsch.species.nunique()} species on the watch list")
print(F"there is {blsch.species.nunique()} species on the black list")
print(F"there is {odesch.species.nunique()} species on the ode list")
print(F"there is {vumpsch.species.nunique()} species on the vulnerable list")
print(F"there are {nisch.species.nunique()} non indigenous non invasive species")


there is 1 species on the NT list
there is 0 species on the watch list
there is 4 species on the black list
there is 0 species on the ode list
there is 0 species on the vulnerable list
there are 3 non indigenous non invasive species


In [99]:
a = top_y_species(sch,15)
c = top_y_species(sch,10)

In [100]:
type(sch)

pandas.core.frame.DataFrame

In [101]:
b = top_y_genus(sch,15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x["genus"] = x.species.map(lambda x: support_genus(x))


In [102]:
spdy = top_y_genus(spd,25)


In [103]:
spd.project.unique()

array(['X', 'renat-suze', nan, 'renat-cygnes', 'pronat', 'renat-neumatt',
       'renat-sund', 'renat-vor'], dtype=object)

In [10]:
# Describe and compare the renaturalization areas

cyg = spd.loc[spd['project'] == "renat-cygnes"]

suz = spd.loc[spd['project'] == "renat-suze"]

pro = spd.loc[spd['project'] == "pronat"]

neu = spd.loc[spd["project"] == "renat-neumatt"]

prosp = pro.species.unique()
prosp

array(['medicago-lupulina', 'erophila-verna', 'senecio-vulgaris',
       'muscari-neglectum', 'taraxacum', 'primula-acaulis',
       'glechoma-hederacea', 'vinca-minor', 'prunus-spinosa',
       'veronica-persica', 'salix-caprea', 'ranunculus-acris',
       'cardamine-flexuosa', 'lamium-purpureum',
       'capsella-bursa-pastoris', 'ranunculus-bulbosus',
       'erodium-cicutarium', 'potentilla-verna', 'veronica-hederifolia',
       'geranium-pyrenaicum', 'valerianella-locusta',
       'arabidopsis-thaliana', 'sinapis-arvensis', 'cerastium-glomeratum',
       'salvia-pratensis', 'sisymbrium-officinale', 'sanguisorba-minor',
       'reseda-lutea', 'galium-mollugo', 'urtica-dioica',
       'erigeron-annuus', 'plantago-lanceolata', 'trifolium-pratense',
       'achillea-millefolium', 'veronica-arvensis', 'myosotis-arvensis',
       'galium-aparine', 'papaver-rhoeas', 'kolkwitzia-amabilis',
       'veronica-serpyllifolia', 'prunus-serotina', 'mahonia-aquifolium',
       'geum-urbanum', 'al

In [32]:
spd['species'].value_counts()[:15].index.tolist()
lots = spd[spd["lot"] != "X"]
lots['species'].value_counts()[:15].index.tolist()

['taraxacum',
 'veronica-persica',
 'erigeron-annuus',
 'tussilago-farfara',
 'lotus-corniculatus',
 'medicago-lupulina',
 'daucus-carota',
 'plantago-lanceolata',
 'veronica-hederifolia',
 'cardamine-hirsuta',
 'bellis-perennis',
 'urtica-dioica',
 'trifolium-repens',
 'achillea-millefolium',
 'galium-mollugo']

In [16]:
spd.river.unique()

array(['madretschkanal', 'schuessinsel', 'suze', nan, 'neuenburgersee',
       'bielersee', 'orvine', 'leugene', 'emme', 'doubs', 'thunersee',
       'X', 'none', 'thun'], dtype=object)

In [27]:
# describe and compare the rivers: madretschkanal, suze, schuessinsel, and all suze length combined vs total

suz = spd.loc[spd['river'] == "suze"]

mad = spd.loc[spd['river'] == "madretschkanal"]

sch = spd.loc[spd['river'] == "schuessinsel"]

suz.head()

Unnamed: 0,date,species,year,location,biogeo,canton,river,lot,forest,project,...,res,pri,inv,ode,ns,ongen,ni,cdf,biel,locname
66,09.06.2020,medicago-lupulina,1,c,mp,be,suze,X,X,X,...,X,X,X,X,X,X,X,cdf,biel,buerenstrasse-suze-1
67,09.06.2020,cichorium-intybus,1,c,mp,be,suze,X,X,X,...,X,X,X,X,X,X,X,cdf,biel,buerenstrasse-suze-1
68,09.06.2020,potentilla-reptans,1,c,mp,be,suze,X,X,X,...,X,X,X,X,X,X,X,X,biel,buerenstrasse-suze-1
69,09.06.2020,papaver-rhoeas,1,c,mp,be,suze,X,X,X,...,X,X,X,X,X,X,X,X,biel,buerenstrasse-suze-1
70,09.06.2020,hypericum-maculatum,1,c,mp,be,suze,X,X,X,...,X,X,X,X,X,X,X,cdf,biel,buerenstrasse-suze-1


In [None]:
# describe pro nature renaturalization areas, compare to lots, to other renats, to everything else "berm"

In [51]:
sch.species.nunique()

98

2