## Vermont lake monitoring Program overview

In [1]:
%load_ext watermark

import numpy as np
import pandas as pd

import altair as alt
from IPython.display import Markdown as md


# Switch off Altair's max-rows check so charts can be built straight from the full dataset.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
%watermark -p geopandas,pandas,numpy,plotly,pyproj -a ZORIN_ANŽE -d

Author: ZORIN_ANŽE

geopandas: 0.10.2
pandas   : 1.3.5
numpy    : 1.21.5
plotly   : 5.5.0
pyproj   : 3.3.0



We first want to explore the statistics of the measurements. Since this is a lay monitoring program that depends on the goodwill of its participants, we expect a wide range of measurements depending on lake, year, season, etc.

In [3]:
# loading and transformation of chemical data dataset.

def get_data(path=r'assets/ChemDataForJeffOlson.csv'):
    """Load the chemistry measurements and derive calendar helper columns.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. It must contain a ``VisitDate`` column, which is
        parsed as a date. Defaults to ``'assets/ChemDataForJeffOlson.csv'``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus three added columns:

        * ``year``   - calendar year of ``VisitDate``.
        * ``season`` - calendar quarter labelled as a season: Jan-Mar
          'winter', Apr-Jun 'spring', Jul-Sep 'summer', Oct-Dec 'fall'.
        * ``date``   - ``VisitDate`` truncated to the first day of its month,
          as a ``'YYYY-MM-01'`` string.

        Rows with a missing ``VisitDate`` get missing values in the added
        columns instead of raising.
    """
    chem_data = pd.read_csv(path, parse_dates=['VisitDate'])
    visit_date = chem_data['VisitDate']

    # Quarter index (0-3) -> season label.
    seasons = {
        0: 'winter',
        1: 'spring',
        2: 'summer',
        3: 'fall',
    }

    # 'year' and 'season' make grouping and counting of measurements easier.
    chem_data['year'] = visit_date.dt.year
    # .map leaves NaN where the quarter is NaN (missing date); a per-row dict
    # lookup would raise KeyError there.
    chem_data['season'] = ((visit_date.dt.month - 1) // 3).map(seasons)
    # Vectorised strftime yields NaN for NaT; calling strftime on NaT raises.
    chem_data['date'] = visit_date.dt.strftime("%Y-%m-01")

    return chem_data

In [4]:
# Read the CSV once; the cells below all work from this frame.
chem_data = get_data()

In [5]:
# Headline numbers for the summary sentence rendered by this cell.

# Average number of distinct lakes sampled per year, truncated to a whole number.
mean_lakes = int(chem_data.groupby('year')['LakeID'].nunique().mean())

# nunique() ignores missing IDs, so the totals are counted the same way as the
# per-year lake counts above (a NaN is not a lake or a characteristic).
num_charascteristics = chem_data['CharacteristicID'].nunique()
num_lakes = chem_data['LakeID'].nunique()

# Adjacent string literals are joined by the parser; this keeps source
# indentation out of the text, unlike a backslash continuation inside a literal.
md((
    "There are {} lakes included in the Lay monitoring program. "
    "The number of lakes actually measured in any given year is considerably smaller. "
    "On average {} are tested each year, measuring {} different characteristics."
).format(num_lakes, mean_lakes, num_charascteristics))

There are 445 lakes included in the Lay monitoring program.    Number of lakes actualy measured in any given year is quite smaller.    On average 121 are tested each year, measuring 81 different characteristics.

The plot below shows the number of different lakes included in the program for each year.

In [None]:
# Bars: one per year, height = number of distinct lakes with records that year.
# The y axis is hidden because each bar carries its own numeric label.
bar = (
    alt.Chart(chem_data)
    .mark_bar(strokeWidth=0.5)
    .encode(
        x=alt.X('year:O'),
        y=alt.Y('distinct(LakeID)', axis=None),
    )
    .properties(width=600, height=200)
)

# Labels: reuse the bar chart's encodings and print the distinct-lake count.
txt = bar.mark_text(align='center', baseline='top', fontSize=8, dy=-10).encode(
    text='distinct(LakeID)'
)

# Draw bars and labels together, without the frame around the view.
alt.layer(bar + txt).configure_view(strokeWidth=0)

In [None]:
# The majority of records are measured in spring and summer.
# Seasons are given an explicit calendar order; a nominal axis without `sort`
# is ordered alphabetically (fall, spring, summer, winter).
alt.Chart(chem_data).mark_bar().encode(
    x=alt.X('season:N', sort=['winter', 'spring', 'summer', 'fall']),
    y=alt.Y('count(Result):Q')
).properties(title='Seasonal distribution of measurements',
            width=300,
            height=200)

Let us count the number of measurements for any given parameter across all years.

In [None]:
# Number of non-null results recorded for each characteristic, over all years.
counts_all = chem_data.groupby('CharacteristicID')['Result'].count()

# Show the counts in ascending order so the rarely measured ones come first.
counts_all.sort_values()

Which CharacteristicIDs have a low number of measurements? Let's look at those with 100 or fewer measurements.

# IDs of the characteristics that have 100 results or fewer.
counts_all.loc[counts_all <= 100].index

It looks like those are the heavy elements, which are not easily tested by lay people, along with Ecoli, NPOC and SechiViewTube. Let's throw out those measurements.

In [None]:
# Set the threshold for the number of measurements: get_frequent treats a
# characteristic with this many results or fewer as too rare to keep.
TRESHOLD = 100

def get_frequent(df, TRESHOLD):
    """Return the rows of ``df`` whose characteristic is measured often enough.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``CharacteristicID`` and ``Result``.
    TRESHOLD : int
        A characteristic is dropped when its number of non-null ``Result``
        values is less than or equal to this value.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row of a rare characteristic removed. All other
        rows and columns are returned unchanged, the original index labels are
        kept, and ``df`` itself is not modified.
    """
    counts = df.groupby('CharacteristicID')['Result'].count()
    rare_ids = counts[counts <= TRESHOLD].index
    # Test only the CharacteristicID column and drop whole rows. Applying
    # isin/where to the entire frame would compare every cell of every column
    # and merely blank the matching cells, leaving the rows in place.
    return df[~df['CharacteristicID'].isin(rare_ids)]
              
# NOTE(review): the threshold filter is switched off here, so `data` is the
# full, unfiltered dataset; restore the call below to apply the filter.
#data = get_frequent(chem_data, TRESHOLD)
data=chem_data

Next we are interested in the distribution of measurements across lakes: which lakes are the most intensely monitored and which are the least monitored? Let us look at the count of records for all lakes in each year.

In [None]:
# Lower panel: number of records per year, drawn as a thin black line.
line = (
    alt.Chart(data)
    .mark_line(strokeWidth=0.5, color='black')
    .encode(
        x=alt.X('year:O', axis=alt.Axis(title=None)),
        y=alt.Y('count()', axis=alt.Axis(title=None, grid=False)),
    )
    .properties(width=600, height=100)
)

# Upper panel: the same yearly record counts as bars, coloured by lake and
# with the per-lake segments ordered by their record count.
bar = (
    alt.Chart(data)
    .mark_bar(strokeWidth=0.5)
    .encode(
        x=alt.X('year:O', axis=None),
        y=alt.Y('count()', axis=alt.Axis(title=None, grid=False)),
        color=alt.Color('LakeID:N', legend=None),
        tooltip=alt.Tooltip(['LakeID', 'count()']),
        order=alt.Order('count()', sort='ascending'),
    )
    .interactive()
    .properties(width=600, height=200)
)

# Text marks derived from the line chart's encodings; the label is the number
# of distinct lakes in each year.
text = line.mark_text(align='center', baseline='top', fontSize=8, dy=-10).encode(
    text='distinct(LakeID)'
)

bars = alt.layer(bar, text)

# Stack the two panels vertically and remove the frame around each view.
alt.vconcat(bars, line).configure_view(strokeWidth=0)

In [None]:
# Records per lake and year, sorted so that within each year the lakes with
# the most records come first. The 'CharacteristicID' column holds that count.
df = (
    chem_data
    .groupby(['LakeID', 'year'])['CharacteristicID']
    .count()
    .reset_index()
    .sort_values(by=['year', 'CharacteristicID'], ascending=False)
    .groupby('year')
)

# First five rows of each year = most measured lakes; last five = least measured.
top = df.head(5)
bottom = df.tail(5)

In [None]:
# Five most measured lakes of each year, one small panel per year.
# Each panel gets its own x and y scale, since lakes and counts differ by year.
(
    alt.Chart(top)
    .mark_bar()
    .encode(
        x=alt.X('LakeID:N', sort='-y', title=None),
        y=alt.Y('CharacteristicID:Q', title=None, axis=alt.Axis(grid=False)),
    )
    .properties(width=100, height=100)
    .facet('year:O', columns=5)
    .resolve_scale(x='independent', y='independent')
    .configure_view(strokeWidth=0)
)


In [None]:
# Five least measured lakes of each year, one small panel per year.
# Each panel gets its own x and y scale, since lakes and counts differ by year.
(
    alt.Chart(bottom)
    .mark_bar()
    .encode(
        x=alt.X('LakeID:N', sort='-y', title=None),
        y=alt.Y('CharacteristicID:Q', title=None, axis=alt.Axis(grid=False)),
    )
    .properties(width=100, height=100)
    .facet('year:O', columns=5)
    .resolve_scale(x='independent', y='independent')
    .configure_view(strokeWidth=0)
)


In [None]:
# Which parameters are measured most? For every year keep the three
# characteristics with the highest number of non-null results.
most = (
    chem_data
    .groupby(['year', 'CharacteristicID'])['Result']
    .count()
    .reset_index()
    .sort_values(by=['year', 'Result'], ascending=False)
    .groupby('year')
    .head(3)
)
most

In [None]:
# Result counts of the three most measured characteristics, one panel per year,
# each panel with its own x and y scale.
(
    alt.Chart(most)
    .mark_bar()
    .encode(x='CharacteristicID', y='Result:Q')
    .properties(width=100, height=100)
    .facet('year:O', columns=5)
    .resolve_scale(x='independent', y='independent')
    .configure_view(strokeWidth=0)
)