In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import altair as alt
import ipywidgets as widgets
from IPython.display import display
import warnings
# Suppress every warning raised from this point on.
warnings.filterwarnings(action='ignore')

# Show all columns when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Select Altair's 'default' renderer so charts render correctly.
alt.renderers.enable('default')

# Route chart data through intermediate JSON files to speed things up.
alt.data_transformers.enable('json')

In [3]:
# Read the merged chemistry CSV, using its first column as the index.
chem_data = pd.read_csv('assets/chem_data_merged.csv', index_col=0)

# Report (rows, columns), preview the first rows, then print the shape of the
# array of distinct 'Town' values (i.e. how many different towns appear).
print(chem_data.shape)
display(chem_data.head(n=5))
print(pd.unique(chem_data['Town']).shape)


(284912, 26)


Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode,year,population,CharacteristicName,UnitCode,SampleFraction,NormResult
0,ABENAKI,1,Pelagic,43.8303,-72.2361,THETFORD,SpringTP,1988-04-22,1,,Secchi,,Reg,SECCHI,,2.8,Y,,B,,1988,2377.0,Secchi transparency,m,,0.176694
1,ABENAKI,1,Pelagic,43.8303,-72.2361,THETFORD,SpringTP,1988-04-22,1,,Kemmerer,1.4,Reg,TP,,9.0,Y,,,,1988,2377.0,Total Phosphorus,ug/l,Total,0.004051
2,ABENAKI,1,Pelagic,43.8303,-72.2361,THETFORD,SpringTP,1989-05-01,1,,Secchi,,Reg,SECCHI,,2.3,Y,,B,,1989,2417.0,Secchi transparency,m,,0.145028
3,ABENAKI,1,Pelagic,43.8303,-72.2361,THETFORD,SpringTP,1989-05-01,1,,Kemmerer,1.0,Reg,TP,,11.0,Y,,,,1989,2417.0,Total Phosphorus,ug/l,Total,0.005208
4,ABENAKI,1,Pelagic,43.8303,-72.2361,THETFORD,SpringTP,1990-04-20,1,,Secchi,,Reg,SECCHI,,3.1,Y,,B,,1990,2438.0,Secchi transparency,m,,0.195693


(177,)


In [23]:
# Count missing values per column among the distinct (year, population) pairs.
chem_data.loc[:, ['year', 'population']].drop_duplicates().isna().sum()

year           0
population    24
dtype: int64

In [63]:

def show_measurements(df):
    """Build an interactive Altair dashboard of measurements and town population.

    The input frame is NOT modified: all cleaning happens on a derived copy, so
    the caller's DataFrame keeps its original rows and dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'VisitDate', 'population', 'Town',
        'CharacteristicID', 'CharacteristicName', 'UnitCode', 'Result' and
        'NormResult'. 'VisitDate' must be parseable by ``pd.to_datetime``;
        'year' is (re)derived from it.

    Returns
    -------
    A compound Altair chart laid out as
    ``(measurements over time & population over time) | measurement counts``.
    A dropdown bound to 'Town' filters all three panels, and selecting bars in
    the counts panel filters the measurement lines by 'CharacteristicID'.

    Notes
    -----
    Rows whose 'population' is missing or equal to 0 are excluded before any
    aggregation.
    """
    # Derive a cleaned working frame instead of editing `df` in place; the
    # previous in-place conversion/dropna silently altered the caller's data.
    data = df.assign(VisitDate=pd.to_datetime(df['VisitDate']))
    data['year'] = data['VisitDate'].dt.year
    data = data.dropna(subset=['population'])
    data = data[data['population'] != 0]

    # Mean Result / NormResult per town, year and characteristic.
    measure_keys = ['Town', 'year', 'CharacteristicID', 'CharacteristicName', 'UnitCode']
    df_for_measurements = (
        data[measure_keys + ['Result', 'NormResult']]
        .groupby(measure_keys)
        .mean()
        .reset_index()
    )

    # Number of non-null Result values per town and characteristic.
    count_keys = ['Town', 'CharacteristicID', 'CharacteristicName', 'UnitCode']
    df_for_count = (
        data[count_keys + ['Result']]
        .groupby(count_keys)
        .count()
        .reset_index()
    )

    # One row per distinct (town, year, population) combination.
    df_for_population = data[['Town', 'year', 'population']].drop_duplicates()

    # Dropdown options: towns ordered by their largest population record first;
    # the first option is the initially selected town.
    options = df_for_population.sort_values('population', ascending=False)['Town'].unique()
    town_dropdown = alt.binding_select(options=options, name='Towns')
    selection_d = alt.selection_single(fields=['Town'], init={'Town': options[0]}, bind=town_dropdown)
    # NOTE(review): selection_d is registered on three sub-charts below; some
    # Altair/Vega-Lite versions may object to the same selection being added to
    # several views -- confirm the chart renders on the version in use.

    # Multi-selection on characteristic; `clear=False` disables the clear gesture.
    selection = alt.selection_multi(fields=['CharacteristicID'], clear=False)
    opacity_selection = alt.condition(selection, alt.value(1), alt.value(.2))

    # Normalised measurement value per year, one line per characteristic.
    # The chart averages the per-group means computed above.
    total_measurements = alt.Chart(df_for_measurements).mark_line().encode(
        x=alt.X('year:O'),
        y=alt.Y('mean(NormResult):Q'),
        color=alt.Color('CharacteristicID:N', legend=None)
    )
    # Same x/y/color encodings as the line layer (inherited from it), drawn as
    # points with a tooltip and pan/zoom enabled.
    point_measure = total_measurements.mark_circle().encode(
        tooltip=[alt.Tooltip('CharacteristicID'), alt.Tooltip('mean(NormResult)')]
    ).interactive()

    measures = (total_measurements + point_measure).add_selection(
        selection_d
    ).transform_filter(
        selection
    ).transform_filter(selection_d).properties(
        width=700
    )

    # Horizontal bars: measurement count per characteristic, sorted by bar length.
    total_counts = alt.Chart(df_for_count).mark_bar().encode(
        y=alt.Y('CharacteristicID:N', sort='-x'),
        x=alt.X('sum(Result):Q'),
        tooltip=[alt.Tooltip('CharacteristicName'), alt.Tooltip('UnitCode'), alt.Tooltip('sum(Result)')],
        opacity=opacity_selection,
        color=alt.Color('CharacteristicID:N')
    )

    # Numeric label next to each bar, dimmed together with unselected bars.
    text = total_counts.mark_text(dx=20).encode(
        text='sum(Result):Q',
        opacity=opacity_selection
    )

    # Population per year for the selected town, as a line plus points.
    pop = alt.Chart(df_for_population).mark_line().encode(
        x=alt.X('year:O'),
        y=alt.Y('population:Q')
    ).interactive()

    pop_point = alt.Chart(df_for_population).mark_circle().encode(
        x=alt.X('year:O'),
        y=alt.Y('population:Q'),
        tooltip=alt.Tooltip('population')
    ).interactive()
    pop_charts = (pop + pop_point).add_selection(
        selection_d
    ).transform_filter(
        selection_d
    ).properties(
        width=700
    )
    counts_and_text = (total_counts + text).add_selection(
        selection_d, 
        selection
    ).transform_filter(
        selection_d
    ).properties(
        width=200
    )

    # Left column: measurements above population; right column: counts.
    return ((measures & pop_charts) | counts_and_text).properties(
        title=alt.TitleParams(text='Lake Health Measures and Population',
                            subtitle='Is there more testing and are the values higher in larger population areas')
    )
# Build the dashboard from chem_data; as the cell's last expression, the
# returned chart is what the notebook displays.
show_measurements(df=chem_data)