## Chemical Measurements Data and Population
This notebook contains two visualizations: (1) chemical measurements and population by town, and (2) the lake health metric plotted against town population.
### Section 1: Data

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import IPython
from IPython.display import display
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/Altair deprecation
# warnings — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Select Altair's 'default' renderer for chart output.
alt.renderers.enable('default')

# Switch Altair's data transformer to 'json'.
# NOTE(review): per the Altair docs this writes chart data to a separate JSON
# file instead of embedding it in the chart spec; presumably chosen because the
# frames charted below are large — confirm.
alt.data_transformers.enable('json')


# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)

In [2]:
# Load both inputs; the first CSV column holds the saved row index.
chem_data = pd.read_csv('assets/chem_data_merged.csv', index_col=0)
health_df = pd.read_csv('assets/health_metric.csv', index_col=0)

# For each frame, report its dimensions and preview five random rows.
for heading, frame in (
    ('The size of the chem_data dataframe', chem_data),
    ('The size of the health metric dataframe', health_df),
):
    n_rows, n_cols = frame.shape
    print(heading)
    print('columns:', n_cols)
    print('rows   :', n_rows)
    display(frame.sample(5))



The size of the chem_data dataframe
columns: 26
rows   : 284535


Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode,year,population,CharacteristicName,UnitCode,SampleFraction,NormResult
159643,MAIDSTONE,1,Pelagic,44.65036,-71.64738,MAIDSTONE,SpringTP,2017-05-09,1,1519.0,Hydrolab,16.98,Reg,DO,,9.85,Y,,,,2017,199.0,Dissolved Oxygen,mg/l,,0.112571
149481,LITTLE ROCK,1,Pelagic,43.4,-72.9567,WALLINGFORD,AcidLake,1989-05-15,1,1230.0,Thermister,1.0,Reg,TEMPC,,10.5,Y,,,,1989,2158.0,Temperature,deg C,,0.365248
74363,ELMORE,1,Pelagic,44.5347,-72.5256,ELMORE,SpringTP,1996-05-13,1,1402.0,Kemmerer,1.0,Reg,TCA,,4.73,Y,,,,1996,745.0,Total Calcium,mg/l,Total,0.06103
122196,HORTONIA,1,Pelagic,43.7553,-73.2022,HUBBARDTON,SpringTP,2005-04-18,1,956.0,Hydrolab,16.0,Reg,DO,,6.71,Y,,,,2005,729.0,Dissolved Oxygen,mg/l,,0.076686
13949,BIG MUD,1,Pelagic,43.3144,-72.9311,MOUNT TABOR,AcidLake,2017-07-25,1,840.0,BottleGrab,0.5,Reg,DSO4,,1.05,Y,,,E,2017,259.0,Dissolved Sulfate,mg/l,Dissolved,0.072398


The size of the health metric dataframe
columns: 2
rows   : 90


Unnamed: 0,Lake,Health_Score
52,ARROWHEAD MOUNTAIN,4.387066
2,ELMORE,1.115306
5,EAST LONG,1.161514
56,LITTLE (WELLS),2.111516
61,JOES (DANVLL),0.374875


### Section 2: Clean For Visualization

In [3]:
# Prepare chem_data for plotting in a single chain:
#   1. parse VisitDate as a datetime and re-derive `year` from it,
#   2. drop rows with no population value,
#   3. drop rows whose population is exactly 0.
chem_data = (
    chem_data
    .assign(
        VisitDate=lambda frame: pd.to_datetime(frame['VisitDate']),
        year=lambda frame: frame['VisitDate'].dt.year,
    )
    .dropna(subset=['population'])
    .loc[lambda frame: frame['population'] != 0]
)

### Section 3: Chemical Metrics and Population by town Visualization

In [4]:

def show_measurements(df):
    """Build the linked Altair dashboard of chemical measurements and town population.

    The returned chart has three linked views:

    * top left: yearly mean ``NormResult`` per ``CharacteristicID`` (line + points),
    * bottom left: town ``population`` per year (line + points),
    * right: number of ``Result`` values per ``CharacteristicID`` (bars + labels).

    A dropdown selects the town shown in all three views; clicking bars in the
    right-hand chart restricts the measurement lines to the selected
    characteristics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``year``, ``Town``, ``CharacteristicID``,
        ``CharacteristicName``, ``UnitCode``, ``Result``, ``NormResult`` and
        ``population``.

    Returns
    -------
    altair chart
        The concatenated dashboard, titled 'Lake Health Measures and Population'.

    Raises
    ------
    ValueError
        If ``df`` contains no towns (the dropdown would have no initial value).
    """
    # Yearly mean of Result/NormResult per town and characteristic.
    df_for_measurements = (
        df[['year', 'Town', 'CharacteristicID', 'CharacteristicName', 'UnitCode', 'Result', 'NormResult']]
        .groupby(['Town', 'year', 'CharacteristicID', 'CharacteristicName', 'UnitCode'])
        .mean()
        .reset_index()
    )
    # Number of non-null Result values per town and characteristic (all years).
    df_for_count = (
        df[['Town', 'CharacteristicID', 'CharacteristicName', 'UnitCode', 'Result']]
        .groupby(['Town', 'CharacteristicID', 'CharacteristicName', 'UnitCode'])
        .count()
        .reset_index()
    )
    # One row per distinct (town, year, population) combination.
    df_for_population = df[['Town', 'year', 'population']].copy().drop_duplicates()

    # Dropdown options: towns ordered by row count, most-measured first.
    # Built from `df` (not the notebook-level `chem_data`) so the options always
    # match the data that is actually plotted.
    options = list(df.groupby('Town')['Town'].count().sort_values(ascending=False).index)
    if not options:
        raise ValueError('show_measurements needs at least one row with a Town value')
    town_dropdown = alt.binding_select(options=options, name='Towns')
    selection_d = alt.selection_single(fields=['Town'], init={'Town': options[0]}, bind=town_dropdown)

    # Click selection on CharacteristicID; clear=False turns off the clear interaction.
    selection = alt.selection_multi(fields=['CharacteristicID'], clear=False)
    opacity_selection = alt.condition(selection, alt.value(1), alt.value(.2))

    total_measurements = alt.Chart(df_for_measurements).mark_line().encode(
        x=alt.X('year:O'),
        y=alt.Y('mean(NormResult):Q'),
        color=alt.Color('CharacteristicID:N', legend=None)
    )
    # Same x/y/color as the line layer (inherited); only the mark and tooltip differ.
    point_measure = total_measurements.mark_circle().encode(
        tooltip=[alt.Tooltip('CharacteristicID'), alt.Tooltip('mean(NormResult)')]
    ).interactive()

    measures = (total_measurements + point_measure).add_selection(
        selection_d
    ).transform_filter(
        selection
    ).transform_filter(selection_d).properties(
        width=700
    )

    # sum(Result) here sums the per-group counts computed above.
    total_counts = alt.Chart(df_for_count).mark_bar().encode(
        y=alt.Y('CharacteristicID:N', sort='-x'),
        x=alt.X('sum(Result):Q'),
        tooltip=[alt.Tooltip('CharacteristicName'), alt.Tooltip('UnitCode'), alt.Tooltip('sum(Result)')],
        opacity=opacity_selection,
        color=alt.Color('CharacteristicID:N')
    )

    text = total_counts.mark_text(dx=20).encode(
        text='sum(Result):Q',
        opacity=opacity_selection
    )

    pop = alt.Chart(df_for_population).mark_line().encode(
        x=alt.X('year:O'),
        y=alt.Y('population:Q')
    ).interactive()

    pop_point = alt.Chart(df_for_population).mark_circle().encode(
        x=alt.X('year:O'),
        y=alt.Y('population:Q'),
        tooltip=alt.Tooltip('population')
    ).interactive()
    pop_charts = (pop + pop_point).add_selection(
        selection_d
    ).transform_filter(
        selection_d
    ).properties(
        width=700
    )
    counts_and_text = (total_counts + text).add_selection(
        selection_d,
        selection
    ).transform_filter(
        selection_d
    ).properties(
        width=200
    )

    # Stack measurements over population (shared x scale), with the counts beside them.
    return ((measures & pop_charts).resolve_scale(x='shared') | counts_and_text).properties(
        title=alt.TitleParams(text='Lake Health Measures and Population',
                            subtitle='Is there more testing and are the values higher in larger population areas')
    )
# Build the dashboard from the cleaned chem_data; as the cell's last expression it is rendered as output.
show_measurements(chem_data)

### Section 4: Health Metric and 2019 population data merging

In [5]:
# now to explore the health metric and population of towns
health_and_pop = health_df.merge(chem_data[['LakeID', 'Town', 'year', 'population']], how='left', left_on='Lake', right_on='LakeID').drop('LakeID', axis=1)
health_and_pop

Unnamed: 0,Lake,Health_Score,Town,year,population
0,LYFORD,1.873349,WALDEN,1990.0,703.0
1,LYFORD,1.873349,WALDEN,1990.0,703.0
2,LYFORD,1.873349,WALDEN,1997.0,762.0
3,LYFORD,1.873349,WALDEN,1997.0,762.0
4,LYFORD,1.873349,WALDEN,1997.0,762.0
...,...,...,...,...,...
149678,ISLAND,1.047180,BRIGHTON,2018.0,1188.0
149679,ISLAND,1.047180,BRIGHTON,2018.0,1188.0
149680,ISLAND,1.047180,BRIGHTON,2018.0,1188.0
149681,ISLAND,1.047180,BRIGHTON,2018.0,1188.0


In [6]:
# Keep only the rows whose measurement year is 2019.
pop_to_use = health_and_pop[health_and_pop['year'] == 2019]

# Mean population and mean Health_Score per town, largest towns first.
# NOTE(review): health_and_pop repeats a lake once per matching chemistry row,
# so a town's mean Health_Score is weighted by how many 2019 rows each of its
# lakes has — confirm this weighting is intended.
grouped_health = (
    pop_to_use[['Town', 'population', 'Health_Score']]
    .groupby('Town')
    .mean()
    .reset_index()
    .sort_values(by='population', ascending=False)
)

# Pearson correlation between town population and health score.
# Computed on the two named columns instead of `grouped_health.corr().iloc[0, 1]`:
# the frame also holds the string column `Town`, which DataFrame.corr rejects
# on newer pandas versions, and positional indexing breaks if columns change.
corr = grouped_health['population'].corr(grouped_health['Health_Score'])

### Section 5: Health Metric and 2019 population scatterplot

In [7]:
# Title carries the population/health-score correlation, rounded to 5 decimals.
scatter_title = alt.TitleParams(
    'Town population and Health Score',
    subtitle=f'Correlation of {round(corr, 5)}',
)

# One point per town: population on x, health score on y, town name on hover.
base = (
    alt.Chart(grouped_health)
    .mark_circle()
    .encode(
        x=alt.X('population:Q'),
        y=alt.Y('Health_Score:Q'),
        tooltip=alt.Tooltip('Town'),
    )
    .properties(title=scatter_title)
)

base

## Documentation

In [8]:
# Load the watermark extension and print the versions of the imported packages,
# recording the environment that produced the outputs in this notebook.
%load_ext watermark
%watermark --iversions

sys    : 3.9.9 | packaged by conda-forge | (main, Dec 20 2021, 02:36:06) [MSC v.1929 64 bit (AMD64)]
pandas : 1.3.4
IPython: 7.29.0
numpy  : 1.21.5
altair : 4.1.0

