## Chemical Measurements and Land Use
Visualizations here deal with the land use survey and the interaction/correlation with the chemical measurements data
### Section 1: Data

In [15]:
import pandas as pd
import numpy as np
import altair as alt
import IPython
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

alt.renderers.enable('default')
alt.data_transformers.enable('json')

# show Max Rows
pd.set_option('display.max_columns', None)

In [16]:
# import the data
chem_data = pd.read_csv('assets/chem_data_merged.csv', index_col=0)
survey_data = pd.read_csv('assets/combined_tables.csv', index_col=0)
health_metric = pd.read_csv('assets/health_metric.csv', index_col=0)
print('The size of the chem_data dataframe')
print('columns:', chem_data.shape[1])
print('rows   :', chem_data.shape[0])
display(chem_data.sample(5))
print('The size of the survey_data dataframe')
print('columns:', survey_data.shape[1])
print('rows   :', survey_data.shape[0])
display(survey_data.sample(5))
print('The size of the health_metric dataframe')
print('columns:', health_metric.shape[1])
print('rows   :', health_metric.shape[0])
display(health_metric.sample(5))

The size of the chem_data dataframe
columns: 26
rows   : 284535


Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode,year,population,CharacteristicName,UnitCode,SampleFraction,NormResult
250758,SUNSET (MARLBR),1,Pelagic,42.9178,-72.6833,MARLBORO,SpringTP,2019-05-02,1,1522.0,PlasticKemm,10.0,Reg,TN,,0.15,Y,,,,2019,1039.0,Total Nitrogen,mg/l,Total,0.017319
102030,GROUT,1,Pelagic,43.04451,-72.94366,STRATTON,SpringTP,2020-05-01,1,1230.0,PlasticKemm,6.8,Reg,TN,,0.27,Y,,,,2020,,Total Nitrogen,mg/l,Total,0.033307
144084,LITTLE (WOODFD),1,Pelagic,42.925,-73.0656,WOODFORD,AcidLake,2009-10-28,1,1415.0,PlasticKemm,1.0,Reg,DOC,,2.96,Y,,,,2009,424.0,Dissolved Organic Carbon,mg/l,Dissolved,0.16158
252845,TICKLENAKED,1,Pelagic,44.19067,-72.0989,RYEGATE,LakeAsmt,1999-10-13,1,,Kemmerer,3.0,Reg,TP,,39.0,Y,,,,1999,1141.0,Total Phosphorus,ug/l,Total,0.021412
57061,CRYSTAL (BARTON),1,Pelagic,44.7328,-72.1533,BARTON,LakeAsmt,2013-08-09,1,1112.0,Hydrolab,11.0,Reg,DO%,,80.4,Y,,,,2013,2728.0,Dissolved Oxygen Saturation,%,,0.445923


The size of the survey_data dataframe
columns: 35
rows   : 773


Unnamed: 0,LakeID,Description,Lat,Long,Town,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file
278,GROTON,Buffer100ftWBFL,44.2789,-72.2672,GROTON,120051.84566,3428386.0,635.820563,124.188672,0.830705,76.780836,2.858202,2.482726,4.07402,0.0,0.032494,0.0,0.0,0.032494,1.077266,3.133614,5.093703,5.017061,0.0,14.321644,21.506135,21.506135,187.715697,448.931865,636.647562,76.754031,249.223437,63.404529,389.381997,AOIs_GROTON.xls
3,ABENAKI,Buffer100ftWBFL,43.8303,-72.2361,THETFORD,11439.865203,265556.5,58.155297,6.164034,0.017421,0.867339,0.049915,0.233947,0.122749,0.0,0.002157,0.0,0.0,0.002157,0.067456,0.053636,0.155814,0.337027,0.0,0.613933,0.0,0.0,28.361753,29.837036,58.198789,4.854378,5.758748,0.182035,10.795161,AOIs_ABENAKI.xls
145,DERBY,Watershed,44.9556,-72.0383,DERBY,15759.000532,3334721.0,384.85498,384.80735,0.185391,4.31433,11.057269,10.713731,28.096703,0.0,166.795567,49.462762,0.0,216.258329,0.205452,11.49027,29.861814,12.02525,0.0,53.582786,10.859281,10.859281,100.370652,285.278785,385.649436,14.709584,57.791469,6.437116,78.938169,AOIs_DERBY.xls
167,ECHO (HUBDTN),Waterbody100ft,43.7475,-73.1828,HUBBARDTON,4707.862877,70651.18,13.07031,3.757973,0.010008,0.399692,0.108664,0.0,0.10922,0.0,0.001483,0.0,0.0,0.001483,0.017103,0.107591,0.114083,0.0,0.0,0.238777,0.0,0.0,6.399895,6.718559,13.118454,0.040698,0.970192,0.206734,1.217624,AOIs_ECHOHUBDTN.xls
565,PERCH (BENSON),Buffer100ftWBFL,43.7503,-73.2808,BENSON,3933.138052,64044.56,14.617373,0.691338,0.004448,0.07302,0.174641,0.194719,0.067398,0.0,0.0,0.0,0.0,0.0,0.008272,0.245733,0.089705,0.896207,0.0,1.239917,0.0,0.0,11.183225,3.475858,14.659083,0.0,2.322183,0.0,2.322183,AOIs_PERCHBENSON.xlsx


The size of the health_metric dataframe
columns: 2
rows   : 90


Unnamed: 0,Lake,Health_Score
38,ECHO (HUBDTN),3.073741
8,PERCH (BENSON),3.589842
4,CARMI,4.063059
48,ST. CATHERINE,1.141285
37,ELFIN,0.978887


### Section 2: Merging the Data for Heatmap Visulaization

In [17]:
# for this visulaization we need the chem_data from 2013 - 2016 and the LakeID to merge on, the 'CharacteristicID,
# CharacteristicName, and the Result
chem_data_to_merge = chem_data[((chem_data['year'] <= 2016) & (chem_data['year'] >= 2013))]
chem_data_to_merge = chem_data_to_merge[['LakeID', 'CharacteristicID', 'CharacteristicName', 'Result']].copy()

# Now we Groupby LakeID as we want one measurement: the mean result of each measurement type per lake.
chem_data_to_merge = chem_data_to_merge.groupby(['LakeID', 'CharacteristicID', 'CharacteristicName']).mean().reset_index()
chem_data_to_merge.sample(10)

Unnamed: 0,LakeID,CharacteristicID,CharacteristicName,Result
197,BIG,TAL,Total Aluminum,207.6
67,BAKER (BRKFLD),TN,Total Nitrogen,0.26
1765,LILY (VERNON),TURBNTU,Turbidity,1.64
2623,NORTH SPRINGFIELD,TMG,Total Magnesium,2.039
2359,MUD (MORGAN)-N,PH,pH,7.67
332,BRANCH,DOC,Dissolved Organic Carbon,6.707647
2279,MOLLYS FALLS,TMG,Total Magnesium,1.042
2783,PLEASANT VALLEY,COND,Conductivity,69.54
1351,HARTWELL,SECCHI,Secchi transparency,3.75
3229,SOMERSET,DCL,Dissolved Chloride,0.78


In [18]:
# Combine the subcategories in the description
cols_from_survey = ['LakeID', 'Description', 'TREE_CANOPY_acres', 'GRASS_SHRUBS_acres', 'BARE_SOIL_acres', 'WATER_acres', 
                    'BUILDINGS_acres','ROADS_acres', 'OTHER_PAVED_acres', 'RAILROADS_acres', 'Ag_Hay_acres', 
                    'Ag_Crops_acres', 'Ag_Pasture_acres', 'Ag_Total_acres', 'Imp_Bare_Soil_acres', 
                    'Imp_Buildings_acres', 'Imp_Other_Paved_acres', 'Imp_Road_acres', 'Imp_Railroad_acres', 
                    'Imp_Total_acres', 'Shrub_Shrubs_acres', 'Shrub_Total_acres', 'TC_Coniferous_acres', 
                    'TC_Deciduous_acres', 'TC_Total_acres', 'Wet_Emergent_acres', 'Wet_Forested_acres', 
                    'Wet_Scrub_Shrub_acres', 'Wet_Total_acres']
# survey_data_grouped = survey_data[cols_from_survey].groupby('LakeID').sum().reset_index()
survey_data_grouped = survey_data[cols_from_survey]
survey_data_grouped.sample(10)

Unnamed: 0,LakeID,Description,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres
342,IROQUOIS,Flowline100ft,83.16467,30.144401,0.006919,2.029844,0.146163,0.592743,0.848188,0.0,15.737224,0.0,0.0,15.737224,0.022096,0.145604,0.949681,0.874062,0.0,1.991442,2.040967,2.040967,16.216191,67.044285,83.260476,17.821025,48.408923,1.289445,67.519393
313,HARVEYS,Waterbody100ft,28.674867,11.842507,0.128556,1.000157,2.313706,0.905887,2.067219,0.0,2e-06,0.0,0.0,2e-06,0.158409,2.547407,2.421992,1.67309,0.0,6.800899,0.0,0.0,19.708513,9.134481,28.842994,0.0,1.406878,0.0,1.406878
231,FOSTERS,Flowline100ft,10.185853,4.446469,0.06227,0.683616,0.055784,0.143444,0.292881,0.0,0.0,0.0,0.0,0.0,0.059732,0.055831,0.343445,0.265462,0.0,0.72447,3.237802,3.237802,5.531304,4.708933,10.240237,1.98941,4.070872,0.49123,6.551512
292,HALLS,Waterbody100ft,15.767152,6.273625,0.019212,0.821624,0.798396,1.043277,0.776342,0.0,0.000306,0.0,0.0,0.000306,0.021399,0.957221,1.010904,2.136921,0.0,4.126446,0.0,0.0,11.795493,4.056946,15.852439,0.258629,3.099913,0.0,3.358543
685,STCATHERINE,Buffer100ftWBFL,382.575621,159.124562,1.398614,21.853966,8.752521,6.580962,5.630349,0.0,42.927104,2.672151,0.0,45.599255,1.495499,10.392997,6.62498,11.390147,0.0,29.903624,17.040344,17.040344,160.103474,224.030479,384.133953,43.347983,88.658457,10.14073,142.147171
114,COLES,Buffer250ftWaterbody,61.981038,20.638457,0.007166,1.447109,1.764268,1.502584,1.196791,0.0,0.0,0.0,0.0,0.0,0.006983,1.979748,1.416794,2.871548,0.0,6.275074,2.157377,2.157377,39.497372,22.706154,62.203526,3.983573,31.08668,8.087774,43.158027
162,ECHO (CHARTN),Waterbody100ft,28.747763,15.769562,0.279105,1.881705,1.414305,1.763527,1.773225,0.0,2.424662,0.0,0.0,2.424662,0.322078,1.52016,2.056316,2.132835,0.0,6.031389,0.0,0.0,15.120139,13.784907,28.905046,1.676447,11.551497,0.265099,13.493043
179,EDEN,Buffer250ftWaterbody,100.167533,34.977157,0.25285,3.402883,3.560783,3.243562,9.068259,0.0,0.0,0.0,0.0,0.0,0.300489,3.98095,10.71419,4.914102,0.0,19.909731,3.554862,3.554862,27.53778,73.201583,100.739363,2.05912,10.451401,5.583573,18.094094
30,BERLIN,Watershed,5213.349938,921.712152,2.712039,25.884125,9.533064,89.243206,26.134813,0.0,326.430352,0.0,11.340395,337.770746,2.887773,10.064351,28.65112,100.38711,0.0,141.990354,130.16157,130.16157,1168.406063,4056.396292,5224.802355,106.787159,328.952724,83.638491,519.378374
76,CASPIAN,Flowline100ft,162.24173,102.545857,0.184958,10.26258,0.236603,2.881121,1.129146,0.0,66.602431,2.48838,0.040333,69.131144,0.215957,0.254642,1.346563,3.37404,0.0,5.191202,4.647195,4.647195,96.627852,66.359247,162.987099,15.942629,64.874784,8.261433,89.078845


In [19]:
merged_data = survey_data_grouped.merge(chem_data_to_merge, how='inner', on='LakeID').drop('LakeID', axis=1)
print(merged_data.shape)

(6950, 31)


### Section 3: Calculating the Correlations for Heatmap Visulaization

In [20]:
MEASUREMENT_THREASHOLD = 20
corrs = {}
for desc in merged_data['Description'].unique():
    corr_df = pd.DataFrame(index=merged_data.columns[:-3])
    for char in merged_data['CharacteristicID'].unique():
        df = merged_data[((merged_data['CharacteristicID'] == char) & (merged_data['Description'] == desc))]
        # because we are calculating correlations if there are measuremetns with less than 5 measurements
        #  we will drop them as the correlation wont make sense or will be misleading
        if df.shape[0] < MEASUREMENT_THREASHOLD:
            continue
        else:
            df = df.corr()
            df.drop('Result', axis=0, inplace=True)
            df.rename(columns={'Result': char}, inplace=True)
            corr_df[char] = df[char]

    # replace all nan values with 0
    corr_df.fillna(0, inplace=True)
    # drop a column if the entire column is 0
    for col in corr_df.columns:
        if corr_df[col].sum() == 0:
            corr_df.drop(col, axis=1, inplace=True)
    corr_df.drop('Description', axis=0, inplace=True)
    corr_df = corr_df.reset_index().rename(columns={'index': 'survey_type'})
    
    corrs[desc] = corr_df

In [21]:
for key, cor in corrs.items():
    cor['Description'] = key
    print(key)
    print(cor.shape)

Watershed
(27, 23)
Flowline100ft
(27, 23)
Waterbody100ft
(27, 23)
Buffer100ftWBFL
(27, 23)
Buffer250ftWaterbody
(27, 23)


In [22]:
corrs_final = pd.concat(corrs, axis=0, ignore_index=True)
print(corrs_final.shape)

(135, 23)


In [23]:
# melt the data for visualization
corr_df_for_vis = pd.melt(corrs_final, id_vars=['survey_type', 'Description'], var_name='CharacteristicID', value_name='correlation')
# merge CharacteristicName
corr_df_for_vis = corr_df_for_vis.merge(merged_data[['CharacteristicID', 'CharacteristicName']].drop_duplicates(), how='left', on='CharacteristicID')
# lets take the abs value of the corelation column so that we can compare them
# corr_df_for_vis['correlation'] = np.abs(corr_df_for_vis['correlation'])
print(corr_df.shape)
corr_df_for_vis.sample(5)

(27, 23)


Unnamed: 0,survey_type,Description,CharacteristicID,correlation,CharacteristicName
191,BARE_SOIL_acres,Waterbody100ft,SECCHI,-0.043662,Secchi transparency
2807,Wet_Total_acres,Buffer100ftWBFL,TURBNTU,0.019433,Turbidity
1878,Imp_Road_acres,Buffer250ftWaterbody,TCL,-0.088235,Total Chloride
703,GRASS_SHRUBS_acres,Flowline100ft,TEMPC,-0.135608,Temperature
595,GRASS_SHRUBS_acres,Waterbody100ft,PH,0.060983,pH


### Section 4: Heatmap Visualization

In [24]:
options = ['Watershed', 'Buffer100ftWBFL', 'Buffer250ftWaterbody']
desc_dropdown = alt.binding_select(options=options, name='Description ')
desc_condition = alt.selection_single(fields=['Description'], init={'Description': options[0]}, bind=desc_dropdown)

chart = alt.Chart(corr_df_for_vis).mark_rect().encode(
    x=alt.X('survey_type:N', axis=alt.Axis(title=None)),
    y=alt.Y('CharacteristicName:N', axis=alt.Axis(title=None)),
    color=alt.condition(alt.datum.correlation == 0, alt.value('white'), alt.Color('correlation:Q')),
    tooltip=[alt.Tooltip('CharacteristicName'), alt.Tooltip('CharacteristicID'), alt.Tooltip('survey_type', title='Survey Type'), alt.Tooltip('correlation')]
).add_selection(
    desc_condition
).transform_filter(
    desc_condition
).properties(
    title=alt.TitleParams('Correlation heatmap of chemical measurements and Land use')
)
display(chart)

### Section 5: Merging the Data and calculating the correlations for Health Metric Bar chart

In [25]:
# bar chart with health metric
survey = survey_data[cols_from_survey]
survey = survey.merge(health_metric, how='inner', left_on='LakeID', right_on='Lake').drop('Lake', axis=1)

In [26]:
results_df = pd.DataFrame(index=survey.columns[2 : -1])
for desc in survey['Description'].unique():
    df = survey[survey['Description'] == desc]
    corr_df = df.corr()
    corr_df.drop('Health_Score', axis=0, inplace=True)
    results_df[desc] = corr_df['Health_Score']
results_df = results_df.reset_index().rename(columns={'index': 'survey_type'})
results_df = pd.melt(results_df, id_vars='survey_type', var_name='Description', value_name='correlation')
results_df

Unnamed: 0,survey_type,Description,correlation
0,TREE_CANOPY_acres,Watershed,-0.035824
1,GRASS_SHRUBS_acres,Watershed,0.067897
2,BARE_SOIL_acres,Watershed,0.061495
3,WATER_acres,Watershed,0.038861
4,BUILDINGS_acres,Watershed,0.130100
...,...,...,...
130,TC_Total_acres,Buffer250ftWaterbody,-0.035261
131,Wet_Emergent_acres,Buffer250ftWaterbody,0.381516
132,Wet_Forested_acres,Buffer250ftWaterbody,-0.038068
133,Wet_Scrub_Shrub_acres,Buffer250ftWaterbody,0.086818


### Section 6: Health metric and land use survey bar chart

In [27]:
alt.Chart(results_df).mark_bar().encode(
    x=alt.X('correlation:Q'),
    y=alt.Y('survey_type', sort='-x')
).add_selection(
    desc_condition
).transform_filter(
    desc_condition
).properties(
    title=alt.TitleParams('Survey categories correlated with the derived Health Meric')
)

## Documentation

In [28]:
%load_ext watermark
%watermark --iversions

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
sys    : 3.9.9 | packaged by conda-forge | (main, Dec 20 2021, 02:36:06) [MSC v.1929 64 bit (AMD64)]
pandas : 1.3.4
numpy  : 1.21.5
IPython: 7.29.0
altair : 4.1.0

