## Chemical Measurements and Land Use
Visualizations here deal with the land use survey and the interaction/correlation with the chemical measurements data
### Section 1: Data

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import IPython
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

alt.renderers.enable('default')
alt.data_transformers.enable('json')

# show Max Rows
pd.set_option('display.max_columns', None)

In [2]:
# import the data
chem_data = pd.read_csv('assets/chem_data_merged.csv', index_col=0)
survey_data = pd.read_csv('assets/combined_tables.csv', index_col=0)
health_metric = pd.read_csv('assets/health_metric.csv', index_col=0)
print('The size of the chem_data dataframe')
print('columns:', chem_data.shape[1])
print('rows   :', chem_data.shape[0])
display(chem_data.sample(5))
print('The size of the survey_data dataframe')
print('columns:', survey_data.shape[1])
print('rows   :', survey_data.shape[0])
display(survey_data.sample(5))
print('The size of the health_metric dataframe')
print('columns:', health_metric.shape[1])
print('rows   :', health_metric.shape[0])
display(health_metric.sample(5))

The size of the chem_data dataframe
columns: 26
rows   : 284535


Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode,year,population,CharacteristicName,UnitCode,SampleFraction,NormResult
204598,RUNNEMEDE,1,Pelagic,43.48467,-72.38874,WINDSOR,LayMon,2008-06-11,1,1830.0,BottleGrab,0.5,Reg,CHLA,,4.16,Y,,,,2008,3595.0,Chlorophyll-a,ug/l,,0.016524
150096,LITTLE ROCK,1,Pelagic,43.4,-72.9567,WALLINGFORD,SpringTP,2015-05-26,1,1826.0,Hydrolab,4.03,Reg,PH,,6.47,Y,,,,2015,2026.0,pH,,,0.417187
23410,BRANCH,1,Pelagic,43.08079,-73.01964,SUNDERLAND,AcidLake,1995-07-18,1,1300.0,PlasticKemm,1.0,Reg,DCA,,0.905,Y,,,E,1995,869.0,Dissolved Calcium,mg/l,Dissolved,0.049053
213022,SEYMOUR,1,Pelagic,44.8856,-71.9797,MORGAN,SpringTP,2013-05-06,1,1240.0,Hydrolab,21.98,Reg,DO,,11.0,Y,,,,2013,745.0,Dissolved Oxygen,mg/l,,0.125714
42663,CASPIAN,1,Pelagic,44.58897,-72.30878,GREENSBORO,NLA,2007-07-26,1,1122.0,Hydrolab,42.0,Reg,COND,,149.6,Y,,,,2007,771.0,Conductivity,umho/cm,,0.014179


The size of the survey_data dataframe
columns: 36
rows   : 773


Unnamed: 0,LakeID,Description,Lat,Long,Town,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file,2016_population
83,CEDAR,Buffer100ftWBFL,44.25017,-73.13338,MONKTON,14811.694582,374113.5,45.871954,40.746441,0.201267,2.700116,0.772821,0.947771,1.188575,0.0,4.956356,0.0,0.150212,5.106568,0.194483,0.955752,1.245668,1.068606,0.0,3.464509,13.988702,13.988702,13.15429,32.816253,45.970542,20.464452,4.15365,2.311633,26.929735,AOIs_CEDAR.xls,2083.0
358,JOES (DANVLL),Flowline100ft,44.4081,-72.2208,DANVILLE,198220.018329,6250963.0,1095.550458,309.2739,0.758489,125.00208,1.144776,8.122032,4.485574,0.0,61.589683,0.0,0.050623,61.640306,1.180909,1.15536,4.85826,9.527255,0.0,16.721784,48.280976,48.280976,605.661408,492.39904,1098.060449,158.097182,519.546762,90.386579,768.030523,AOIs_JOESDANVLL.xls,2170.0
612,SADAWGA,Watershed,42.7819,-72.8778,WHITINGHAM,21416.72735,9320305.0,1806.144499,436.394658,0.620913,9.663226,7.847684,23.009306,19.415102,0.0,249.374528,0.0,7.07167,256.446197,0.661106,8.124192,22.573559,29.60976,0.0,60.968617,33.17381,33.17381,764.793646,1043.328147,1808.121793,41.318166,455.846026,22.821886,519.986078,AOIs_SADAWGA.xlsx,1321.0
212,FAIRLEE,Waterbody100ft,43.88294,-72.22941,THETFORD,22235.4773,336238.2,52.294028,18.617323,0.312155,2.997137,2.47766,3.173631,3.205261,0.0,0.0,0.0,0.0,0.0,0.378204,2.914384,3.979806,4.207353,0.0,11.479747,0.019661,0.019661,32.061437,20.35065,52.412087,2.516551,9.219591,2.470196,14.206338,AOIs_FAIRLEE.xls,2578.0
229,FOREST (CALAIS),Buffer250ftWaterbody,44.4075,-72.4389,CALAIS,7936.092138,287875.0,61.14292,7.044964,0.024834,0.307399,0.833238,0.719199,1.057053,0.0,0.0,0.0,0.0,0.0,0.036655,1.055713,1.282536,2.218353,0.0,4.593257,0.0,0.0,32.14696,29.171636,61.318596,0.067042,1.624426,0.0,1.691468,AOIs_FORESTCALAIS.xls,1596.0


The size of the health_metric dataframe
columns: 2
rows   : 90


Unnamed: 0,Lake,Health_Score
30,WILLOUGHBY,2.240003
15,SABIN,0.555088
21,PENSIONER,5.267679
27,SUNSET (BENSON),4.25791
8,PERCH (BENSON),3.589842


### Section 2: Merging the Data for Heatmap Visulaization

In [3]:
# for this visulaization we need the chem_data from 2013 - 2016 and the LakeID to merge on, the 'CharacteristicID,
# CharacteristicName, and the Result
chem_data_to_merge = chem_data[((chem_data['year'] <= 2016) & (chem_data['year'] >= 2013))].drop('population', axis=1)
chem_data_to_merge = chem_data_to_merge[['LakeID', 'CharacteristicID', 'CharacteristicName', 'Result']].copy()

# Now we Groupby LakeID as we want one measurement: the mean result of each measurement type per lake.
chem_data_to_merge = chem_data_to_merge.groupby(['LakeID', 'CharacteristicID', 'CharacteristicName']).mean().reset_index()
chem_data_to_merge.sample(10)

Unnamed: 0,LakeID,CharacteristicID,CharacteristicName,Result
3749,TURTLE,DNO3,Dissolved Nitrate Nitrogen,0.065
843,ECHO (PLYMTH),TOTALHARD,Total Hardness,24.984
1804,LITTLE (WELLS),DO%,Dissolved Oxygen Saturation,102.3
1233,HALFWAY,DMG,Dissolved Magnesium,0.244
309,BOURN,MAL,Monomeric Aluminum,192.16
553,COLBY,PH,pH,6.806
1134,GREENWOOD,CHLAPROBE,Chlorophyll-a (probe),3.754545
3522,STAR,SECCHI,Secchi transparency,2.2
176,BEECHER,CHLA,Chlorophyll-a,11.29
3811,UNKNOWN (WOODFD),COND,Conductivity,13.358


In [4]:
# Combine the subcategories in the description
cols_from_survey = ['LakeID', 'Description', 'TREE_CANOPY_acres', 'GRASS_SHRUBS_acres', 'BARE_SOIL_acres', 'WATER_acres', 
                    'BUILDINGS_acres','ROADS_acres', 'OTHER_PAVED_acres', 'RAILROADS_acres', 'Ag_Hay_acres', 
                    'Ag_Crops_acres', 'Ag_Pasture_acres', 'Ag_Total_acres', 'Imp_Bare_Soil_acres', 
                    'Imp_Buildings_acres', 'Imp_Other_Paved_acres', 'Imp_Road_acres', 'Imp_Railroad_acres', 
                    'Imp_Total_acres', 'Shrub_Shrubs_acres', 'Shrub_Total_acres', 'TC_Coniferous_acres', 
                    'TC_Deciduous_acres', 'TC_Total_acres', 'Wet_Emergent_acres', 'Wet_Forested_acres', 
                    'Wet_Scrub_Shrub_acres', 'Wet_Total_acres']
# survey_data_grouped = survey_data[cols_from_survey].groupby('LakeID').sum().reset_index()
survey_data_grouped = survey_data[cols_from_survey]
survey_data_grouped.sample(10)

Unnamed: 0,LakeID,Description,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres
400,LILY (POULTY),Buffer100ftWBFL,53.898357,66.724342,0.169761,5.696079,0.432063,1.023385,1.069594,0.0,16.777359,0.0,0.0,16.777359,0.168413,0.471284,1.108864,1.121446,0.0,2.870006,16.208747,16.208747,27.707319,26.426102,54.133421,26.544888,7.846949,2.841979,37.233816
101,COLBY,Flowline100ft,6.031957,5.953563,0.008896,1.961581,0.0,0.202873,0.00383,0.0,0.000494,0.0,4.537259,4.537753,0.028526,0.0,0.003024,0.277302,0.0,0.308852,0.0,0.0,2.906623,3.15426,6.060883,1.704532,2.618031,1.136769,5.459332
475,MILL (BENSON),Buffer100ftWBFL,611.623794,519.40983,0.531461,269.548929,0.758118,5.941393,2.859252,0.0,234.114813,23.668235,2.219023,260.00207,0.588278,0.879554,3.090369,8.069218,0.0,12.627419,76.830491,76.830491,231.132135,383.332284,614.464419,219.22805,171.90564,65.40489,456.53858
216,FERN,Flowline100ft,13.623393,3.864846,0.0,0.302889,0.148448,0.20973,0.362503,0.0,0.0,0.0,0.0,0.0,0.0,0.179165,0.392307,0.281028,0.0,0.8525,0.0,0.0,1.911018,11.737119,13.648137,0.478173,0.308215,0.0,0.786388
539,OSMORE,Waterbody100ft,19.013803,1.004235,0.046394,0.408526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.046427,0.0,0.0,0.0,0.0,0.046427,0.0,0.0,13.445556,5.607709,19.053265,0.0,8.465954,0.74344,9.209394
5,ADAMS (WOODFD),Watershed,720.717254,27.260191,0.029158,10.739863,0.069313,0.286889,0.427801,0.0,0.039413,0.0,0.0,0.039413,0.05798,0.084033,1.077251,1.354879,0.0,2.574144,5.385283,5.385283,120.203413,600.657779,720.861192,25.335799,90.819363,3.840259,119.995421
718,TURTLEHEAD,Flowline100ft,163.9727,55.018917,0.0,18.027175,0.0,0.26014,0.0,0.0,0.0,0.0,0.0,0.0,0.03296,0.0,0.0,0.422686,0.0,0.455646,18.231971,18.231971,54.642342,109.658346,164.300687,47.662511,65.953412,16.737459,130.353382
123,CRYSTAL (BARTON),Buffer100ftWBFL,1116.757627,209.199649,3.787502,110.609881,2.046832,16.26371,7.622571,5.259197,8.712872,0.0,0.0,8.712872,5.892175,2.244263,8.728501,17.947063,6.103063,40.915066,57.1181,57.1181,256.391624,861.755051,1118.146675,112.564429,183.514767,25.330682,321.409879
596,ROOD,Buffer250ftWaterbody,20.485499,7.187358,0.0,0.193483,0.0,0.136958,0.423353,0.0,0.73295,0.0,0.0,0.73295,0.0,0.0,0.479905,0.190712,0.0,0.670617,0.0,0.0,9.780279,10.74789,20.528169,2.988483,7.441122,1.275151,11.704755
301,HARRIMAN (NEWBRY),Flowline100ft,69.568583,39.768523,0.029653,6.604499,0.039969,0.799879,0.263537,0.0,10.560917,0.0,0.0,10.560917,0.040344,0.039521,0.280461,0.914402,0.0,1.274728,19.622719,19.622719,38.954987,30.722189,69.677175,23.599293,24.565805,8.687511,56.85261


In [5]:
merged_data = survey_data_grouped.merge(chem_data_to_merge, how='inner', on='LakeID').drop('LakeID', axis=1)
print(merged_data.shape)

(6950, 31)


### Section 3: Calculating the Correlations for Heatmap Visulaization

In [6]:
MEASUREMENT_THREASHOLD = 20
corrs = {}
for desc in merged_data['Description'].unique():
    corr_df = pd.DataFrame(index=merged_data.columns[:-3])
    for char in merged_data['CharacteristicID'].unique():
        df = merged_data[((merged_data['CharacteristicID'] == char) & (merged_data['Description'] == desc))]
        # because we are calculating correlations if there are measuremetns with less than 5 measurements
        #  we will drop them as the correlation wont make sense or will be misleading
        if df.shape[0] < MEASUREMENT_THREASHOLD:
            continue
        else:
            df = df.corr()
            df.drop('Result', axis=0, inplace=True)
            df.rename(columns={'Result': char}, inplace=True)
            corr_df[char] = df[char]

    # replace all nan values with 0
    corr_df.fillna(0, inplace=True)
    # drop a column if the entire column is 0
    for col in corr_df.columns:
        if corr_df[col].sum() == 0:
            corr_df.drop(col, axis=1, inplace=True)
    corr_df.drop('Description', axis=0, inplace=True)
    corr_df = corr_df.reset_index().rename(columns={'index': 'survey_type'})
    
    corrs[desc] = corr_df

In [7]:
for key, cor in corrs.items():
    cor['Description'] = key
    print(key)
    print(cor.shape)

Watershed
(27, 23)
Flowline100ft
(27, 23)
Waterbody100ft
(27, 23)
Buffer100ftWBFL
(27, 23)
Buffer250ftWaterbody
(27, 23)


In [8]:
corrs_final = pd.concat(corrs, axis=0, ignore_index=True)
print(corrs_final.shape)

(135, 23)


In [9]:
# melt the data for visualization
corr_df_for_vis = pd.melt(corrs_final, id_vars=['survey_type', 'Description'], var_name='CharacteristicID', value_name='correlation')
# merge CharacteristicName
corr_df_for_vis = corr_df_for_vis.merge(merged_data[['CharacteristicID', 'CharacteristicName']].drop_duplicates(), how='left', on='CharacteristicID')
# lets take the abs value of the corelation column so that we can compare them
# corr_df_for_vis['correlation'] = np.abs(corr_df_for_vis['correlation'])
print(corr_df.shape)
corr_df_for_vis.sample(5)

(27, 23)


Unnamed: 0,survey_type,Description,CharacteristicID,correlation,CharacteristicName
1947,WATER_acres,Waterbody100ft,TFE,0.182584,Total Iron
185,Wet_Emergent_acres,Flowline100ft,SECCHI,-0.112311,Secchi transparency
2181,TC_Deciduous_acres,Watershed,TMG,-0.178448,Total Magnesium
245,BARE_SOIL_acres,Buffer250ftWaterbody,SECCHI,-0.026124,Secchi transparency
334,Ag_Pasture_acres,Waterbody100ft,TP,-0.055692,Total Phosphorus


### Section 4: Heatmap Visualization

In [10]:
options = ['Watershed', 'Buffer100ftWBFL', 'Buffer250ftWaterbody']
desc_dropdown = alt.binding_select(options=options, name='Description ')
desc_condition = alt.selection_single(fields=['Description'], init={'Description': options[0]}, bind=desc_dropdown)

chart = alt.Chart(corr_df_for_vis).mark_rect().encode(
    x=alt.X('survey_type:N', axis=alt.Axis(title=None)),
    y=alt.Y('CharacteristicName:N', axis=alt.Axis(title=None)),
    color=alt.condition(alt.datum.correlation == 0, alt.value('white'), alt.Color('correlation:Q')),
    tooltip=[alt.Tooltip('CharacteristicName'), alt.Tooltip('CharacteristicID'), alt.Tooltip('survey_type', title='Survey Type'), alt.Tooltip('correlation')]
).add_selection(
    desc_condition
).transform_filter(
    desc_condition
).properties(
    title=alt.TitleParams('Correlation heatmap of chemical measurements and Land use')
)
display(chart)

### Section 5: Merging the Data and calculating the correlations for Health Metric Bar chart

In [11]:
# bar chart with health metric
survey = survey_data[cols_from_survey]
survey = survey.merge(health_metric, how='inner', left_on='LakeID', right_on='Lake').drop('Lake', axis=1)

In [12]:
results_df = pd.DataFrame(index=survey.columns[2 : -1])
for desc in survey['Description'].unique():
    df = survey[survey['Description'] == desc]
    corr_df = df.corr()
    corr_df.drop('Health_Score', axis=0, inplace=True)
    results_df[desc] = corr_df['Health_Score']
results_df = results_df.reset_index().rename(columns={'index': 'survey_type'})
results_df = pd.melt(results_df, id_vars='survey_type', var_name='Description', value_name='correlation')
results_df

Unnamed: 0,survey_type,Description,correlation
0,TREE_CANOPY_acres,Watershed,-0.035824
1,GRASS_SHRUBS_acres,Watershed,0.067897
2,BARE_SOIL_acres,Watershed,0.061495
3,WATER_acres,Watershed,0.038861
4,BUILDINGS_acres,Watershed,0.130100
...,...,...,...
130,TC_Total_acres,Buffer250ftWaterbody,-0.035261
131,Wet_Emergent_acres,Buffer250ftWaterbody,0.381516
132,Wet_Forested_acres,Buffer250ftWaterbody,-0.038068
133,Wet_Scrub_Shrub_acres,Buffer250ftWaterbody,0.086818


### Section 6: Health metric and land use survey bar chart

In [13]:
alt.Chart(results_df).mark_bar().encode(
    x=alt.X('correlation:Q'),
    y=alt.Y('survey_type', sort='-x')
).add_selection(
    desc_condition
).transform_filter(
    desc_condition
).properties(
    title=alt.TitleParams('Survey categories correlated with the derived Health Meric')
)

## Documentation

In [14]:
%load_ext watermark
%watermark --iversions

altair : 4.1.0
pandas : 1.3.4
sys    : 3.9.9 | packaged by conda-forge | (main, Dec 20 2021, 02:36:06) [MSC v.1929 64 bit (AMD64)]
numpy  : 1.21.5
IPython: 7.29.0

