## Chemical Measurements and Land Use
Visualizations here deal with the land use survey and the interaction/correlation with the chemical measurements data
### Section 1: Data

In [1]:
import pandas as pd
import numpy as np
import altair as alt
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

alt.renderers.enable('default')
alt.data_transformers.enable('json')

# show Max Rows
pd.set_option('display.max_columns', None)

In [25]:
# import the data
chem_data = pd.read_csv('assets/chem_data_merged.csv', index_col=0)
survey_data = pd.read_csv('assets/combined_tables.csv', index_col=0)
health_metric = pd.read_csv('assets/health_metric.csv', index_col=0)
print('The size of the chem_data dataframe')
print('columns:', chem_data.shape[1])
print('rows   :', chem_data.shape[0])
display(chem_data.sample(5))
print('The size of the survey_data dataframe')
print('columns:', survey_data.shape[1])
print('rows   :', survey_data.shape[0])
display(survey_data.sample(5))
print('The size of the health_metric dataframe')
print('columns:', health_metric.shape[1])
print('rows   :', health_metric.shape[0])
display(health_metric.sample(5))

The size of the chem_data dataframe
columns: 26
rows   : 284535


Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode,year,population,CharacteristicName,UnitCode,SampleFraction,NormResult
171072,MOREY,1,Pelagic,43.9247,-72.1533,FAIRLEE,Laymon,1991-06-24,1,1405.0,Secchi,,Reg,SECCHI,,7.0,Y,,,,1991,891.0,Secchi transparency,m,,0.442685
141330,LITTLE (WINHLL),1,Pelagic,43.1236,-72.9428,WINHALL,AcidLake,1983-08-31,1,940.0,TygonHose,1.0,Reg,TK,,0.44,Y,,,,1983,,Total Potassium,mg/l,Total,0.056278
153401,LONG (WESTMR),1,Pelagic,44.75274,-72.01827,WESTMORE,SpringTP,2010-04-19,1,1118.0,Hydrolab,20.0,Reg,DO,,10.87,Y,,,,2010,350.0,Dissolved Oxygen,mg/l,,0.124229
65645,EAST LONG,1,Pelagic,44.4475,-72.3525,WOODBURY,SpringTP,1998-05-01,1,1300.0,Hydrolab,26.7,Reg,DO%,,46.8,Y,,,,1998,801.0,Dissolved Oxygen Saturation,%,,0.259567
64707,DUNMORE,1,Pelagic,43.9122,-73.0764,SALISBURY,LaymonQC,2018-08-14,1,1056.0,Hydrolab,5.86,Reg,DO,,7.99,Y,,,,2018,1117.0,Dissolved Oxygen,mg/l,,0.091314


The size of the survey_data dataframe
columns: 36
rows   : 773


Unnamed: 0,LakeID,Description,Lat,Long,Town,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file,2016_population
64,BUCK,Buffer250ftWaterbody,44.46248,-72.39824,WOODBURY,4418.648756,158386.6,35.611599,2.702711,0.10817,0.703446,0.0,0.0,0.008278,0.0,0.0,0.0,0.0,0.0,0.114316,0.0,0.01634,0.0,0.0,0.130655,0.509824,0.509824,3.903034,31.739491,35.642525,1.046024,8.486563,0.073158,9.605745,AOIs_BUCK.xls,886.0
128,CURTIS,Buffer100ftWBFL,44.3881,-72.4939,CALAIS,13923.088631,274975.8,42.84393,18.874374,0.312402,2.398772,0.666133,1.64887,1.196482,0.0,3.973191,0.0,0.0,3.973191,0.328933,0.92057,1.382622,2.810414,0.0,5.442538,2.828668,2.828668,17.191755,25.965868,43.157623,5.339568,9.322313,2.967047,17.628928,AOIs_CURTIS.xls,1596.0
560,PENSIONER,Buffer100ftWBFL,44.8783,-72.0567,CHARLESTON,825589.912308,26232250.0,4538.421303,1136.946167,8.791316,724.852432,4.8962,33.955995,29.270761,4.101634,332.731141,16.094631,0.0,348.825771,10.833432,5.125612,32.269959,39.419253,4.152762,91.801019,249.367307,249.367307,1510.073595,3036.42754,4546.501135,560.530688,1441.884054,290.080561,2292.495303,AOIs_PENSIONER.xlsx,996.0
302,HARRIMAN (NEWBRY),Waterbody100ft,44.10304,-72.07947,NEWBURY,3322.682479,48937.26,2.974032,8.071314,0.018162,0.79852,0.009946,0.198672,0.020324,0.0,0.0,0.0,0.0,0.0,0.017592,0.00973,0.019439,0.283864,0.0,0.330625,2.301596,2.301596,2.015788,0.978861,2.994649,8.653627,1.454506,0.0,10.108133,AOIs_HARRIMANNEWBRY.xls,2175.0
576,PINNEO,Buffer250ftWaterbody,43.6517,-72.4314,HARTFORD,4116.583197,102172.2,4.897498,16.967527,1.102088,0.088525,0.210225,0.16624,1.813318,0.0,0.0,0.0,0.0,0.0,1.14074,0.210968,1.971112,0.178825,0.0,3.501646,0.0,0.0,1.547716,3.393753,4.941469,0.15108,0.0,0.0,0.15108,AOIs_PINNEO.xlsx,9671.0


The size of the health_metric dataframe
columns: 2
rows   : 90


Unnamed: 0,Lake,Health_Score
86,SOUTH (EDEN),2.125465
24,NICHOLS,1.725785
16,FAIRLEE,1.147697
21,PENSIONER,5.267679
5,EAST LONG,1.161514


### Section 2: Merging the Data for Heatmap Visulaization

In [3]:
# for this visulaization we need the chem_data from 2013 - 2016 and the LakeID to merge on, the 'CharacteristicID,
# CharacteristicName, and the Result
chem_data_to_merge = chem_data[((chem_data['year'] <= 2016) & (chem_data['year'] >= 2013))].drop('population', axis=1)
chem_data_to_merge = chem_data_to_merge[['LakeID', 'CharacteristicID', 'CharacteristicName', 'Result']].copy()

# Now we Groupby LakeID as we want one measurement: the mean result of each measurement type per lake.
chem_data_to_merge = chem_data_to_merge.groupby(['LakeID', 'CharacteristicID', 'CharacteristicName']).mean().reset_index()
chem_data_to_merge.sample(10)

Unnamed: 0,LakeID,CharacteristicID,CharacteristicName,Result
1480,HOWE,PH,pH,5.978718
2249,MIRROR,TOTALHARD,Total Hardness,69.623
3011,SADAWGA,TN,Total Nitrogen,0.194
3913,VERNON HATCHERY;,TOTALHARD,Total Hardness,19.599
395,BURBEE,DNO3,Dissolved Nitrate Nitrogen,0.03
2861,RESCUE,TNA,Total Sodium,4.328
2015,LONG (SHEFLD),TEMPC,Temperature,8.826538
2547,NICHOLS,SECCHI,Secchi transparency,7.330952
2116,LOWER,DO,Dissolved Oxygen,12.563333
1164,GRIFFITH,PH,pH,5.63


In [4]:
# Combine the subcategories in the description
cols_from_survey = ['LakeID', 'Description', 'TREE_CANOPY_acres', 'GRASS_SHRUBS_acres', 'BARE_SOIL_acres', 'WATER_acres', 
                    'BUILDINGS_acres','ROADS_acres', 'OTHER_PAVED_acres', 'RAILROADS_acres', 'Ag_Hay_acres', 
                    'Ag_Crops_acres', 'Ag_Pasture_acres', 'Ag_Total_acres', 'Imp_Bare_Soil_acres', 
                    'Imp_Buildings_acres', 'Imp_Other_Paved_acres', 'Imp_Road_acres', 'Imp_Railroad_acres', 
                    'Imp_Total_acres', 'Shrub_Shrubs_acres', 'Shrub_Total_acres', 'TC_Coniferous_acres', 
                    'TC_Deciduous_acres', 'TC_Total_acres', 'Wet_Emergent_acres', 'Wet_Forested_acres', 
                    'Wet_Scrub_Shrub_acres', 'Wet_Total_acres']
# survey_data_grouped = survey_data[cols_from_survey].groupby('LakeID').sum().reset_index()
survey_data_grouped = survey_data[cols_from_survey]
survey_data_grouped.sample(10)

Unnamed: 0,LakeID,Description,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres
168,ECHO (HUBDTN),Buffer100ftWBFL,28.700813,7.661305,0.010008,2.520656,0.108664,0.0,0.10922,0.0,0.001483,0.0,0.0,0.001483,0.017103,0.107591,0.114083,0.0,0.0,0.238777,2.077317,2.077317,10.981979,17.811814,28.793793,2.469032,6.558501,0.346414,9.373947
759,WOODWARD,Flowline100ft,100.124907,3.815857,0.032865,2.52844,0.09773,0.369546,0.173159,0.0,0.395736,0.0,0.0,0.395736,0.044161,0.118647,0.175845,0.571775,0.0,0.910428,1.804519,1.804519,15.455304,84.699492,100.154796,0.0,3.786434,1.762558,5.548992
713,TILDYS,Flowline100ft,35.01447,22.785552,0.055722,2.101196,0.157097,1.413997,0.249267,0.0,0.0,2.019763,0.0,2.019763,0.090528,0.16681,0.247956,1.466869,0.0,1.972164,0.0,0.0,20.180046,14.945042,35.125088,21.394135,6.08623,0.0,27.480365
568,PIGEON,Flowline100ft,39.289448,1.920624,0.0,0.376897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015968,0.0,0.0,0.0,0.0,0.015968,0.0,0.0,13.934701,25.377019,39.311719,1.870029,13.682845,0.143692,15.696566
196,EMERALD,Flowline100ft,221.626621,46.810521,0.230981,9.689728,0.806798,1.157007,3.513957,1.434074,16.6555,0.0,0.532329,17.18783,0.280035,0.841594,3.718856,3.332171,1.578606,9.751262,1.133712,1.133712,26.044405,195.936018,221.980422,37.619947,5.925213,1.259249,44.80441
174,ECHO (PLYMTH),Buffer250ftWaterbody,54.520753,14.658639,0.662056,1.156884,1.525812,3.101538,2.609367,0.0,0.0,0.0,0.0,0.0,0.720627,1.695815,3.074184,3.799134,0.0,9.28976,0.0,0.0,11.925863,42.715266,54.641128,0.109794,7.590782,2.555944,10.25652
635,SHADOW (GLOVER),Buffer100ftWBFL,189.336114,48.181336,1.107463,23.67062,2.070122,2.627591,3.278589,0.0,13.885572,1.352652,0.379971,15.618195,1.288115,2.37653,3.822608,3.968646,0.0,11.455899,10.60439,10.60439,99.14647,90.532777,189.679246,13.897248,75.37486,11.17799,100.450098
358,JOES (DANVLL),Flowline100ft,1095.550458,309.2739,0.758489,125.00208,1.144776,8.122032,4.485574,0.0,61.589683,0.0,0.050623,61.640306,1.180909,1.15536,4.85826,9.527255,0.0,16.721784,48.280976,48.280976,605.661408,492.39904,1098.060449,158.097182,519.546762,90.386579,768.030523
436,LOWELL,Buffer250ftWaterbody,66.819045,12.125813,0.0,1.545148,0.059799,0.0,0.094827,0.0,0.0,0.0,0.0,0.0,0.0,0.078729,0.168552,0.0,0.0,0.247281,2.097111,2.097111,48.545801,18.320179,66.86598,10.587505,6.587958,0.179742,17.355204
772,ZACK WOODS,Buffer250ftWaterbody,23.058109,0.155244,0.0,0.344032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.112898,0.0,0.112898,0.0,0.0,17.155167,5.910696,23.065863,0.0,0.322526,0.0,0.322526


In [5]:
merged_data = survey_data_grouped.merge(chem_data_to_merge, how='inner', on='LakeID').drop('LakeID', axis=1)
print(merged_data.shape)

(6950, 31)


### Section 3: Calculating the Correlations for Heatmap Visulaization

In [6]:
MEASUREMENT_THREASHOLD = 20
corrs = {}
for desc in merged_data['Description'].unique():
    corr_df = pd.DataFrame(index=merged_data.columns[:-3])
    for char in merged_data['CharacteristicID'].unique():
        df = merged_data[((merged_data['CharacteristicID'] == char) & (merged_data['Description'] == desc))]
        # because we are calculating correlations if there are measuremetns with less than 5 measurements
        #  we will drop them as the correlation wont make sense or will be misleading
        if df.shape[0] < MEASUREMENT_THREASHOLD:
            continue
        else:
            df = df.corr()
            df.drop('Result', axis=0, inplace=True)
            df.rename(columns={'Result': char}, inplace=True)
            corr_df[char] = df[char]

    # replace all nan values with 0
    corr_df.fillna(0, inplace=True)
    # drop a column if the entire column is 0
    for col in corr_df.columns:
        if corr_df[col].sum() == 0:
            corr_df.drop(col, axis=1, inplace=True)
    corr_df.drop('Description', axis=0, inplace=True)
    corr_df = corr_df.reset_index().rename(columns={'index': 'survey_type'})
    
    corrs[desc] = corr_df

In [7]:
for key, cor in corrs.items():
    cor['Description'] = key
    print(key)
    print(cor.shape)

Watershed
(27, 23)
Flowline100ft
(27, 23)
Waterbody100ft
(27, 23)
Buffer100ftWBFL
(27, 23)
Buffer250ftWaterbody
(27, 23)


In [8]:
corrs_final = pd.concat(corrs, axis=0, ignore_index=True)
print(corrs_final.shape)

(135, 23)


In [9]:
# melt the data for visualization
corr_df_for_vis = pd.melt(corrs_final, id_vars=['survey_type', 'Description'], var_name='CharacteristicID', value_name='correlation')
# merge CharacteristicName
corr_df_for_vis = corr_df_for_vis.merge(merged_data[['CharacteristicID', 'CharacteristicName']].drop_duplicates(), how='left', on='CharacteristicID')
# lets take the abs value of the corelation column so that we can compare them
# corr_df_for_vis['correlation'] = np.abs(corr_df_for_vis['correlation'])
print(corr_df.shape)
corr_df_for_vis.sample(5)

(27, 23)


Unnamed: 0,survey_type,Description,CharacteristicID,correlation,CharacteristicName
2107,GRASS_SHRUBS_acres,Buffer100ftWBFL,TK,0.091583,Total Potassium
2437,RAILROADS_acres,Watershed,TN,-0.09651,Total Nitrogen
227,Ag_Total_acres,Buffer100ftWBFL,SECCHI,-0.088248,Secchi transparency
163,GRASS_SHRUBS_acres,Flowline100ft,SECCHI,-0.091669,Secchi transparency
538,Wet_Scrub_Shrub_acres,Buffer250ftWaterbody,COND,0.05561,Conductivity


### Section 4: Heatmap Visualization

In [21]:
options = ['Watershed', 'Buffer100ftWBFL', 'Buffer250ftWaterbody']
desc_dropdown = alt.binding_select(options=options, name='Description ')
desc_condition = alt.selection_single(fields=['Description'], init={'Description': options[0]}, bind=desc_dropdown)

chart = alt.Chart(corr_df_for_vis).mark_rect().encode(
    x=alt.X('survey_type:N', axis=alt.Axis(title=None)),
    y=alt.Y('CharacteristicName:N', axis=alt.Axis(title=None)),
    color=alt.condition(alt.datum.correlation == 0, alt.value('white'), alt.Color('correlation:Q')),
    tooltip=[alt.Tooltip('CharacteristicName'), alt.Tooltip('CharacteristicID'), alt.Tooltip('survey_type', title='Survey Type'), alt.Tooltip('correlation')]
).add_selection(
    desc_condition
).transform_filter(
    desc_condition
).properties(
    title=alt.TitleParams('Correlation heatmap of chemical measurements and Land use')
)
display(chart)

### Section 5: Merging the Data and calculating the correlations for Health Metric Bar chart

In [22]:
# bar chart with health metric
survey = survey_data[cols_from_survey]
survey = survey.merge(health_metric, how='inner', left_on='LakeID', right_on='Lake').drop('Lake', axis=1)

In [23]:
results_df = pd.DataFrame(index=survey.columns[2 : -1])
for desc in survey['Description'].unique():
    df = survey[survey['Description'] == desc]
    corr_df = df.corr()
    corr_df.drop('Health_Score', axis=0, inplace=True)
    results_df[desc] = corr_df['Health_Score']
results_df = results_df.reset_index().rename(columns={'index': 'survey_type'})
results_df = pd.melt(results_df, id_vars='survey_type', var_name='Description', value_name='correlation')
results_df

Unnamed: 0,survey_type,Description,correlation
0,TREE_CANOPY_acres,Watershed,-0.035824
1,GRASS_SHRUBS_acres,Watershed,0.067897
2,BARE_SOIL_acres,Watershed,0.061495
3,WATER_acres,Watershed,0.038861
4,BUILDINGS_acres,Watershed,0.130100
...,...,...,...
130,TC_Total_acres,Buffer250ftWaterbody,-0.035261
131,Wet_Emergent_acres,Buffer250ftWaterbody,0.381516
132,Wet_Forested_acres,Buffer250ftWaterbody,-0.038068
133,Wet_Scrub_Shrub_acres,Buffer250ftWaterbody,0.086818


### Section 6: Health metric and land use survey bar chart

In [24]:
alt.Chart(results_df).mark_bar().encode(
    x=alt.X('correlation:Q'),
    y=alt.Y('survey_type', sort='-x')
).add_selection(
    desc_condition
).transform_filter(
    desc_condition
).properties(
    title=alt.TitleParams('Survey categories correlated with the derived Health Meric')
)