## Chemical Measurements and Land Use
Visualizations here deal with the land use survey and the interaction/correlation with the chemical measurements data
### Section 1: Data

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import IPython
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')

alt.renderers.enable('default')
alt.data_transformers.enable('json')

# show Max Rows
pd.set_option('display.max_columns', None)

In [2]:
# import the data
chem_data = pd.read_csv('assets/chem_data_merged.csv', index_col=0)
survey_data = pd.read_csv('assets/combined_tables.csv', index_col=0)
health_metric = pd.read_csv('assets/health_metric.csv', index_col=0)
print('The size of the chem_data dataframe')
print('columns:', chem_data.shape[1])
print('rows   :', chem_data.shape[0])
display(chem_data.sample(5))
print('The size of the survey_data dataframe')
print('columns:', survey_data.shape[1])
print('rows   :', survey_data.shape[0])
display(survey_data.sample(5))
print('The size of the health_metric dataframe')
print('columns:', health_metric.shape[1])
print('rows   :', health_metric.shape[0])
display(health_metric.sample(5))

The size of the chem_data dataframe
columns: 26
rows   : 284535


Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode,year,population,CharacteristicName,UnitCode,SampleFraction,NormResult
164301,METCALF,1,Pelagic,44.72883,-72.8833,FLETCHER,Laymon,1981-08-18,1,1745.0,Secchi,,Reg,SECCHI,,3.0,Y,,,,1981,,Secchi transparency,m,,0.18936
200832,ROOD,1,Pelagic,44.07672,-72.58721,WILLIAMSTOWN,SpringTP,1998-04-28,1,953.0,Hydrolab,15.5,Reg,PH,,7.24,Y,,,,1998,3153.0,pH,,,0.5375
131455,ISLAND,1,Pelagic,44.8075,-71.8733,BRIGHTON,Laymon,2013-06-01,1,1000.0,Secchi,,Reg,SECCHI,,2.5,Y,,,,2013,1202.0,Secchi transparency,m,,0.157695
55777,CRESCENT,1,Pelagic,43.8025,-72.4083,SHARON,SpringTP,2003-05-06,1,,Kemmerer,1.0,Reg,REGALK,,60.9,Y,,J,,2003,1454.0,Alkalinity,mg/l,,0.323576
239841,STAMFORD,1,Pelagic,42.8222,-73.0656,STAMFORD,AcidLake,2012-04-04,1,918.0,PlasticKemm,1.0,Reg,PH,,5.67,Y,,,,2012,818.0,pH,,,0.292187


The size of the survey_data dataframe
columns: 35
rows   : 773


Unnamed: 0,LakeID,Description,Lat,Long,Town,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file
446,LYFORD,Buffer250ftWaterbody,44.44499,-72.25037,WALDEN,5049.037711,178035.0,25.966226,14.772987,0.146039,0.739894,0.842443,0.768188,0.752867,0.0,4.270728,0.0,0.0,4.270728,0.143128,0.922725,0.803939,1.457908,0.0,3.3277,1.363696,1.363696,17.08315,8.997516,26.080667,1.689867,9.961012,1.44246,13.093339,AOIs_LYFORD.xls
430,LONG (WESTMR),Buffer100ftWBFL,44.75274,-72.01827,WESTMORE,11082.692468,219620.2,44.030404,6.078845,0.083027,3.568752,0.285097,0.05461,0.15895,0.0,0.0,0.0,0.0,0.0,0.095084,0.311064,0.174565,0.2501,0.0,0.830813,0.0,0.0,26.960429,17.142511,44.102941,1.535361,6.712663,2.559917,10.807941,AOIs_LONGWESTMR.xls
497,MOREY,Watershed,43.9247,-72.1533,FAIRLEE,29184.038434,16111380.0,3666.238383,258.456077,1.23954,6.284621,7.04484,26.466552,15.47631,0.0,24.816965,0.0,0.0,24.816965,1.448227,8.327905,17.109996,38.801259,0.0,65.687387,22.864266,22.864266,1445.031792,2223.468782,3668.500574,19.231006,94.247493,18.853389,132.331888,AOIs_MOREY.xls
690,STOUGHTON,Buffer100ftWBFL,43.3781,-72.5,WEATHERSFIELD,247471.498256,7654625.0,1391.678928,370.430163,3.627254,99.228719,2.399266,16.702012,7.218678,0.0,151.723982,0.072109,20.421005,172.217096,4.125407,2.511842,7.997021,28.833995,0.0,43.468265,47.429428,47.429428,458.688012,935.247023,1393.935035,98.072321,310.733524,82.795904,491.601749,AOIs_STOUGHTON.xlsx
162,ECHO (CHARTN),Waterbody100ft,44.86231,-71.99507,CHARLESTON,13820.829492,208967.8,28.747763,15.769562,0.279105,1.881705,1.414305,1.763527,1.773225,0.0,2.424662,0.0,0.0,2.424662,0.322078,1.52016,2.056316,2.132835,0.0,6.031389,0.0,0.0,15.120139,13.784907,28.905046,1.676447,11.551497,0.265099,13.493043,AOIs_ECHOCHARTN.xls


The size of the health_metric dataframe
columns: 2
rows   : 90


Unnamed: 0,Lake,Health_Score
47,MILES,1.840927
14,SEYMOUR,1.839372
11,GREENWOOD,1.076986
85,DUNMORE,0.089848
59,CRYSTAL (BARTON),2.01326


### Section 2: Merging the Data for Heatmap Visulaization

In [3]:
# for this visulaization we need the chem_data from 2013 - 2016 and the LakeID to merge on, the 'CharacteristicID,
# CharacteristicName, and the Result
chem_data_to_merge = chem_data[((chem_data['year'] <= 2016) & (chem_data['year'] >= 2013))]
chem_data_to_merge = chem_data_to_merge[['LakeID', 'CharacteristicID', 'CharacteristicName', 'Result']].copy()

# Now we Groupby LakeID as we want one measurement: the mean result of each measurement type per lake.
chem_data_to_merge = chem_data_to_merge.groupby(['LakeID', 'CharacteristicID', 'CharacteristicName']).mean().reset_index()
chem_data_to_merge.sample(10)

Unnamed: 0,LakeID,CharacteristicID,CharacteristicName,Result
3620,SUGAR HOLLOW,SECCHI,Secchi transparency,1.1
3580,STRATTON,GRANALK,Alkalinity measured using Gran Alkalinity,1.743333
1024,GILLETT,CHLA,Chlorophyll-a,3.43
2976,RUNNEMEDE,TN,Total Nitrogen,0.21
2148,LOWER WINOOSKI;,TURBNTU,Turbidity,23.875
358,BROWN,TAL,Total Aluminum,50.0
422,BURR (PITTFD),SECCHI,Secchi transparency,5.0
1602,KETTLE (BRATTLEBORO),TNA,Total Sodium,119.8
2738,PECKS,TURBNTU,Turbidity,0.0
2927,ROUND (HOLLND),TP,Total Phosphorus,11.955


In [4]:
# Combine the subcategories in the description
cols_from_survey = ['LakeID', 'Description', 'TREE_CANOPY_acres', 'GRASS_SHRUBS_acres', 'BARE_SOIL_acres', 'WATER_acres', 
                    'BUILDINGS_acres','ROADS_acres', 'OTHER_PAVED_acres', 'RAILROADS_acres', 'Ag_Hay_acres', 
                    'Ag_Crops_acres', 'Ag_Pasture_acres', 'Ag_Total_acres', 'Imp_Bare_Soil_acres', 
                    'Imp_Buildings_acres', 'Imp_Other_Paved_acres', 'Imp_Road_acres', 'Imp_Railroad_acres', 
                    'Imp_Total_acres', 'Shrub_Shrubs_acres', 'Shrub_Total_acres', 'TC_Coniferous_acres', 
                    'TC_Deciduous_acres', 'TC_Total_acres', 'Wet_Emergent_acres', 'Wet_Forested_acres', 
                    'Wet_Scrub_Shrub_acres', 'Wet_Total_acres']
# survey_data_grouped = survey_data[cols_from_survey].groupby('LakeID').sum().reset_index()
survey_data_grouped = survey_data[cols_from_survey]
survey_data_grouped.sample(10)

Unnamed: 0,LakeID,Description,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres
321,HOLLAND,Watershed,4173.599805,225.055388,1.868237,107.125577,1.209826,4.31538,3.010604,0.0,26.556409,0.0,0.0,26.556409,2.125118,1.328209,3.699084,7.298394,0.0,14.450805,49.533898,49.533898,1586.558591,2589.134115,4175.692706,136.683387,1146.262926,45.153158,1328.099471
13,AMHERST,Buffer100ftWBFL,933.931124,108.734354,2.579467,48.460503,1.008991,10.993207,4.160322,0.0,4.370632,0.0,1.431078,5.801711,3.184252,1.096061,4.650435,16.147106,0.0,25.077854,37.126099,37.126099,165.607611,769.279091,934.886702,56.426981,241.015514,49.503565,346.946061
38,BIG,Buffer100ftWBFL,43.447793,5.801346,0.0,0.683184,1.03086,0.520774,1.521611,0.0,0.0,0.0,0.0,0.0,0.0,1.065426,1.759635,0.570527,0.0,3.395588,0.0,0.0,13.823144,29.688281,43.511425,0.0,6.42655,0.0,6.42655
17,ATHENS,Waterbody100ft,7.496857,1.891342,0.017977,0.440218,0.0,0.144062,0.072278,0.0,0.0,0.0,0.0,0.0,0.017234,0.0,0.071809,0.561686,0.0,0.650729,0.0,0.0,3.973165,3.54099,7.514155,1.217739,0.831646,0.311238,2.360624
530,NORTON,Buffer100ftWBFL,1320.619252,93.230987,1.561271,44.281463,0.656929,4.342685,2.81607,3.953,0.0,0.0,0.0,0.0,1.906303,0.853834,3.460954,6.387541,4.263762,16.872395,6.290469,6.290469,349.068474,972.989197,1322.057671,72.790009,282.687835,23.263467,378.741311
163,ECHO (CHARTN),Buffer100ftWBFL,504.6757,158.021485,0.86042,177.81855,2.486309,5.8714,6.050119,0.0,67.9615,0.0,0.0,67.9615,1.033193,2.638996,6.653821,7.430658,0.0,17.756668,10.357996,10.357996,215.532652,290.23117,505.763822,49.947719,166.5797,27.37692,243.904339
7,ADAMS (WOODFD),Waterbody100ft,15.77506,0.583353,0.029158,1.074907,0.0,0.0,0.042996,0.0,0.0,0.0,0.0,0.0,0.05798,0.0,0.163275,0.0,0.0,0.221256,0.0,0.0,7.591926,8.177718,15.769644,0.0,2.195926,0.0,2.195926
748,WINONA,Watershed,1833.36781,1239.345985,5.494318,23.828768,7.041072,10.596295,23.540211,0.0,198.80208,388.739568,4.643474,592.185122,5.652009,7.332573,24.322106,15.359321,0.0,52.666009,510.011264,510.011264,430.463379,1403.866829,1834.330207,511.983282,140.670321,33.952064,686.605667
691,STOUGHTON,Buffer250ftWaterbody,35.793901,6.899048,0.18712,0.576496,0.023351,0.51021,0.314812,0.0,0.00616,0.0,0.0,0.00616,0.192712,0.02368,0.31679,0.715795,0.0,1.248977,3.848891,3.848891,29.489583,6.345927,35.83551,1.280229,2.067088,3.556576,6.903893
583,RESCUE,Flowline100ft,1565.557079,172.970349,3.083623,153.455047,1.785828,15.340217,6.463711,0.0,7.064728,0.448112,6.403616,13.916455,3.740297,1.935613,7.1833,28.2652,0.0,41.124409,39.652373,39.652373,298.858161,1268.233735,1567.091896,81.702966,368.274013,66.028919,516.005898


In [5]:
merged_data = survey_data_grouped.merge(chem_data_to_merge, how='inner', on='LakeID').drop('LakeID', axis=1)
print(merged_data.shape)

(6950, 31)


### Section 3: Calculating the Correlations for Heatmap Visulaization

In [6]:
MEASUREMENT_THREASHOLD = 20
corrs = {}
for desc in merged_data['Description'].unique():
    corr_df = pd.DataFrame(index=merged_data.columns[:-3])
    for char in merged_data['CharacteristicID'].unique():
        df = merged_data[((merged_data['CharacteristicID'] == char) & (merged_data['Description'] == desc))]
        # because we are calculating correlations if there are measuremetns with less than 5 measurements
        #  we will drop them as the correlation wont make sense or will be misleading
        if df.shape[0] < MEASUREMENT_THREASHOLD:
            continue
        else:
            df = df.corr()
            df.drop('Result', axis=0, inplace=True)
            df.rename(columns={'Result': char}, inplace=True)
            corr_df[char] = df[char]

    # replace all nan values with 0
    corr_df.fillna(0, inplace=True)
    # drop a column if the entire column is 0
    for col in corr_df.columns:
        if corr_df[col].sum() == 0:
            corr_df.drop(col, axis=1, inplace=True)
    corr_df.drop('Description', axis=0, inplace=True)
    corr_df = corr_df.reset_index().rename(columns={'index': 'survey_type'})
    
    corrs[desc] = corr_df

In [7]:
for key, cor in corrs.items():
    cor['Description'] = key
    print(key)
    print(cor.shape)

Watershed
(27, 23)
Flowline100ft
(27, 23)
Waterbody100ft
(27, 23)
Buffer100ftWBFL
(27, 23)
Buffer250ftWaterbody
(27, 23)


In [8]:
corrs_final = pd.concat(corrs, axis=0, ignore_index=True)
print(corrs_final.shape)

(135, 23)


In [9]:
# melt the data for visualization
corr_df_for_vis = pd.melt(corrs_final, id_vars=['survey_type', 'Description'], var_name='CharacteristicID', value_name='correlation')
# merge CharacteristicName
corr_df_for_vis = corr_df_for_vis.merge(merged_data[['CharacteristicID', 'CharacteristicName']].drop_duplicates(), how='left', on='CharacteristicID')
# lets take the abs value of the corelation column so that we can compare them
# corr_df_for_vis['correlation'] = np.abs(corr_df_for_vis['correlation'])
print(corr_df.shape)
corr_df_for_vis.sample(5)

(27, 23)


Unnamed: 0,survey_type,Description,CharacteristicID,correlation,CharacteristicName
2190,WATER_acres,Flowline100ft,TMG,-0.113697,Total Magnesium
2432,BARE_SOIL_acres,Watershed,TN,-0.06649,Total Nitrogen
1655,Ag_Hay_acres,Flowline100ft,TCA,0.055866,Total Calcium
975,WATER_acres,Flowline100ft,CHLAPROBE,-0.145619,Chlorophyll-a (probe)
579,Imp_Bare_Soil_acres,Flowline100ft,PH,-0.150901,pH


### Section 4: Heatmap Visualization

In [10]:
options = corr_df_for_vis['Description'].unique()
desc_dropdown = alt.binding_select(options=options, name='Description ')
desc_condition = alt.selection_single(fields=['Description'], init={'Description': options[0]}, bind=desc_dropdown)

chart = alt.Chart(corr_df_for_vis).mark_rect().encode(
    x=alt.X('survey_type:N', axis=alt.Axis(title=None)),
    y=alt.Y('CharacteristicName:N', axis=alt.Axis(title=None)),
    color=alt.condition(alt.datum.correlation == 0, alt.value('white'), alt.Color('correlation:Q')),
    tooltip=[alt.Tooltip('CharacteristicName'), alt.Tooltip('CharacteristicID'), alt.Tooltip('survey_type', title='Survey Type'), alt.Tooltip('correlation')]
).add_selection(
    desc_condition
).transform_filter(
    desc_condition
).properties(
    title=alt.TitleParams('Correlation heatmap of chemical measurements and Land use')
)
display(chart)

### Section 5: Merging the Data and calculating the correlations for Health Metric Bar chart

In [11]:
# bar chart with health metric
survey = survey_data[cols_from_survey]
survey = survey.merge(health_metric, how='inner', left_on='LakeID', right_on='Lake').drop('Lake', axis=1)

In [12]:
results_df = pd.DataFrame(index=survey.columns[2 : -1])
for desc in survey['Description'].unique():
    df = survey[survey['Description'] == desc]
    corr_df = df.corr()
    corr_df.drop('Health_Score', axis=0, inplace=True)
    results_df[desc] = corr_df['Health_Score']
results_df = results_df.reset_index().rename(columns={'index': 'survey_type'})
results_df = pd.melt(results_df, id_vars='survey_type', var_name='Description', value_name='correlation')
results_df

Unnamed: 0,survey_type,Description,correlation
0,TREE_CANOPY_acres,Watershed,-0.035824
1,GRASS_SHRUBS_acres,Watershed,0.067897
2,BARE_SOIL_acres,Watershed,0.061495
3,WATER_acres,Watershed,0.038861
4,BUILDINGS_acres,Watershed,0.130100
...,...,...,...
130,TC_Total_acres,Buffer250ftWaterbody,-0.035261
131,Wet_Emergent_acres,Buffer250ftWaterbody,0.381516
132,Wet_Forested_acres,Buffer250ftWaterbody,-0.038068
133,Wet_Scrub_Shrub_acres,Buffer250ftWaterbody,0.086818


### Section 6: Health metric and land use survey bar chart

In [13]:
alt.Chart(results_df).mark_bar().encode(
    x=alt.X('correlation:Q'),
    y=alt.Y('survey_type', sort='-x')
).add_selection(
    desc_condition
).transform_filter(
    desc_condition
).properties(
    title=alt.TitleParams('Survey categories correlated with the derived Health Meric')
)

## Documentation

In [14]:
%load_ext watermark
%watermark --iversions

altair : 4.1.0
sys    : 3.9.9 | packaged by conda-forge | (main, Dec 20 2021, 02:36:06) [MSC v.1929 64 bit (AMD64)]
IPython: 7.29.0
numpy  : 1.21.5
pandas : 1.3.4

