# Data Preperation
The resulting data from this notebook will combine the positional data (latitude, longitude, town) of the ChemDataForJeffOlson.csv and all of the tables from the Land use survey.
## Merge all tables from the land use survey
Here we run through the files from the land use survey. All files have the same colum names and thus we just need to append them all. I created a dataframe from the first file, then ran through the rest of them turning each into a dataframe then appending it to the first.

In [1]:
import os
import re
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

In [2]:
# set the path to the folder with all the tables
land_use_folder = 'assets/Tables'

# set the path to the chem data file
chem_data_file_path = 'assets/ChemDataForJeffOlson.csv'

# set the path to the population data (Optional)
use_population = True
population_path = 'assets/HS-STAT-Population-of-Vermont-towns-1930-2019.xls'

# set the path to the characteristics data
characteristics_path = 'assets/Characteristic.csv'

# set the save path to the resulting cleaned chem data
chem_data_save_path = 'assets/chem_data_merged.csv'

# set the save path to the resulting durvey data file
survay_save_path = 'assets/combined_tables.csv'

In [3]:
# combine all tables from Table Folder in assets folder

def merge_tables_folder(tables_folder):
    # get the first file as data frame to append to
    file_1 = os.listdir(tables_folder)[0]
    tables_df = pd.read_excel(os.path.join(tables_folder, file_1))
    
    # add file name to df
    tables_df['from_file'] = file_1
    
    # run through the rest of the files and append them to the data frame
    for file in os.listdir(tables_folder)[1:]:
        df = pd.read_excel(os.path.join(tables_folder, file))
        df['from_file'] = file
        tables_df = pd.concat((tables_df, df), axis=0)
    
    # drop OBJECTID column and reset index
    tables_df = tables_df.drop('OBJECTID', axis=1).reset_index(drop=True)
    return tables_df

In [4]:
combined_tables_df = merge_tables_folder(land_use_folder)

In [5]:
combined_tables_df.sample(5)

Unnamed: 0,Description,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Crops_acres,Ag_Hay_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file
597,Watershed_CENTER,22070.660319,15302980.0,3421.538518,297.761278,14.718315,19.502206,2.70475,11.968531,13.249152,0.0,0.0,25.989071,0.0,25.989071,35.962134,2.967231,15.876573,21.877637,0.0,76.683575,19.521338,19.521338,1079.179106,2346.367504,3425.54661,24.007259,343.810364,6.863712,374.681335,AOIs_CENTER.xls
200,VALLEY_Buffer100ftWBFL,13895.646044,252109.4,50.627984,4.78698,0.057514,6.127648,0.227337,0.232217,0.231167,0.0,0.0,0.0,0.0,0.0,0.065753,0.266498,0.275028,0.482944,0.0,1.090223,0.121568,0.121568,25.748574,25.015865,50.764439,0.97981,4.294422,0.354032,5.628263,AOIs_VALLEY.xlsx
107,HALFMOON_Waterbody100ft,3619.318491,54020.72,11.267864,1.697055,0.0,0.192001,0.013405,0.128556,0.047321,0.0,0.0,0.0,0.0,0.0,0.0,0.022023,0.059463,0.471715,0.0,0.553202,0.0,0.0,6.839524,4.499734,11.339258,0.421819,1.63673,0.0,2.058549,AOIs_HALFMOON.xls
359,Waterbody100ft_SHADOWGLOVER,8272.732374,123753.4,16.571356,7.104145,0.137205,1.089424,1.610939,1.81684,2.247111,0.0,0.0,0.0,0.0,0.0,0.164473,1.88083,2.649203,2.314708,0.0,7.009214,0.0,0.0,11.793413,4.869642,16.663055,0.0,2.397023,0.0,2.397023,AOIs_SHADOWGLOVER.xls
690,Waterbody100ft_LITTLEAVERILL,13622.925627,202731.8,39.026714,7.783499,0.413839,2.498355,0.163028,0.0,0.198734,0.0,0.0,0.0,0.0,0.0,0.594093,0.190534,0.249892,0.131643,0.0,1.166162,2.444498,2.444498,26.092319,12.992356,39.084675,6.600185,7.407025,1.054873,15.062084,AOIs_LITTLEAVERILL.xls


## Split the Description column to get LakeIDs
The Description column has the LakeID attached to a description. depending on the lake they may be LakeID then description or description then LakeID. the function below splits on an underscore and takes the part of the newly created list that is all uppercase as the LakeID and leaves the rest as the description.

In [6]:
# now I want to solit the description column in to two columns 
# one corresponds to the title area and the other corresponds to the description
# depending on the folder they are in different orders

def split_description(df):
    df['Description'] = df['Description'].str.split('_')
    df['LakeID'] = df['Description'].apply(lambda row: row[0] if row[0].isupper() else row[1])
    df['Description'] = df['Description'].apply(lambda row: row[0] if not row[0].isupper() else row[1])
    return df

In [7]:
combined_tables_df = split_description(combined_tables_df)
combined_tables_df.sample(5)

Unnamed: 0,Description,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Crops_acres,Ag_Hay_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file,LakeID
349,Waterbody100ft,11265.775925,172264.0,29.673295,8.87978,0.102363,0.556295,0.160804,3.008874,0.182549,0.0,0.0,0.261202,0.0,0.261202,0.121394,0.179635,0.192327,3.367415,0.0,3.860771,0.0,0.0,9.814316,19.918627,29.732943,0.042003,0.815254,0.0,0.857257,AOIs_WOODWARD.xlsx,WOODWARD
571,Buffer250ftWaterbody,6657.147863,245115.1,43.526187,14.494191,0.0,1.809241,0.176865,0.390364,0.169514,0.0,0.0,0.0,0.0,0.0,0.0,0.17492,0.241462,0.83455,0.0,1.250931,2.797608,2.797608,20.629263,23.075595,43.704857,3.269034,16.330403,9.756385,29.355822,AOIs_TURTLEHEAD.xlsx,TURTLEHEAD
690,Waterbody100ft,13622.925627,202731.8,39.026714,7.783499,0.413839,2.498355,0.163028,0.0,0.198734,0.0,0.0,0.0,0.0,0.0,0.594093,0.190534,0.249892,0.131643,0.0,1.166162,2.444498,2.444498,26.092319,12.992356,39.084675,6.600185,7.407025,1.054873,15.062084,AOIs_LITTLEAVERILL.xls,LITTLEAVERILL
369,Waterbody100ft,10008.032801,148109.8,22.017055,13.35028,0.013405,0.906258,0.203615,0.028294,0.076479,0.0,0.0,0.0,0.0,0.0,0.013822,0.21434,0.080713,0.07658,0.0,0.385455,8.407963,8.407963,8.582187,13.50193,22.084117,14.41419,1.876452,0.920049,17.210691,AOIs_NINEVAH.xlsx,NINEVAH
352,Watershed,10246.114857,1459007.0,314.915863,33.925107,0.196263,0.919354,1.885967,3.606374,5.07807,0.0,0.0,0.0,0.0,0.0,0.231707,2.029064,5.893192,6.358925,0.0,14.512888,11.494238,11.494238,120.180496,195.13578,315.316276,0.292679,37.480178,0.0,37.772856,AOIs_NEWARK.xlsx,NEWARK


## Merge relevant columns from the chem data and population datasets to survey data
For now I will just take the 'LakeID', 'Lat', 'Long', 'Town' from the chem data dataset. for other use later i will also extract the measurements when I know what year the land use survey was done. I will need to fix the LakeIDs in the combined_tables_df dataframe so that they match up with the LakeIDs in the chem dataset. This is the column that i will join on.

In [8]:
chem_data_df = pd.read_csv(chem_data_file_path)
chem_data_df.sample(5)

Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode
55708,CRANBERRY MEADOW,1,Pelagic,44.4211,-72.4575,Woodbury,SpringTP,1992-05-27 00:00:00,1,,Hose,6.0,Reg,TP,,15.0,Y,,,
47991,CHIPMAN,1,Pelagic,43.4089,-73.0317,Tinmouth,Laymon,2018-07-23 00:00:00,1,1850.0,Secchi,,Reg,Secchi,,3.1,Y,,B,
131380,ISLAND,1,Pelagic,44.8075,-71.8733,Brighton,NLA,2007-07-12 00:00:00,1,1101.0,Hydrolab,10.0,Reg,TempC,,9.3,Y,,,
209235,SALEM,1,Pelagic,44.93,-72.1044,Derby,Laymon,2014-07-22 00:00:00,1,1420.0,Secchi,,Reg,Secchi,,4.3,Y,,,
185439,PARKER,1,Pelagic,44.71926,-72.23425,Glover,Laymon,1995-07-12 00:00:00,1,1735.0,Hose,8.8,R1,Chla,,7.0,Y,,,


In [9]:
if use_population:    
    population = pd.read_excel(population_path, skiprows=4, index_col='CTC')
    # lets take the relevent columns from the population dataset and melt them so we ca easily join the dataframes
    population = population[['NAME'] + list(population.columns[-32:])]
    population.rename(columns={'NAME': 'Town'}, inplace=True)
    population.sample(5)

In [10]:
# now lets convert town names in the chem dataset to uppercase to match the town names in the population data set
chem_data_df['Town'] = chem_data_df['Town'].str.upper()
# we also need to extract the year from the chem data set so that we can join the population data
# to the chem data by year and Town
chem_data_df['VisitDate'] = pd.to_datetime(chem_data_df['VisitDate'])
chem_data_df['year'] = chem_data_df['VisitDate'].dt.year

In [11]:
chem_to_add = chem_data_df[['LakeID', 'Lat', 'Long', 'Town']].copy()
chem_to_add.drop_duplicates(inplace=True)

# get all lake ids that are also in usage survey and fix them
for lake_id in np.unique(chem_to_add['LakeID']):
    lake_id_fixed = re.sub('[()\s;]', '', lake_id)
    index = combined_tables_df[combined_tables_df['LakeID'] == lake_id_fixed].index
    combined_tables_df.loc[index, 'LakeID'] = lake_id


In [12]:
# merge with the chem_to_add data frame
new_data_df = combined_tables_df.merge(chem_to_add, how='left', on='LakeID')

# reorder columns
cols = list(new_data_df.columns)

to_front = ['LakeID', 'Description', 'Lat', 'Long', 'Town']
for col in to_front:
    cols.remove(col)

cols = to_front + cols
new_data_df = new_data_df.reindex(columns=cols)

new_data_df.head(5)

Unnamed: 0,LakeID,Description,Lat,Long,Town,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Crops_acres,Ag_Hay_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file
0,HORSE,Watershed,44.6169,-72.2108,GREENSBORO,9262.63362,2107941.0,468.319079,45.532062,0.034904,3.322265,0.043058,3.534652,0.097545,0.0,0.0,0.0,0.0,0.0,0.067911,0.042648,0.116058,5.195567,0.0,5.422184,0.946536,0.946536,210.794069,258.350785,469.144853,14.288349,77.086939,5.588561,96.963848,AOIs_HORSE.xls
1,HORSE,Flowline100ft,44.6169,-72.2108,GREENSBORO,1500.131136,41761.64,4.545867,3.129955,0.0,2.491621,0.0,0.150858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151524,0.0,0.151524,0.279845,0.279845,2.970089,1.629099,4.599187,3.341299,2.996817,0.908847,7.246963,AOIs_HORSE.xls
2,HORSE,Waterbody100ft,44.6169,-72.2108,GREENSBORO,4447.43486,62695.52,6.33812,8.10455,0.0,0.834165,0.022425,0.093591,0.096186,0.0,0.0,0.0,0.0,0.0,0.0,0.022314,0.114463,0.17765,0.0,0.314428,0.011841,0.011841,5.278762,1.092946,6.371707,5.845567,2.616113,2.488963,10.950643,AOIs_HORSE.xls
3,HORSE,Buffer100ftWBFL,44.6169,-72.2108,GREENSBORO,5715.5097,102000.7,10.868234,10.706442,0.0,3.262898,0.022425,0.244449,0.096186,0.0,0.0,0.0,0.0,0.0,0.0,0.022314,0.114463,0.329174,0.0,0.465952,0.291685,0.291685,8.234746,2.720674,10.95542,8.581423,5.612931,3.397809,17.592163,AOIs_HORSE.xls
4,HORSE,Buffer250ftWaterbody,44.6169,-72.2108,GREENSBORO,4647.79828,154538.8,23.735053,12.987901,0.0,0.874999,0.022425,0.466102,0.097545,0.0,0.0,0.0,0.0,0.0,0.0,0.022314,0.116058,0.614319,0.0,0.752692,0.60402,0.60402,16.834859,7.006856,23.841715,9.646655,8.976926,3.505087,22.128668,AOIs_HORSE.xls


In [13]:
# now since the land use survey was conducted from 2013 - 2016 I will merge the population data for 2016 joining on the town names
if use_population:    
    pop2016 = population[['Town', 2016]].copy()
    pop2016.rename(columns={2016: '2016_population'}, inplace=True)
    new_data_df = new_data_df.merge(pop2016, how='left', on='Town')
    print('Final survey data DataFrame shape: ', new_data_df.shape)
    new_data_df.sample(5)

Final survey data DataFrame shape:  (773, 36)


## Save the survey dataframe
Lets save the resulting dataframe as a .csv for later use

In [14]:
# save the data frame as a .csv
new_data_df.to_csv(survay_save_path)

## Merge the population dataset and the chem data dataset
Here we will take the population dataset and melt it so that the years are also in the rows sp that we can merge on both town and year data.

In [15]:
# melt the population dataframe
if use_population:
    population_1 = population.melt(id_vars='Town', var_name='year', value_name='population')
    chem_data_df = chem_data_df.merge(population_1, how='left', on=['Town', 'year'])
    chem_data_df.sample(5)

In [16]:
# Now so that we have measurements 

characteristics = pd.read_csv(characteristics_path)
characteristics.columns = ['CharacteristicID', 'CharacteristicName', 'UnitCode', 'SampleFraction']
characteristics.sample(5)

Unnamed: 0,CharacteristicID,CharacteristicName,UnitCode,SampleFraction
73,TCl,Total Chloride,mg/l,Total
82,TK,Total Potassium,mg/l,Total
1,BOD5,"Biological Oxygen Demand, 5 day",mg/l,
75,TCr,Total Chromium,ug/l,Total
66,TAl,Total Aluminum,ug/l,Total


In [17]:
# Merge characteristics df
chem_data_df_final = chem_data_df.merge(characteristics, how='left', on='CharacteristicID')
print('Final chem data Data frame shape: ', chem_data_df_final.shape)
chem_data_df_final.sample(5)

Final chem data Data frame shape:  (284912, 25)


Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode,year,population,CharacteristicName,UnitCode,SampleFraction
91429,GREAT AVERILL,1,Pelagic,44.98632,-71.7001,NORTON,SpringTP,2017-05-16,1,1101.0,Hydrolab,13.07,Reg,pH,,6.65,Y,,,,2017,161.0,pH,,
195728,RAPONDA,1,Pelagic,42.87608,-72.81786,WILMINGTON,LayMon,2011-07-18,1,1135.0,BottleGrab,0.5,R1,Chla,,3.16,Y,,,,2011,1865.0,Chlorophyll-a,ug/l,
90990,GREAT AVERILL,1,Pelagic,44.98632,-71.7001,NORTON,SpringTP,2009-05-01,1,1538.0,Hydrolab,15.0,Reg,TempC,,5.74,Y,,,,2009,172.0,Temperature,deg C,
129820,IROQUOIS,1,Pelagic,44.3658,-73.0833,HINESBURG,LayMon,2011-08-05,1,1820.0,Secchi,,Reg,Secchi,,2.6,Y,,,,2011,4425.0,Secchi transparency,m,
53669,COLES,1,Pelagic,44.5058,-72.215,WALDEN,Laymon,2018-08-04,1,1135.0,Secchi,,Reg,Secchi,,3.4,Y,,,,2018,949.0,Secchi transparency,m,


## Save the chem data DataFrame
Lets save the resulting dataframe as a .csv for later use

In [18]:
chem_data_df_final.to_csv(chem_data_save_path)

## Documantation

In [19]:
%load_ext watermark
%watermark --iversions

re    : 2.2.1
pandas: 1.3.4
numpy : 1.21.2

