# Data Preparation
The resulting data from this notebook will combine the positional data (latitude, longitude, town) of the ChemDataForJeffOlson.csv and all of the tables from the Land use survey.
## Merge all tables from the land use survey
Here we run through the files from the land use survey. All files have the same column names, so we just need to append them all. I create a DataFrame from the first file, then run through the rest of them, turning each into a DataFrame and appending it to the first.

In [24]:
import os
import re
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

In [25]:
# Configuration: every input and output location used by this notebook.

# set the path to the folder with all the land use survey tables
land_use_folder = 'assets/Tables'

# set the path to the chem data file
chem_data_file_path = 'assets/ChemDataForJeffOlson.csv'

# set the path to the population data (optional):
# the population cells below only run when use_population is True
use_population = True
population_path = 'assets/HS-STAT-Population-of-Vermont-towns-1930-2019.xls'

# set the path to the characteristics data
characteristics_path = 'assets/Characteristic.csv'

# set the save path to the resulting cleaned chem data
chem_data_save_path = 'assets/chem_data_merged.csv'

# set the save path to the resulting survey data file
# NOTE: the variable name is misspelled ("survay"); it is referenced under this
# name by the save cell further down, so it is left unchanged here.
survay_save_path = 'assets/combined_tables.csv'

In [26]:
# combine all tables from Table Folder in assets folder

def merge_tables_folder(tables_folder):
    """Read every table file in ``tables_folder`` and stack them into one DataFrame.

    Each file is read with ``pd.read_excel``; a ``from_file`` column holding the
    file name is added so every row can be traced back to its source file.

    Parameters
    ----------
    tables_folder : str
        Path to the folder containing the land use survey tables. All tables
        are expected to share the same column names, including ``OBJECTID``.

    Returns
    -------
    pandas.DataFrame
        All tables concatenated row-wise, without the ``OBJECTID`` column and
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no table files.
    KeyError
        If the combined table has no ``OBJECTID`` column.
    """
    # os.listdir returns entries in arbitrary order, so list once and sort to
    # make the row order reproducible. Sub-directories, hidden files (e.g.
    # .DS_Store) and Office lock files (~$name.xlsx) are not tables: skip them.
    table_files = sorted(
        name for name in os.listdir(tables_folder)
        if os.path.isfile(os.path.join(tables_folder, name))
        and not name.startswith(('.', '~$'))
    )
    if not table_files:
        raise ValueError(f'No table files found in {tables_folder!r}')

    # read everything first and concatenate once, instead of re-copying the
    # growing frame on every loop iteration
    frames = []
    for name in table_files:
        frame = pd.read_excel(os.path.join(tables_folder, name))
        frame['from_file'] = name
        frames.append(frame)

    tables_df = pd.concat(frames, axis=0, ignore_index=True)

    # drop OBJECTID column (the index was already reset by ignore_index)
    return tables_df.drop('OBJECTID', axis=1)

In [27]:
# read every table in the land use folder and stack them into one DataFrame
combined_tables_df = merge_tables_folder(land_use_folder)

In [28]:
# preview five randomly chosen rows of the combined survey table
combined_tables_df.sample(5)

Unnamed: 0,Description,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file
480,Buffer100ftWBFL_MILLER,9435.66188,214097.9,45.669328,6.439865,0.048247,0.616527,0.0,0.0,0.121514,0.0,0.0,0.0,0.0,0.0,0.048088,0.0,0.179358,0.0,0.0,0.227446,0.0,0.0,13.820943,31.888449,45.709392,3.676575,12.269789,1.310502,17.256865,AOIs_MILLER.xls
373,KENT_Flowline100ft,15840.418304,483649.2,109.393136,2.901507,0.0,6.474089,0.0,0.469438,0.258101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.28811,0.644015,0.0,0.932125,1.109614,1.109614,14.48733,94.96288,109.45021,0.543125,25.286517,1.433561,27.263203,AOIs_KENT.xls
163,ECHOCHARTN_Buffer100ftWBFL,117855.926503,3463652.0,504.6757,158.021485,0.86042,177.81855,2.486309,5.8714,6.050119,0.0,67.9615,0.0,0.0,67.9615,1.033193,2.638996,6.653821,7.430658,0.0,17.756668,10.357996,10.357996,215.532652,290.23117,505.763822,49.947719,166.5797,27.37692,243.904339,AOIs_ECHOCHARTN.xls
387,LEVI_Watershed,4472.352813,415389.3,100.841635,1.192652,0.0,0.609052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010713,0.0,0.0,0.0,0.010713,0.0,0.0,25.916626,74.960983,100.877608,0.188685,9.756769,1.004683,10.950137,AOIs_LEVI.xls
335,HORTONIA_Buffer250ftWaterbody,28286.456472,1021231.0,195.438557,43.989694,0.038301,2.943391,3.577772,2.468023,3.879919,0.0,2.39945,0.0,0.0,2.39945,0.067546,4.931689,4.392031,4.429904,0.0,13.82117,7.262658,7.262658,71.002189,125.367381,196.36957,7.392063,20.74313,1.588798,29.72399,AOIs_HORTONIA.xls


## Split the Description column to get LakeIDs
The Description column has the LakeID attached to a description. Depending on the lake, the order may be LakeID then description, or description then LakeID. The function below splits on an underscore, takes the part of the resulting list that is all uppercase as the LakeID, and leaves the rest as the description.

In [29]:
# now I want to split the description column into two columns:
# one corresponds to the title area (LakeID) and the other to the description.
# depending on the file they are in different orders

def split_description(df):
    """Split ``df['Description']`` into a ``LakeID`` and a shorter description.

    Each value is split on underscores. The first all-uppercase token becomes
    the LakeID; if no token is all uppercase the second token is used. The
    remaining tokens, re-joined with underscores, become the new description.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``Description`` column of the form
        ``<LakeID>_<description>`` or ``<description>_<LakeID>``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (modified in place) with ``Description``
        replaced and a ``LakeID`` column added.

    Raises
    ------
    ValueError
        If a value is not a string or contains no underscore. Every value is
        parsed before any column is written, so ``df`` is left untouched when
        this is raised.
    """
    def _parse(value):
        if not isinstance(value, str) or '_' not in value:
            raise ValueError(
                f'Cannot split Description {value!r} into a LakeID and a description'
            )
        tokens = value.split('_')
        # index of the first all-uppercase token; default to the second token
        lake_pos = next((i for i, token in enumerate(tokens) if token.isupper()), 1)
        # keep every other token so nothing is lost when there are more than two
        remainder = tokens[:lake_pos] + tokens[lake_pos + 1:]
        return tokens[lake_pos], '_'.join(remainder)

    # parse first, assign afterwards: a bad value must not leave df half-converted
    parsed = [_parse(value) for value in df['Description']]
    df['LakeID'] = [lake_id for lake_id, _ in parsed]
    df['Description'] = [description for _, description in parsed]
    return df

In [30]:
# split Description into LakeID + description, then preview five random rows
combined_tables_df = split_description(combined_tables_df)
combined_tables_df.sample(5)

Unnamed: 0,Description,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file,LakeID
518,Flowline100ft,32548.794856,1012339.0,158.517919,28.643114,0.610164,62.317348,0.0,0.041019,0.0,0.0,0.392109,0.0,0.0,0.392109,1.014645,0.0,0.0,0.055089,0.0,1.069735,3.948503,3.948503,55.527056,103.455637,158.982693,15.560348,70.082383,3.460039,89.102771,AOIs_NICHOLS.xlsx,NICHOLS
286,Flowline100ft,4196.497458,120391.8,24.156799,3.445323,0.0,2.035342,0.02366,0.08383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022659,0.0,0.217809,0.0,0.240467,0.0,0.0,5.83697,18.386139,24.223109,2.465962,3.033065,0.0,5.499027,AOIs_HALFMOON.xls,HALFMOON
457,Watershed,13931.777848,3907960.0,859.721168,98.341427,1.265301,5.381576,0.205282,0.307646,0.451584,0.0,58.605959,0.0,0.0,58.605959,1.365985,0.204772,0.523686,0.316876,0.0,2.411318,15.071677,15.071677,231.290978,628.685411,859.976389,16.084713,89.945276,18.407254,124.437243,AOIs_MAY.xlsx,MAY
196,Flowline100ft,37631.777379,1154576.0,221.626621,46.810521,0.230981,9.689728,0.806798,1.157007,3.513957,1.434074,16.6555,0.0,0.532329,17.18783,0.280035,0.841594,3.718856,3.332171,1.578606,9.751262,1.133712,1.133712,26.044405,195.936018,221.980422,37.619947,5.925213,1.259249,44.80441,AOIs_EMERALD.xls,EMERALD
53,Buffer100ftWBFL,329711.737088,9185242.0,1537.48904,422.0289,4.206098,251.542388,11.999851,23.985556,18.186248,0.0,80.736935,2.606202,0.852307,84.195444,4.571723,14.134738,20.299655,32.339444,0.0,71.345559,82.485766,82.485766,473.851586,1069.581563,1543.433149,117.503155,400.246463,110.974176,628.723793,AOIs_BOMOSEEN.xls,BOMOSEEN


## Merge relevant columns from the chem data and population datasets to survey data
For now I will just take the 'LakeID', 'Lat', 'Long' and 'Town' columns from the chem dataset. For later use, I will also extract the measurements once I know what year the land use survey was done. I will need to fix the LakeIDs in the combined_tables_df dataframe so that they match the LakeIDs in the chem dataset. This is the column that I will join on.

In [31]:
# load the chem data file and preview five random rows
chem_data_df = pd.read_csv(chem_data_file_path)
chem_data_df.sample(5)

Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode
164759,METCALF,1,Pelagic,44.72883,-72.8833,Fletcher,Laymon,1985-08-16 00:00:00,1,930.0,Hose,6.0,Reg,Chla,,4.87,Y,,,
246760,SUNSET (BRKFLD),1,Pelagic,44.0436,-72.6039,Brookfield,Laymon,1989-08-30 00:00:00,1,1725.0,Hose,4.8,R1,Chla,,9.4,Y,,,
167579,MILES,1,Pelagic,44.4472,-71.8139,Concord,LaymonQC,2017-08-30 00:00:00,1,1251.0,Hydrolab,9.1,Reg,DO%,,39.8,Y,,,
72965,ELFIN,1,Pelagic,43.4694,-72.9881,Wallingford,SpringTP,2016-04-13 00:00:00,1,1310.0,PlasticKemm,1.0,Reg,TCl,,7.95,Y,,,
230833,SOUTH BAY,1,Pelagic,44.9203,-72.2097,Newport City,LakeAsmt,2011-10-25 00:00:00,1,1135.0,BottleGrab,0.2,Reg,TN,,0.4,Y,,,


In [32]:
if use_population:
    # the sheet's header row sits below four leading rows; 'CTC' becomes the index
    population = pd.read_excel(population_path, skiprows=4, index_col='CTC')
    # keep the town name plus the last 32 columns (presumably the most recent
    # yearly counts - TODO confirm) so the frame can easily be melted and joined later
    kept_columns = ['NAME', *population.columns[-32:]]
    population = population.loc[:, kept_columns].rename(columns={'NAME': 'Town'})
    # NOTE: inside an `if` block this preview is evaluated but not rendered by Jupyter
    population.sample(5)

In [33]:
# Prepare the chem data for the population join:
#   Town      -> uppercase, to match the town names in the population dataset
#   VisitDate -> parsed to datetime
#   year      -> calendar year of the visit, so population can be joined by (Town, year)
chem_data_df = chem_data_df.assign(
    Town=lambda frame: frame['Town'].str.upper(),
    VisitDate=lambda frame: pd.to_datetime(frame['VisitDate']),
    year=lambda frame: frame['VisitDate'].dt.year,
)

In [34]:
# one row per distinct (LakeID, Lat, Long, Town) combination
# NOTE(review): a LakeID with more than one distinct Lat/Long/Town combination
# stays as several rows here and would duplicate survey rows in the merge on
# LakeID below - TODO confirm each LakeID appears only once.
chem_to_add = chem_data_df[['LakeID', 'Lat', 'Long', 'Town']].drop_duplicates()

# get all lake ids that are also in the land use survey and fix them:
# the survey LakeID is compared against the chem LakeID with parentheses,
# whitespace and semicolons stripped; on a match the survey row takes the
# original chem spelling so the two frames can be joined on LakeID.
# Missing LakeIDs are skipped; iteration stays in sorted order because a later
# id can overwrite rows an earlier one already set.
for lake_id in sorted(chem_to_add['LakeID'].dropna().unique()):
    # raw string: '\s' in a normal string literal is an invalid escape sequence
    lake_id_fixed = re.sub(r'[()\s;]', '', lake_id)
    index = combined_tables_df[combined_tables_df['LakeID'] == lake_id_fixed].index
    combined_tables_df.loc[index, 'LakeID'] = lake_id


In [35]:
# merge with the chem_to_add data frame
# (left join: every survey row is kept; lakes without a chem match get NaN Lat/Long/Town)
new_data_df = combined_tables_df.merge(chem_to_add, how='left', on='LakeID')

# reorder columns so the identifying columns come first
cols = list(new_data_df.columns)

to_front = ['LakeID', 'Description', 'Lat', 'Long', 'Town']
for col in to_front:
    # raises ValueError if an expected column is missing from the merged frame
    cols.remove(col)

cols = to_front + cols
new_data_df = new_data_df.reindex(columns=cols)

# preview the first five rows
new_data_df.head(5)

Unnamed: 0,LakeID,Description,Lat,Long,Town,Shape_Length,Shape_Area,TREE_CANOPY_acres,GRASS_SHRUBS_acres,BARE_SOIL_acres,WATER_acres,BUILDINGS_acres,ROADS_acres,OTHER_PAVED_acres,RAILROADS_acres,Ag_Hay_acres,Ag_Crops_acres,Ag_Pasture_acres,Ag_Total_acres,Imp_Bare_Soil_acres,Imp_Buildings_acres,Imp_Other_Paved_acres,Imp_Road_acres,Imp_Railroad_acres,Imp_Total_acres,Shrub_Shrubs_acres,Shrub_Total_acres,TC_Coniferous_acres,TC_Deciduous_acres,TC_Total_acres,Wet_Emergent_acres,Wet_Forested_acres,Wet_Scrub_Shrub_acres,Wet_Total_acres,from_file
0,ABENAKI,Watershed,43.8303,-72.2361,THETFORD,9683.033459,2335672.0,532.331074,39.85606,0.254456,1.075957,0.821377,1.521425,1.297239,0.0,16.7354,0.0,0.0,16.7354,0.309766,0.894497,1.551584,3.145226,0.0,5.901073,0.0,0.0,266.678081,265.987483,532.665564,5.913928,29.521352,0.191924,35.627204,AOIs_ABENAKI.xls
1,ABENAKI,Flowline100ft,43.8303,-72.2361,THETFORD,6359.510779,186453.1,41.212481,4.014035,0.0,0.485129,0.02885,0.233947,0.091244,0.0,0.002157,0.0,0.0,0.002157,0.0,0.030748,0.111193,0.337027,0.0,0.478968,0.0,0.0,19.499594,21.754112,41.253706,3.738571,3.145864,0.0,6.884435,AOIs_ABENAKI.xls
2,ABENAKI,Waterbody100ft,43.8303,-72.2361,THETFORD,5631.483675,84493.26,17.34881,2.912812,0.017421,0.545608,0.021066,0.0,0.031506,0.0,0.0,0.0,0.0,0.0,0.067456,0.022888,0.044621,0.0,0.0,0.134965,0.0,0.0,9.12222,8.232879,17.355099,2.099783,2.956447,0.182035,5.238265,AOIs_ABENAKI.xls
3,ABENAKI,Buffer100ftWBFL,43.8303,-72.2361,THETFORD,11439.865203,265556.5,58.155297,6.164034,0.017421,0.867339,0.049915,0.233947,0.122749,0.0,0.002157,0.0,0.0,0.002157,0.067456,0.053636,0.155814,0.337027,0.0,0.613933,0.0,0.0,28.361753,29.837036,58.198789,4.854378,5.758748,0.182035,10.795161,AOIs_ABENAKI.xls
4,ABENAKI,Buffer250ftWaterbody,43.8303,-72.2361,THETFORD,5838.954038,212589.7,44.545557,5.968327,0.017421,0.683245,0.293499,0.782087,0.23685,0.0,0.0,0.0,0.0,0.0,0.067456,0.316489,0.349577,1.192506,0.0,1.926027,0.0,0.0,21.194604,23.420545,44.615149,3.266376,7.531832,0.191924,10.990132,AOIs_ABENAKI.xls


In [36]:
# The land use survey was conducted from 2013 to 2016, so attach each town's
# 2016 population to the survey rows, joining on the town name.
if use_population:
    pop2016 = population[['Town', 2016]].rename(columns={2016: '2016_population'})
    new_data_df = pd.merge(new_data_df, pop2016, how='left', on='Town')
    print('Final survey data DataFrame shape: ', new_data_df.shape)
    # NOTE: inside an `if` block this preview is evaluated but not rendered by Jupyter
    new_data_df.sample(5)

Final survey data DataFrame shape:  (773, 36)


## Save the survey dataframe
Let's save the resulting dataframe as a .csv for later use.

In [37]:
# save the survey data frame as a .csv
# (index=False is not passed, so the row index is written as the first column)
new_data_df.to_csv(survay_save_path)

## Merge the population dataset, the chem data dataset and the characteristics dataset
Here we will take the population dataset and melt it so that the years are also in the rows, so that we can merge on both town and year.

In [38]:
# melt the population dataframe: one row per (Town, year) with the count in 'population'
if use_population:
    population_1 = population.melt(id_vars='Town', var_name='year', value_name='population')
    # left join keeps every chem row; rows without a matching (Town, year) get NaN population
    chem_data_df = chem_data_df.merge(population_1, how='left', on=['Town', 'year'])
    # NOTE: inside an `if` block this preview is evaluated but not rendered by Jupyter
    chem_data_df.sample(5)

In [39]:
# Now load the characteristics table so that each measurement can be given a
# readable name, unit and sample fraction
characteristics = pd.read_csv(characteristics_path)
# replace the file's own column names with the names used for the merge below
characteristics.columns = ['CharacteristicID', 'CharacteristicName', 'UnitCode', 'SampleFraction']
characteristics.head()

Unnamed: 0,CharacteristicID,CharacteristicName,UnitCode,SampleFraction
0,AshFreeDryMass,Ash Free Dry Mass,mg,
1,BOD5,"Biological Oxygen Demand, 5 day",mg/l,
2,BottomDepth,Bottom depth,m,
3,BottomSecchi,Bottom secchi,,
4,CBOD5,"Carbonaceous Biological Oxygen Demand, 5 day",mg/l,


In [40]:
# we are going to merge on CharacteristicID, so to make sure that the IDs are
# spelled the same across dataframes we make all of them uppercase in both
characteristics['CharacteristicID'] = characteristics['CharacteristicID'].str.upper()
chem_data_df['CharacteristicID'] = chem_data_df['CharacteristicID'].str.upper()

In [41]:
# Merge characteristics df
# (left join: chem rows whose CharacteristicID has no match keep NaN in the added columns)
chem_data_df_final = chem_data_df.merge(characteristics, how='left', on='CharacteristicID')
print('Final chem data Data frame shape: ', chem_data_df_final.shape)
chem_data_df_final.sample(5)

Final chem data Data frame shape:  (284912, 25)


Unnamed: 0,LakeID,LakeStationNo,LakeStationType,Lat,Long,Town,ProjectID,VisitDate,VisitNumber,StartTime,CollectionMethodID,Depth,ActivityCategory,CharacteristicID,Symbol,Result,Calcs,ProjRemark,RemarkCode,DepthStratumCode,year,population,CharacteristicName,UnitCode,SampleFraction
149832,LITTLE ROCK,1,Pelagic,43.4,-72.9567,WALLINGFORD,AcidLake,1989-07-17,1,1250.0,PlasticKemm,12.0,Reg,DAL,,59.0,Y,,,H,1989,2158.0,Dissolved Aluminum,ug/l,Dissolved
148152,LITTLE AVERILL,1,Pelagic,44.95165,-71.71435,AVERILL,SpringTP,2018-05-23,1,1015.0,PlasticKemm,1.0,Reg,DIC,,2.042,Y,,,,2018,25.0,Dissolved Inorganic Carbon,mg/l,Dissolved
282355,WOODWARD,1,Pelagic,43.565,-72.7597,PLYMOUTH,LakeAsmt,1998-08-24,1,1028.0,Hydrolab,8.0,Reg,DO%,,27.3,Y,,,,1998,534.0,Dissolved Oxygen Saturation,%,
151349,LONG (GRNSBO),1,Pelagic,44.62591,-72.26272,GREENSBORO,LakeAsmt,2018-07-12,1,1448.0,Hydrolab,6.03,Reg,PH,,6.8,Y,DS5,,,2018,706.0,pH,,
36292,CARMI,1,Pelagic,44.97393,-72.87549,FRANKLIN,TMDL,2018-08-23,1,1050.0,BottleGrab,0.2,Reg,TMN,,118.46,Y,,,,2018,1434.0,Total Manganese,ug/l,Total


In [42]:
# Words that identify the measured quantity; to_order moves the first of these
# found in a characteristic name to the front so names sort by quantity.
# Entries containing spaces ('E. Coli Bacteria', 'Tannin in water') can never
# equal a single whitespace-separated word, so they currently have no effect.
_list = ['Oxygen', 'secchi', 'Chlorophyll', 'Conductivity', 'Silver',
         'Cadmium', 'Chloride', 'Cobalt', 'Chromium', 'Copper', 'Iron', 'Mercury',
         'Carbon', 'Hardness', 'Potassium', 'Magnesium', 'Manganese', 'Molybdenum', 
         'Aluminum', 'Arsenic', 'Barium', 'Beryllium', 'Carbon', 'Calcium',
         'Sodium', 'Nickel', 'Nitrogen', 'Carbon', 'Phosphorus', 'Lead', 'Antimony', 
         'Selenium', 'Silica', 'Sulfate', 'Strontium', 'Thallium', 'Uranium', 'Vanadium', 
         'Zinc', 'E. Coli Bacteria', 'Alkalinity', 
         'pH', 'Secchi', 'Color', 'Tannin in water', 'Solids', 'Temperature', 'Fluoride', 
         'Coliform', 'Tin', 'Turbidity']

def to_order(x):
    """Build a sort key for a characteristic name.

    The name is split on whitespace; the first word that appears in ``_list``
    is moved to the front and the words are re-joined with single spaces
    (e.g. ``'Dissolved Aluminum'`` -> ``'Aluminum Dissolved'``). Names without
    any listed word are returned with their whitespace normalised.

    Parameters
    ----------
    x : str or missing value
        Characteristic name. Non-string values (e.g. NaN for measurements with
        no matching characteristic) are returned unchanged instead of raising.

    Returns
    -------
    str or the original non-string value
    """
    if not isinstance(x, str):
        return x
    words = x.split()
    for position, word in enumerate(words):
        if word in _list:
            # build a new list instead of removing from the one being iterated
            return ' '.join([word] + words[:position] + words[position + 1:])
    return ' '.join(words)

# lift the row limit so the whole characteristics listing below is shown
pd.set_option('display.max_rows', None)
# one row per distinct (CharacteristicID, CharacteristicName, UnitCode) in the merged chem data
chars = chem_data_df_final[['CharacteristicID', 'CharacteristicName', 'UnitCode']].copy().drop_duplicates()

# sort by the key from to_order, then drop the helper column
chars['for_sorting'] = chars['CharacteristicName'].apply(to_order)
chars = chars.sort_values(by='for_sorting').drop('for_sorting', axis=1).reset_index(drop=True)
chars

Unnamed: 0,CharacteristicID,CharacteristicName,UnitCode
0,REGALK,Alkalinity,mg/l
1,GRANALK,Alkalinity measured using Gran Alkalinity,mg/l
2,DAL,Dissolved Aluminum,ug/l
3,IMAL,Inorganic Monomeric Aluminum,ug/l
4,MAL,Monomeric Aluminum,ug/l
5,OMAL,Organic Monomeric Aluminum,ug/l
6,TAL,Total Aluminum,ug/l
7,TSB,Total Antimony,ug/l
8,TAS,Total Arsenic,ug/l
9,TBA,Total Barium,ug/l


In [43]:
# limit displayed rows again after the full listing above
# NOTE: this sets the limit to 10 rather than restoring pandas' default of 60
pd.set_option('display.max_rows', 10)

## Combine Relevant Measures


## Save the chem data DataFrame
Let's save the resulting dataframe as a .csv for later use.

In [44]:
# save the merged chem data as a .csv
# (index=False is not passed, so the row index is written as the first column)
chem_data_df_final.to_csv(chem_data_save_path)

## Documentation

In [45]:
%load_ext watermark
%watermark --iversions

sys   : 3.9.9 | packaged by conda-forge | (main, Dec 20 2021, 02:36:06) [MSC v.1929 64 bit (AMD64)]
re    : 2.2.1
numpy : 1.21.5
pandas: 1.3.4

