<a href="https://colab.research.google.com/github/webb-e/S2_Landsat_Comparison/blob/main/postprocessing_lakes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## google drive setup: mount Google Drive inside the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

## import libraries
import numpy as np   # percentile computation and NaN values
import pandas as pd  # all tabular reading, reshaping and merging
import math          # sqrt and pi for the shoreline-complexity equation


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Read in data, filter, and put in wide format

In [None]:
region = 'AKCP' ## options = AKCP, YKD, YKF, AND, TUK, MRD

# Load every input table needed for the selected region

## lake area time series for this region
df_string = f'ALPOD/Lakewise_csvs/{region}_lake_areas.csv'
df = pd.read_csv(df_string)

## region-specific fractal dimensions (a single file, not region-suffixed)
fractaldf = pd.read_csv('ALPOD/fractal_dimensions.csv')

## landsat time series for this region
landsat_string = f'ALPOD/Lakewise_csvs/Landsat_lake_areas_{region}.csv'
landsat = pd.read_csv(landsat_string)

## cloudiness for this region
cloud_string = f'ALPOD/Lakewise_csvs/Lakewise_cloudiness_{region}.csv'
clouddf = pd.read_csv(cloud_string)

In [None]:
### get data in the right format

# Create the 'year_week' column by concatenating year and week as strings
df['year_week'] = df['year'].astype(str) + df['week'].astype(str)

# keep rows where area_km2 > 0.001 (the detection limit of Landsat).
# Take an explicit copy: the column assignment below would otherwise write
# into a slice of df, which triggers pandas' SettingWithCopyWarning.
df_filtered = df[df['area_km2'] > 0.001].copy()

# convert from m2 to km2
df_filtered['S2_water_km2'] = df_filtered['S2_water_m2'] / 1000000
df_filtered = df_filtered.drop(['S2_water_m2'], axis=1)

# transform the data to wide format: one row per year_week, one column per lake_id
dflt = df_filtered.pivot(index='year_week', columns='lake_id', values='S2_water_km2')
dflt.head()

Apply a percentile filter: for each lake, values below the 10th percentile or above the 90th percentile are set to NaN.

In [None]:
def percentile_filter(x, lower_percentile=10, upper_percentile=90):
    """Replace values outside a percentile band with NaN.

    Parameters
    ----------
    x : array-like
        Values to filter; converted to a pandas Series. NaN entries are
        ignored when the percentiles are computed and stay NaN in the result.
    lower_percentile : float, default 10
        Values strictly below this percentile are replaced with NaN.
    upper_percentile : float, default 90
        Values strictly above this percentile are replaced with NaN.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``x``; the input is not modified.
        Values equal to either bound are kept.
    """
    # Ensure that x is a Series
    x = pd.Series(x)

    valid = x.dropna()
    if valid.empty:
        # No observations to rank (empty or all-NaN input): there is nothing
        # to filter, so hand back an unchanged copy instead of asking numpy
        # for the percentile of an empty array.
        return x.copy()

    # Compute both bounds in one call on the NaN-free values
    lower_bound, upper_bound = np.percentile(
        valid, [lower_percentile, upper_percentile]
    )

    # Keep values inside [lower_bound, upper_bound]; everything else
    # (including values that were already NaN) comes out as NaN
    return x.where(x.between(lower_bound, upper_bound))

# Run the percentile filter on every column (one column per lake)
filtered_df = dflt.apply(percentile_filter, axis=0, lower_percentile=10, upper_percentile=90)

# Rebuild a plain frame from the filtered values, turn year_week back into a
# column, and replace it with a 'year' column holding its first four characters
df_percent = pd.DataFrame(
    data=filtered_df.values,
    index=dflt.index,
    columns=dflt.columns.values,
).reset_index()
df_percent['year'] = df_percent['year_week'].str[:4]
df_percent = df_percent.drop(columns=['year_week'])
df_percent.head()

Get the annual lake-wise surface water mean, min, max, and standard deviation.

In [None]:
## per-year mean/min/max/std of every lake column; the result has two-level columns (lake, statistic)
stats = df_percent.groupby('year').agg(['mean', 'min', 'max', 'std'])
## change from multi-level to the format we want: stack the lake level into the rows, then
## reset the index to turn year and the lake identifier into columns. The lake level has no
## name, so its column comes out as 'level_1' (the key used by the later merge).
## DataFrame.reset_index() has no `name` keyword (that is Series-only), so none is passed.
df_stacked = stats.stack(level=0).reset_index()


Calculate shoreline complexity, using the equation from Seekell et al., 2022

In [None]:
## collapse the filtered time series to one row per lake (first non-null value of each column)
picked_df = df_filtered.groupby('lake_id').first().reset_index()
## discard the time-varying columns so only per-lake properties remain
picked_df = picked_df.drop(columns=['year_week', 'S2_water_km2', 'week', 'year'])

## left-join the fractal-dimension table on the region name
df_merged = picked_df.merge(fractaldf, how='left', left_on='region', right_on='Region')
df_merged = df_merged.drop(columns=['Region', 'Intercept', 'R-squared', 'P-value', 'Std Error'])

## shoreline complexity = perimeter / (2 * sqrt(pi) * area ** (D / 2)), with D the fractal dimension
half_dimension = df_merged['Fractal Dimension'] / 2
denominator = 2 * math.sqrt(math.pi) * df_merged['area_km2'] ** half_dimension
df_merged['shoreline_complexity'] = df_merged['perim_km'] / denominator


Combine with lake-wise properties from the original dataframe and with the Landsat dataframe


In [None]:
### left-join the per-lake properties (including shoreline complexity) onto the yearly lake stats
df_fin = df_stacked.merge(df_merged, how='left', left_on='level_1', right_on='lake_id')
df_fin = df_fin.drop(columns=['level_1', 'Fractal Dimension'])

## give the statistic columns their S2-prefixed names
df_fin = df_fin.rename(columns={'mean': 'S2mean', 'max': 'S2max', 'min': 'S2min', 'std': 'S2std'})

## coefficient of variation = standard deviation / mean
df_fin['S2cv'] = df_fin['S2std'].div(df_fin['S2mean'])

## year was sliced out of a string; cast to int so it can be used as a merge key
df_fin['year'] = df_fin['year'].astype(int)

Get landsat dataframe into shape and then combine with other data

In [None]:
### get landsat dataframe to not have repeated lake_id/year combinations:
### within each lake_id/year group, fill gaps forward then backward so that
### partially-filled rows of the same group become identical, then drop the duplicates.
### The fill runs only on the non-key columns via groupby().ffill()/bfill() rather than
### groupby().apply(): apply() acting on the grouping columns is deprecated in recent
### pandas, and once they are excluded lake_id/year would be lost before the merge below.
group_keys = ['lake_id', 'year']
value_cols = [col for col in landsat.columns if col not in group_keys]

## rows with a missing key belong to no group; a stable sort keeps the within-group order
landsat2 = landsat.dropna(subset=group_keys).sort_values(group_keys, kind='stable')
landsat2[value_cols] = landsat2.groupby(group_keys)[value_cols].ffill()
landsat2[value_cols] = landsat2.groupby(group_keys)[value_cols].bfill()
landsat2 = landsat2.drop_duplicates().reset_index(drop=True)

### convert from m2 to km2
landsat2['Landsat_Pickens'] = landsat2['Landsat_Pickens']/1000000
landsat2['Landsat_Pekel'] = landsat2['Landsat_Pekel']/1000000

### left join keeps every lake/year row of df_fin, with NaN where Landsat has no match
df_pluslandsat= df_fin.merge(landsat2, on=group_keys, how='left')


In [None]:
## attach the cloudiness table with a left join on lake and year
final_df = pd.merge(df_pluslandsat, clouddf, how='left', on=['lake_id', 'year'])
final_df.head()

In [None]:
## error between S2 and Landsat, measured against the S2 annual maximum
s2_max = final_df['S2max']

## signed difference: Landsat minus S2
final_df['Pickens_error_abs'] = final_df['Landsat_Pickens'].sub(s2_max)
final_df['Pekel_error_abs'] = final_df['Landsat_Pekel'].sub(s2_max)

## the same difference expressed as a fraction of the S2 value
final_df['Pickens_error_per'] = final_df['Landsat_Pickens'].sub(s2_max).div(s2_max)
final_df['Pekel_error_per'] = final_df['Landsat_Pekel'].sub(s2_max).div(s2_max)


Save to CSV!

In [None]:
## write the analysis-ready table for this region, leaving out the index column
export_string = f'ALPOD/Lakewise_csvs/analysis_ready_{region}.csv'
final_df.to_csv(export_string, index=False)