<a href="https://colab.research.google.com/github/webb-e/S2_Landsat_Comparison/blob/main/MLmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

## import libraries
# Data handling
import numpy as np
import pandas as pd
import glob
# Plotting
import plotnine as pn
# Modelling: data splitting / hyper-parameter search, metrics, estimator, model inspection
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance, partial_dependence


Mounted at /content/drive


In [None]:
#### load the analysis table and list the columns it provides
data_path = 'Landsat_analysis_data.csv'
df = pd.read_csv(data_path)

print(df.columns.tolist())

['year', 'S2mean', 'S2min', 'S2max', 'S2std', 'lake_id', 'area_km2', 'perim_km', 'n_lakes', 'region', 'shoreline_complexity', 'S2cv', 'Landsat_Pekel', 'Landsat_Pickens', 'cloudiness', 'Pickens_error_abs', 'Pekel_error_abs', 'Pickens_error_per', 'Pekel_error_per', 'proportion_of_max']


In [None]:
# Sum the S2 maximum area of all lakes for every (region, year) pair.
df_sums = (df.groupby(['region', 'year'], as_index=False)
             .agg(S2_total=('S2max', 'sum'))
          )

# Rank the years *within each region* (1 = smallest total) and express each
# yearly total as a fraction of that region's largest yearly total.
# Ranking across all rows at once would compare different regions with each
# other and produce ranks beyond the six handled by the wet/dry map below
# (those rows would silently become NaN).
region_totals = df_sums.groupby('region')['S2_total']
df_sums['rankyear'] = region_totals.rank(method='dense').astype(int).astype(str)
df_sums['prop_max'] = df_sums['S2_total'] / region_totals.transform('max')

# Rank -> class lookup: the three lowest-ranked years are 'dry', the next
# three are 'wet'. Ranks outside 1-6 map to NaN.
# NOTE(review): this assumes six distinct yearly totals per region - confirm.
highlow_by_rank = {
    '1': 'dry', '2': 'dry', '3': 'dry',
    '4': 'wet', '5': 'wet', '6': 'wet'
}

# Add lake-wise data and high/low year classifications
lakewise_dat = (df.merge(df_sums, on=['region', 'year'], how='left')
                 .assign(highlow=lambda x: x['rankyear'].map(highlow_by_rank))
              )

# Melt to long format: one row per lake, year and '_per' error column
lakewise_dat = lakewise_dat.melt(
    id_vars=['lake_id', 'region', 'year', 'highlow'],
    value_vars=[col for col in df.columns if col.endswith('_per')],
    var_name='product', value_name='difference'
)


def _wet_minus_dry(lakewise, product):
    """Return per-lake mean error in wet and dry years and their difference.

    Parameters
    ----------
    lakewise : pandas.DataFrame
        Long-format table with columns 'lake_id', 'highlow', 'product' and
        'difference'.
    product : str
        Value of the 'product' column to keep (e.g. 'Pekel_error_per').

    Returns
    -------
    pandas.DataFrame
        One row per lake with 'lake_id', 'avg_diff_dry', 'avg_diff_wet' and
        'diff_wet_minus_dry' (NaN when a lake lacks wet or dry years).
    """
    averages = (lakewise[lakewise['product'] == product]
                .groupby(['lake_id', 'highlow'])
                .agg(avg_diff=('difference', 'mean'))
                .reset_index()
               )
    # Pivot so that wet and dry averages sit side by side for each lake.
    wide = (averages.pivot(index='lake_id', columns='highlow', values='avg_diff')
                    .reset_index()
                    .rename(columns={'wet': 'avg_diff_wet', 'dry': 'avg_diff_dry'})
           )
    wide['diff_wet_minus_dry'] = wide['avg_diff_wet'] - wide['avg_diff_dry']
    return wide


def _attach_lake_attributes(diff_df, lake_attrs, cloudiness):
    """Join lake attributes and mean cloudiness onto the wet-minus-dry table.

    Parameters
    ----------
    diff_df : pandas.DataFrame
        Output of ``_wet_minus_dry``.
    lake_attrs : pandas.DataFrame
        De-duplicated lake attributes keyed by 'lake_id'.
    cloudiness : pandas.DataFrame
        Mean cloudiness keyed by 'lake_id'.

    Returns
    -------
    pandas.DataFrame
        'lake_id', 'difference' (renamed from 'diff_wet_minus_dry') and the
        joined attribute columns.
    """
    return (diff_df[['lake_id', 'diff_wet_minus_dry']]
            .merge(lake_attrs, on='lake_id', how='left')
            .merge(cloudiness, on='lake_id', how='left')
            .rename(columns={'diff_wet_minus_dry': 'difference'})
           )


# Wet-minus-dry difference of the mean '_per' error, per lake and product
diff_df_pick = _wet_minus_dry(lakewise_dat, 'Pickens_error_per')
diff_df_pekel = _wet_minus_dry(lakewise_dat, 'Pekel_error_per')

# Calculate mean cloudiness for each lake
mean_cloudiness = df.groupby('lake_id', as_index=False)['cloudiness'].mean()

# Lake attributes, de-duplicated across the yearly rows.
# NOTE(review): if any of these attributes differs between years for the same
# lake, that lake appears more than once after the merge - confirm they are constant.
lake_attributes = (df[['lake_id', 'area_km2', 'perim_km', 'region', 'shoreline_complexity']]
                   .drop_duplicates())

# Final modelling tables: response ('difference') plus predictors, per product
diff_lakes_pick = _attach_lake_attributes(diff_df_pick, lake_attributes, mean_cloudiness)
diff_lakes_pekel = _attach_lake_attributes(diff_df_pekel, lake_attributes, mean_cloudiness)

In [None]:
# Preview the first rows of the Pekel table (response 'difference' plus lake attributes).
diff_lakes_pekel.head()

Unnamed: 0,lake_id,difference,area_km2,perim_km,region,shoreline_complexity,cloudiness
0,ANDaaaa00,0.204679,0.020002,1.128,AND,2.698619,0.561955
1,ANDaaaa01,-0.003443,0.009566,0.454,AND,1.625365,0.564168
2,ANDaaaa02,0.019563,0.021687,0.843,AND,1.929586,0.560021
3,ANDaaaa03,0.258332,0.011903,0.632,AND,2.007874,0.577849
4,ANDaaaa04,0.475275,0.004243,0.304,AND,1.697106,0.564482


In [None]:
# Response is the wet-minus-dry 'difference'; both products share the same predictor columns.
predictor_cols = ['area_km2', 'perim_km', 'region', 'shoreline_complexity', 'cloudiness']

x_pekel, y_pekel = diff_lakes_pekel[predictor_cols], diff_lakes_pekel['difference']
x_pickens, y_pickens = diff_lakes_pick[predictor_cols], diff_lakes_pick['difference']

In [None]:
### 80/20 train/test split for each product;
### stratifying on region keeps the regional proportions equal in both parts
split_opts = dict(test_size=0.20, random_state=10)

pekel_train, pekel_test, pekel_Ytrain, pekel_Ytest = train_test_split(
    x_pekel, y_pekel, stratify=x_pekel['region'], **split_opts
)
pickens_train, pickens_test, pickens_Ytrain, pickens_Ytest = train_test_split(
    x_pickens, y_pickens, stratify=x_pickens['region'], **split_opts
)

# Report (rows, columns) of the train and test feature tables
for train_part, test_part in ((pekel_train, pekel_test), (pickens_train, pickens_test)):
    print(train_part.shape, test_part.shape)


(8673, 5) (2169, 5)
(8673, 5) (2169, 5)


## Model fitting

### Pekel Model

In [None]:
##### first do a large search for parameters
# define the grid of values to search
gridpek = {
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_depth': [10, 50, 90, 120],
    'max_bins': [10, 20, 40, 60, 80, 100],
    'max_leaf_nodes': [None],
}

# Use a smaller subset of the training data to keep the search affordable
subset_size = int(len(pekel_train) * 0.5)  # Use 50% of the data
pekel_train_subset = pekel_train.sample(n=subset_size, random_state=42)
# Pick the targets by the sampled row labels so features and targets are
# guaranteed to stay aligned (instead of relying on two separate .sample
# calls happening to draw the same rows).
pekel_Ytrain_subset = pekel_Ytrain.loc[pekel_train_subset.index]

# 10-fold cross-validated search scored by (negative) mean squared error;
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_search_pek = GridSearchCV(estimator=HistGradientBoostingRegressor(categorical_features=['region']),
                               param_grid=gridpek, n_jobs=-1, cv=10,
                               scoring='neg_mean_squared_error')

grid_result_pek = grid_search_pek.fit(pekel_train_subset, pekel_Ytrain_subset)

# summarize the best configuration
print(grid_result_pek.best_params_)
# Recorded from a previous run:
#{'learning_rate': 0.001, 'max_bins': 100, 'max_depth': 50, 'max_leaf_nodes': None}



{'learning_rate': 0.001, 'max_bins': 100, 'max_depth': 50, 'max_leaf_nodes': None}


In [None]:
#### now refine grid search values around the best coarse-search result
gridpek = {
    'learning_rate': [0.0005, 0.0008, 0.001, 0.002, 0.005],
    'max_depth': [25, 40, 50, 60, 75],
    'max_bins': [100, 255],
    'max_leaf_nodes': [None],
}

# Use a smaller subset of the training data to keep the search affordable
subset_size = int(len(pekel_train) * 0.4)  # Use 40% of the data
pekel_train_subset = pekel_train.sample(n=subset_size, random_state=42)
# Pick the targets by the sampled row labels so features and targets are
# guaranteed to stay aligned (instead of relying on two separate .sample
# calls happening to draw the same rows).
pekel_Ytrain_subset = pekel_Ytrain.loc[pekel_train_subset.index]

# 10-fold cross-validated search scored by (negative) mean squared error;
# n_jobs=-1 runs the candidate fits in parallel on all available cores.
grid_search_pek = GridSearchCV(estimator=HistGradientBoostingRegressor(categorical_features=['region']),
                               param_grid=gridpek, n_jobs=-1, cv=10,
                               scoring='neg_mean_squared_error')

grid_result_pek = grid_search_pek.fit(pekel_train_subset, pekel_Ytrain_subset)

# summarize the best configuration
print(grid_result_pek.best_params_)
# Recorded from a previous run:
#{'learning_rate': 0.0005, 'max_bins': 100, 'max_depth': 25, 'max_leaf_nodes': None}



{'learning_rate': 0.0005, 'max_bins': 100, 'max_depth': 25, 'max_leaf_nodes': None}


In [None]:
# Hyper-parameters hard-coded for the final Pekel model.
# NOTE(review): the learning rate is very small and the number of boosting
# iterations is left at the estimator default - confirm the model can move
# away from a near-constant prediction with these settings.
pekel_params = {
    'categorical_features': ['region'],
    'max_leaf_nodes': None,
    'max_bins': 100,
    'learning_rate': 0.0005,
    'max_depth': 25,
}
pekelmodel = HistGradientBoostingRegressor(**pekel_params)

## fit on the full training split
pekelmodel.fit(pekel_train, pekel_Ytrain)

In [None]:
## evaluate model: predict the response for the held-out Pekel test features
pekel_pred = pekelmodel.predict(pekel_test)


In [None]:
# Score the Pekel model on the held-out test set.

# Calculate Mean Squared Error
mse_pekel = mean_squared_error(pekel_Ytest, pekel_pred)

# Calculate Mean Absolute Error
mae_pekel = mean_absolute_error(pekel_Ytest, pekel_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse_pekel = np.sqrt(mse_pekel)

# Calculate R-squared
r2_pekel = r2_score(pekel_Ytest, pekel_pred)

# Calculate the mean absolute percentage error.
# Targets that are exactly 0 are excluded: dividing by them yields inf and
# makes the mean meaningless. If every target is 0 the result is NaN.
# Targets very close to 0 can still dominate this metric.
y_true_pekel = np.asarray(pekel_Ytest, dtype=float)
y_hat_pekel = np.asarray(pekel_pred, dtype=float)
nonzero_pekel = y_true_pekel != 0
if nonzero_pekel.any():
    percentage_error_pekel = np.mean(
        np.abs((y_true_pekel[nonzero_pekel] - y_hat_pekel[nonzero_pekel])
               / y_true_pekel[nonzero_pekel]) * 100
    )
else:
    percentage_error_pekel = np.nan

# Print the evaluation metrics.
# Built-in round() is used because it works for both NumPy scalars and plain
# Python floats (the latter have no .round() method).
# NOTE(review): the response is a difference of '_per' error columns, so the
# "km2" unit labels below may not be the right unit - confirm.
print("Mean Squared Error (MSE):", round(float(mse_pekel), 2))
print("Mean Absolute Error (MAE):", round(float(mae_pekel), 2), "km2")
print("Root Mean Squared Error (RMSE):", round(float(rmse_pekel), 2), "km2")
print("R-squared (R²):", r2_pekel)
print("Mean Percentage Error (%):", round(float(percentage_error_pekel), 2))

Mean Squared Error (MSE): 259.37
Mean Absolute Error (MAE): 2.28 km2
Root Mean Squared Error (RMSE): 16.11 km2
R-squared (R²): -0.003944636324205719
Mean Percentage Error (%): inf


In [None]:
##### first do a large search for parameters
# define the grid of values to search
gridpick = {
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
    'max_depth': [10, 50, 90, 120],
    'max_bins': [10, 20, 40, 60, 80, 100],
    'max_leaf_nodes': [None],
}

# Use a smaller subset of the training data to keep the search affordable
subset_size = int(len(pickens_train) * 0.4)  # Use 40% of the data
pick_train_subset = pickens_train.sample(n=subset_size, random_state=42)
# Pick the targets by the sampled row labels so features and targets are
# guaranteed to stay aligned (instead of relying on two separate .sample
# calls happening to draw the same rows).
pick_Ytrain_subset = pickens_Ytrain.loc[pick_train_subset.index]

# 10-fold cross-validated search. Scored by (negative) mean squared error so
# the Pickens search uses the same selection criterion as the Pekel search.
grid_search_pick = GridSearchCV(estimator=HistGradientBoostingRegressor(categorical_features=['region']),
                                param_grid=gridpick, n_jobs=-1, cv=10,
                                scoring='neg_mean_squared_error')

grid_search_pick = grid_search_pick.fit(pick_train_subset, pick_Ytrain_subset)

# summarize the best configuration
print(grid_search_pick.best_params_)

# Recorded from an earlier run that used the estimator's default score;
# re-check after re-running with the scoring above:
##{'learning_rate': 0.0001, 'max_bins': 20, 'max_depth': 50, 'max_leaf_nodes': None}

{'learning_rate': 0.0001, 'max_bins': 20, 'max_depth': 50, 'max_leaf_nodes': None}


In [None]:
#### now refine grid search values around the best coarse-search result
gridpick = {
    'learning_rate': [0.0001, 0.0003, 0.0005],
    'max_depth': [10, 50, 90, 120],
    'max_bins': [20],
    'max_leaf_nodes': [None],
}

# Use a smaller subset of the training data to keep the search affordable
subset_size = int(len(pickens_train) * 0.4)  # Use 40% of the data
pick_train_subset = pickens_train.sample(n=subset_size, random_state=42)
# Pick the targets by the sampled row labels so features and targets are
# guaranteed to stay aligned (instead of relying on two separate .sample
# calls happening to draw the same rows).
pick_Ytrain_subset = pickens_Ytrain.loc[pick_train_subset.index]

# 10-fold cross-validated search. Scored by (negative) mean squared error so
# the Pickens search uses the same selection criterion as the Pekel search.
grid_search_pick = GridSearchCV(estimator=HistGradientBoostingRegressor(categorical_features=['region']),
                                param_grid=gridpick, n_jobs=-1, cv=10,
                                scoring='neg_mean_squared_error')

grid_search_pick = grid_search_pick.fit(pick_train_subset, pick_Ytrain_subset)


# summarize the best configuration
print(grid_search_pick.best_params_)
# Recorded from an earlier run that used the estimator's default score;
# re-check after re-running with the scoring above:
#{'learning_rate': 0.0001, 'max_bins': 20, 'max_depth': 50, 'max_leaf_nodes': None}

{'learning_rate': 0.0001, 'max_bins': 20, 'max_depth': 50, 'max_leaf_nodes': None}


In [None]:
# Hyper-parameters hard-coded for the final Pickens model.
# NOTE(review): the learning rate is very small and the number of boosting
# iterations is left at the estimator default - confirm the model can move
# away from a near-constant prediction with these settings.
pickens_params = {
    'categorical_features': ['region'],
    'max_leaf_nodes': None,
    'max_bins': 20,
    'learning_rate': 0.0001,
    'max_depth': 50,
}
pickensmodel = HistGradientBoostingRegressor(**pickens_params)

## fit on the full training split
pickensmodel.fit(pickens_train, pickens_Ytrain)

In [None]:
## evaluate model: predict the response for the held-out Pickens test features
pickens_pred = pickensmodel.predict(pickens_test)


In [None]:
# Score the Pickens model on the held-out test set.

# Calculate Mean Squared Error
mse_pickens = mean_squared_error(pickens_Ytest, pickens_pred)

# Calculate Mean Absolute Error
mae_pickens = mean_absolute_error(pickens_Ytest, pickens_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse_pickens = np.sqrt(mse_pickens)

# Calculate R-squared
r2_pickens = r2_score(pickens_Ytest, pickens_pred)

# Calculate the mean absolute percentage error.
# Targets that are exactly 0 are excluded: dividing by them yields inf and
# makes the mean meaningless. If every target is 0 the result is NaN.
# Targets very close to 0 can still dominate this metric.
y_true_pickens = np.asarray(pickens_Ytest, dtype=float)
y_hat_pickens = np.asarray(pickens_pred, dtype=float)
nonzero_pickens = y_true_pickens != 0
if nonzero_pickens.any():
    percentage_error_pickens = np.mean(
        np.abs((y_true_pickens[nonzero_pickens] - y_hat_pickens[nonzero_pickens])
               / y_true_pickens[nonzero_pickens]) * 100
    )
else:
    percentage_error_pickens = np.nan

# Print the evaluation metrics.
# Built-in round() is used because it works for both NumPy scalars and plain
# Python floats (the latter have no .round() method).
# NOTE(review): the response is a difference of '_per' error columns, so the
# "km2" unit labels below may not be the right unit - confirm.
print("Mean Squared Error (MSE):", round(float(mse_pickens), 2))
print("Mean Absolute Error (MAE):", round(float(mae_pickens), 2), "km2")
print("Root Mean Squared Error (RMSE):", round(float(rmse_pickens), 2), "km2")
print("R-squared (R²):", r2_pickens)
print("Mean Percentage Error (%):", round(float(percentage_error_pickens), 2))

Mean Squared Error (MSE): 242.86
Mean Absolute Error (MAE): 2.11 km2
Root Mean Squared Error (RMSE): 15.58 km2
R-squared (R²): 0.00027838265195356815
Mean Percentage Error (%): inf
