In [1]:
# # Google Colab specific
# %cd ~/../content
# !rm -rf openet

# !git clone https://github.com/aetriusgx/openet.git
# %cd openet

# Preparation

In [2]:
from matplotlib.colors import LinearSegmentedColormap
from datetime import datetime, timedelta
from notebook_utils import calculate_metrics, eval_metrics, timeseries_rel, trim_extremes
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import contextily as cx
import geopandas as gpd
import json
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

In [3]:
# Styling Cell
sns.set_theme(context="notebook", style="whitegrid")

SMALL_SIZE = 18
MEDIUM_SIZE = 24
BIGGER_SIZE = 28

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

# Initial version

## Point Analysis

### Historical Data Import

In [4]:
historical = pd.read_csv('../data/monterey_historical.csv', low_memory=False)
historical['time'] = pd.to_datetime(historical['time'])
historical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2367017 entries, 0 to 2367016
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   field_id     object        
 1   crop         int64         
 2   time         datetime64[ns]
 3   actual_et    float64       
 4   actual_eto   float64       
 5   actual_etof  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(1)
memory usage: 108.4+ MB


In [5]:
# Show when the latest historical data is
historical['time'].max()

Timestamp('2024-10-07 00:00:00')

### Forecast Table Build

In [6]:
# Gather current forecast data for the county
forecasting_table = pd.DataFrame()
files = Path(f"../data/forecasts/monterey/").glob("*.csv")

for file in files:
    # splits into [$date, 'forecast.csv']
    parts = str(file.name).split("_")
    data = pd.read_csv(file, low_memory=False)
    data["forecasting_date"] = parts[0]
    forecasting_table = pd.concat([data, forecasting_table], ignore_index=True)

forecasting_table['forecasting_date'] = pd.to_datetime(forecasting_table['forecasting_date'])
forecasting_table['time'] = pd.to_datetime(forecasting_table['time'])
forecasting_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10818960 entries, 0 to 10818959
Data columns (total 7 columns):
 #   Column            Dtype         
---  ------            -----         
 0   field_id          object        
 1   crop              int64         
 2   time              datetime64[ns]
 3   expected_et       float64       
 4   expected_eto      float64       
 5   expected_etof     float64       
 6   forecasting_date  datetime64[ns]
dtypes: datetime64[ns](2), float64(3), int64(1), object(1)
memory usage: 577.8+ MB


In [7]:
# Show when the latest forecast date is
forecasting_table['forecasting_date'].max()

Timestamp('2024-09-30 00:00:00')

### Main Data Table compilation

In [8]:
dt = historical.loc[(historical['time'].dt.year == 2024), :]
dt = dt.merge(forecasting_table, on=['field_id', 'time', 'crop'], how='right').set_index(['forecasting_date', 'field_id', 'crop', 'time']).reset_index()
dt

Unnamed: 0,forecasting_date,field_id,crop,time,actual_et,actual_eto,actual_etof,expected_et,expected_eto,expected_etof
0,2024-09-23,CA_244000,47,2024-01-01,0.718,1.103,0.650,0.718,1.103,0.650
1,2024-09-23,CA_244000,47,2024-01-02,0.668,1.018,0.656,0.668,1.018,0.656
2,2024-09-23,CA_244000,47,2024-01-03,1.403,2.122,0.661,1.403,2.122,0.661
3,2024-09-23,CA_244000,47,2024-01-04,1.698,2.546,0.667,1.698,2.546,0.667
4,2024-09-23,CA_244000,47,2024-01-05,1.027,1.528,0.672,1.027,1.528,0.672
...,...,...,...,...,...,...,...,...,...,...
10818955,2024-01-29,CA_420924,68,2024-12-27,,,,0.635,1.316,0.581
10818956,2024-01-29,CA_420924,68,2024-12-28,,,,0.674,1.180,0.580
10818957,2024-01-29,CA_420924,68,2024-12-29,,,,0.671,1.076,0.576
10818958,2024-01-29,CA_420924,68,2024-12-30,,,,0.705,1.074,0.574


#### Add geographical data

In [9]:
# Add additional data to the data table
monterey_points = pd.read_csv("../data/Monterey.csv", low_memory=False).set_index("OPENET_ID").rename_axis("field_id")

# Expand .geo column into lon, lat columns
monterey_geo = (monterey_points[".geo"]
                .apply(lambda x: pd.Series(dict(json.loads(x))))['coordinates']
                .apply(lambda x: pd.Series(list(x), index=['longitude', 'latitude'])))
monterey_geo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 739 entries, CA_253578 to CA_251078
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   longitude  739 non-null    float64
 1   latitude   739 non-null    float64
dtypes: float64(2)
memory usage: 17.3+ KB


In [10]:
monterey_points['CROP_2023'].value_counts()

CROP_2023
47     513
69     136
61      74
72       5
37       4
122      2
211      2
215      2
68       1
Name: count, dtype: int64

In [11]:
dt = dt.join(monterey_geo, how="left", on=["field_id"], validate="many_to_one")
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10818960 entries, 0 to 10818959
Data columns (total 12 columns):
 #   Column            Dtype         
---  ------            -----         
 0   forecasting_date  datetime64[ns]
 1   field_id          object        
 2   crop              int64         
 3   time              datetime64[ns]
 4   actual_et         float64       
 5   actual_eto        float64       
 6   actual_etof       float64       
 7   expected_et       float64       
 8   expected_eto      float64       
 9   expected_etof     float64       
 10  longitude         float64       
 11  latitude          float64       
dtypes: datetime64[ns](2), float64(8), int64(1), object(1)
memory usage: 990.5+ MB


### Add USDA crop data

In [12]:
# Add crop data
cdl_codes = pd.read_csv("../data/cdl_codes.csv", low_memory=False).set_index("Codes")

dt = dt.join(cdl_codes, how="left", on="crop", validate="many_to_many")
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10818960 entries, 0 to 10818959
Data columns (total 19 columns):
 #   Column            Dtype         
---  ------            -----         
 0   forecasting_date  datetime64[ns]
 1   field_id          object        
 2   crop              int64         
 3   time              datetime64[ns]
 4   actual_et         float64       
 5   actual_eto        float64       
 6   actual_etof       float64       
 7   expected_et       float64       
 8   expected_eto      float64       
 9   expected_etof     float64       
 10  longitude         float64       
 11  latitude          float64       
 12  Class_Names       object        
 13  ESRI_Red          int64         
 14  ESRI_Green        int64         
 15  ESRI_Blue         int64         
 16  Erdas_Red         float64       
 17  Erdas_Green       float64       
 18  Erdas_Blue        float64       
dtypes: datetime64[ns](2), float64(11), int64(4), object(2)
memory usage: 1.5+ GB


#### Add Field Metadata

In [13]:
# Import table using only field and hectare columns
field_metadata = pd.read_json('../data/geo/field_metadata.json')[['field_id', 'hectares']]
# Reformat field IDs to be same convention as the other tables.
field_metadata['field_id'] = 'CA_' + field_metadata['field_id'].astype(str).str[1:]
field_metadata = field_metadata.set_index('field_id')
field_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 729 entries, CA_244000 to CA_258026
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   hectares  729 non-null    float64
dtypes: float64(1)
memory usage: 11.4+ KB


In [14]:
dt = dt.join(field_metadata, how='left', on='field_id', validate='many_to_one')
dt.info()
dt.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10818960 entries, 0 to 10818959
Data columns (total 20 columns):
 #   Column            Dtype         
---  ------            -----         
 0   forecasting_date  datetime64[ns]
 1   field_id          object        
 2   crop              int64         
 3   time              datetime64[ns]
 4   actual_et         float64       
 5   actual_eto        float64       
 6   actual_etof       float64       
 7   expected_et       float64       
 8   expected_eto      float64       
 9   expected_etof     float64       
 10  longitude         float64       
 11  latitude          float64       
 12  Class_Names       object        
 13  ESRI_Red          int64         
 14  ESRI_Green        int64         
 15  ESRI_Blue         int64         
 16  Erdas_Red         float64       
 17  Erdas_Green       float64       
 18  Erdas_Blue        float64       
 19  hectares          float64       
dtypes: datetime64[ns](2), float64(12), int64(4),

Unnamed: 0,forecasting_date,field_id,crop,time,actual_et,actual_eto,actual_etof,expected_et,expected_eto,expected_etof,longitude,latitude,Class_Names,ESRI_Red,ESRI_Green,ESRI_Blue,Erdas_Red,Erdas_Green,Erdas_Blue,hectares
0,2024-09-23,CA_244000,47,2024-01-01,0.718,1.103,0.65,0.718,1.103,0.65,-121.54054,36.537255,Misc Vegs & Fruits,255,102,102,1.0,0.4,0.4,263.816
1,2024-09-23,CA_244000,47,2024-01-02,0.668,1.018,0.656,0.668,1.018,0.656,-121.54054,36.537255,Misc Vegs & Fruits,255,102,102,1.0,0.4,0.4,263.816
2,2024-09-23,CA_244000,47,2024-01-03,1.403,2.122,0.661,1.403,2.122,0.661,-121.54054,36.537255,Misc Vegs & Fruits,255,102,102,1.0,0.4,0.4,263.816
3,2024-09-23,CA_244000,47,2024-01-04,1.698,2.546,0.667,1.698,2.546,0.667,-121.54054,36.537255,Misc Vegs & Fruits,255,102,102,1.0,0.4,0.4,263.816
4,2024-09-23,CA_244000,47,2024-01-05,1.027,1.528,0.672,1.027,1.528,0.672,-121.54054,36.537255,Misc Vegs & Fruits,255,102,102,1.0,0.4,0.4,263.816


#### List of identifiers

In [15]:
forecast_dates = forecasting_table['forecasting_date'].unique()
fields = dt['field_id'].unique()
crops = dt['crop'].unique()

### Reference Tables

The tables below are calculated prior to speed up metric calculations.

#### Average ET/ETo/ETof for 2024
This table will be used as a reference for any metric and plot calculations

In [16]:
avgs_table = pd.read_csv('../data/monterey_historical_2024_avgs.csv', low_memory=False)
avgs_table.head()

Unnamed: 0,field_id,crop,actual_et,actual_eto,actual_etof
0,CA_244000,47,2.703783,3.98068,0.681082
1,CA_244018,47,1.990459,3.448954,0.551285
2,CA_244025,47,2.197922,3.463452,0.658562
3,CA_244035,69,1.352139,3.081879,0.474822
4,CA_244053,47,1.738174,3.448954,0.496288


#### Climatology Reference
The table below records the average conditions for each field for each day of the year.

In [17]:
# Group by field, crop, and doy then calculate the average conditions
climatology_table = pd.read_csv('../data/monterey_historical_climatology.csv', low_memory=False)
climatology_table

Unnamed: 0,field_id,crop,doy,actual_et,actual_eto,actual_etof
0,CA_244000,47,1,0.910333,1.291889,0.691333
1,CA_244000,47,2,0.926556,1.310778,0.696556
2,CA_244000,47,3,0.999778,1.423778,0.701222
3,CA_244000,47,4,1.162000,1.697333,0.691111
4,CA_244000,47,5,1.102111,1.556000,0.694111
...,...,...,...,...,...,...
270469,CA_420924,68,362,0.813125,1.293625,0.643375
270470,CA_420924,68,363,0.781875,1.202875,0.658250
270471,CA_420924,68,364,0.750250,1.146250,0.673250
270472,CA_420924,68,365,0.570625,0.851125,0.690500


### Error Metric Calculation

In [18]:
analysis_end_date = datetime(year=2024, month=10, day=1)

#### Field metrics

Est. run time: 21m

In [None]:
metrics_norm = (dt[(dt['time'] > dt['forecasting_date']) & (dt['time'] < (dt['forecasting_date']) + timedelta(days=7)) & (dt['time'] < analysis_end_date)]
                .groupby(['forecasting_date'])[list(dt.columns)]
                .apply(eval_metrics, normalize=True, climatology_ref=climatology_table, avgs_ref=avgs_table))
metrics_norm.reset_index().to_csv('../data/metrics/monterey_metrics_normalized.csv', index=False)

  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stddev[None, :]
  c /= stddev[:, None]
  c /= stdd

In [None]:
metrics_norm = pd.read_csv('../data/metrics/monterey_metrics_normalized.csv').drop(['level_1'], axis=1)
metrics_norm['forecasting_date'] = pd.to_datetime(metrics_norm['forecasting_date'])

In [None]:
metrics_norm = (metrics_norm.groupby(['forecasting_date', 'variable'])[['field_id', 'crop', 'mae', 'rmse', 'bias', 'corr', 'skill_score']]
                .apply(trim_extremes, cols=['mae', 'rmse', 'bias', 'corr', 'skill_score'], threshold=0.02))
metrics_norm.info()

In [None]:
metrics_norm.head()

#### Crop metrics

In [None]:
crop_metrics = (dt[(dt['time'] > dt['forecasting_date']) & (dt['time'] < (dt['forecasting_date']) + timedelta(days=7)) & (dt['time'] < analysis_end_date)]
                .groupby(by=['forecasting_date', 'crop'])[list(dt.columns)]
                .apply(eval_metrics, normalize=True, climatology_ref=climatology_table, avgs_ref=avgs_table))
crop_metrics.to_csv('../data/metrics/montery_crop_metrics.csv')

In [None]:
crop_metrics = pd.read_csv('../data/metrics/montery_crop_metrics.csv', low_memory=False).drop(['Unnamed: 2', 'crop.1'], axis=1)
crop_metrics['forecasting_date'] = pd.to_datetime(crop_metrics['forecasting_date'])

In [None]:
# crop_metrics = (crop_metrics.groupby(['forecasting_date', 'variable'])[['field_id', 'crop', 'mae', 'rmse', 'bias', 'corr', 'skill_score']]
#                     .apply(trim_extremes, cols=['mae', 'rmse', 'bias', 'corr', 'skill_score'], threshold=0.02))
# crop_metrics.info()

### Visualization

#### Metric Correlation

In [None]:
plt.figure(figsize=(10, 6))
metrics = ['mae', 'rmse', 'bias', 'corr', 'skill_score']
corr_matrix = metrics_norm[metrics].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of Metrics')
plt.savefig('../images/monterey/field_metric_correlation.png')

#### Distribution of Metrics

In [None]:
# First reshape data from wide-form to long-form
metrics_long = metrics_norm.reset_index().melt(id_vars=['forecasting_date', 'field_id', 'crop', 'variable'], value_vars=['mae', 'bias', 'skill_score'], var_name='stat')

In [None]:
# To help with visualization, map proper names to the stats
stat_propers = {
    'mae': 'Mean Absolute Error',
    'rmse': 'Root Mean Absolute Error',
    'bias': 'Mean Forecast Bias',
    'corr': 'Correlation Coefficient',
    'skill_score': 'Skill Score'
}
metrics_long['name'] = metrics_long['stat'].map(stat_propers)
metrics_long.info()

##### Boxplot Distribution

In [None]:
rel = timeseries_rel(data=metrics_long, plot='cat', kind='box',
                y='value', col='name', row='variable',
                width=0.5, errorbar='sd',
                flierprops=dict(markersize=0), title="Boxplot Distribution of Each Metric by Variable",
                export_img='field_boxplots.png', as_percent=True, margin_titles=True,
                row_order=['ET', 'ETo', 'ETof'], title_template={"col_template":"{col_name}", "row_template":"{row_name}"},
                facet_kws={"despine": False}, sharey='col', native_scale=True
               );
rel.axes.flat[2].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[2].set(ylim=(-1.25, 1.25));
rel.axes.flat[2].set_yticks((1, 0.5, 0, -0.5, -1));
rel.savefig('images/monterey/field_boxplots.png')

##### Metric Interquartile Distrbution

In [None]:
rel = timeseries_rel(metrics_long, row='variable', col='name', y='value',
                     errorbar=('pi', 50), estimator=np.median, as_percent=True, aspect=1.5,
                     refline={'y': 0},
                     facet_kws={'sharey': 'col', 'sharex': True, 'margin_titles': True},
                     row_order=['ET', 'ETo', 'ETof'], title_template={"col_template":"{col_name}", "row_template":"{row_name}"},
                     title="Interquartile Field Error Metrics");
rel.axes.flat[2].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[2].set(ylim=(-1.25, 1.25));
rel.axes.flat[2].set_yticks((1, 0.5, 0, -0.5, -1));
rel.savefig('../images/monterey/field_interquartile.png')

#### Crop Metrics

In [None]:
# Reappend class names of crops for better readability in plots.
metrics_long = metrics_long.join(cdl_codes, how="left", on="crop", validate="many_to_many")

In [None]:
# crop_selector = dt.groupby('crop')['hectares'].agg('sum')[:3]
crop_selector = [47, 69]

In [None]:
dt[dt['crop'].isin(crop_selector)].groupby('crop')['field_id'].nunique()

In [None]:
crop_plotter = metrics_long[metrics_long['crop'].isin(crop_selector)]
crop_plotter.head()

##### Probability Distributions

In [None]:
rel = timeseries_rel(crop_plotter[crop_plotter['variable'] == 'ET'], y='value', plot='rel', kind='line',
                        col='Class_Names', row='name',
                        refline={'y': 0},
                        facet_kws={'sharey': 'row', 'sharex': True, 'margin_titles': True}, 
                        estimator=np.median, errorbar=('pi', 50), title_template={"col_template":"{col_name}", "row_template":"{row_name}"},
                        title='25-75th ET Crop Statistics', as_percent=True);
rel.axes.flat[4].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[4].set(ylim=(-1.25, 1.25));
rel.axes.flat[4].set_yticks((1, 0.5, 0, -0.5, -1));
rel.savefig("../images/monterey/et_crop_stats.png")

In [None]:
rel = timeseries_rel(crop_plotter[crop_plotter['variable'] == 'ETo'], y='value', plot='rel', kind='line', errorbar=('pi', 50),
                            col='Class_Names', row='name', facet_kws={'sharey': 'row', 'sharex': True, "margin_titles": True}, 
                            refline={'y': 0},
                            estimator=np.median, export_img="eto_crop_stats", title_template={"col_template":"{col_name}", "row_template":"{row_name}"},
                            title='25-75th ETo Crop Statistics', as_percent=True);
rel.axes.flat[4].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[4].set(ylim=(-1.25, 1.25));
rel.axes.flat[4].set_yticks((1, 0.5, 0, -0.5, -1));
rel.savefig("../images/monterey/eto_crop_stats.png")

In [None]:
rel = timeseries_rel(crop_plotter[crop_plotter['variable'] == 'ETof'], y='value', plot='rel', kind='line', errorbar=('pi', 50),
                            col='Class_Names', row='name', facet_kws={'sharey': 'row', 'sharex': True, 'margin_titles': True},
                            refline={'y': 0},
                            estimator=np.median, export_img="etof_crop_metrics", title_template={"col_template":"{col_name}", "row_template":"{row_name}"},
                            title='25-75th ETof Crop Statistics', as_percent=True);
rel.axes.flat[4].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[4].set(ylim=(-1.25, 1.25));
rel.axes.flat[4].set_yticks((1, 0.5, 0, -0.5, -1));
rel.savefig("../images/monterey/etof_crop_stats.png")

### Alignment

In [None]:
df = dt[(dt['time'] >= dt['forecasting_date']) & (dt['time'] < (dt['forecasting_date']) + timedelta(days=7)) & (dt['time'] < analysis_end_date)]
df = df.sort_values(by=['forecasting_date', 'field_id'])
df.head()

In [None]:
def align_forecast(data: pd.DataFrame, adjustment=0.9):
    recorded_et = data.head(1)['actual_et'].values[0]
    recorded_eto = data.head(1)['actual_eto'].values[0]
    recorded_etof = data.head(1)['actual_etof'].values[0]

    first_et = data.head(1)['expected_et'].values[0]
    first_eto = data.head(1)['expected_eto'].values[0]
    first_etof = data.head(1)['expected_etof'].values[0]
    
    # last_obs: 0.543
    # first_f: 0.750
    # calc = (0.750 - 0.543) * 0.9 = 0.207 * 0.9 = 0.1863
    
    # get the difference of just the first forecasted date and apply it to the rest of the forecasting period
    data['expected_et'] = data['expected_et'] - (first_et - recorded_et)
    data['expected_eto'] = data['expected_eto'] - (first_eto - recorded_eto)
    data['expected_etof'] = data['expected_etof'] - (first_etof - recorded_etof)

    return data[['field_id', 'crop', 'time', 'actual_et', 'actual_eto', 'actual_etof', 'expected_et', 'expected_eto', 'expected_etof', 'Class_Names', 'ESRI_Red', 'ESRI_Green', 'ESRI_Blue', 'Erdas_Red', 'Erdas_Green', 'Erdas_Blue', 'hectares']]

In [None]:
df = df.groupby('forecasting_date')[list(df.columns)].apply(align_forecast).reset_index().drop(columns='level_1')
df.head()

#### Metric Calculation

In [None]:
# alignment_metrics = df.groupby('forecasting_date').apply(eval_metrics, normalize=True, climatology_ref=climatology_table, avgs_ref=avgs_table)
# alignment_metrics.reset_index().to_csv('./data/metrics/monterey_alignment_metrics.csv', index=False)
# alignment_metrics.head()

In [None]:
alignment_metrics = pd.read_csv('../data/metrics/monterey_alignment_metrics.csv', low_memory=False).drop(columns='level_1')
alignment_metrics['forecasting_date'] = pd.to_datetime(alignment_metrics['forecasting_date'])
alignment_metrics.head()

#### Visualization

In [None]:
alignment_metrics_long = alignment_metrics.reset_index().melt(
    id_vars=['forecasting_date', 'field_id', 'crop', 'variable'], value_vars=['mae', 'bias', 'skill_score'], var_name='stat')
alignment_metrics_long.join(cdl_codes, how="left", on="crop", validate="many_to_many")
alignment_metrics_long

In [None]:
rel = timeseries_rel(alignment_metrics_long, row='variable', col='stat', y='value',
                     errorbar=('pi', 50), estimator=np.median, as_percent=True, aspect=1.5,
                     refline={'y': 0},
                     facet_kws={'sharey': 'col', 'sharex': True, 'margin_titles': True},
                     row_order=['ET', 'ETo', 'ETof'], title_template={"col_template":"{col_name}", "row_template":"{row_name}"},
                     title="Post-Alignment Interquartile Accuracy");
rel.axes.flat[2].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[2].set(ylim=(-1.25, 1.25));
rel.axes.flat[2].set_yticks((1, 0.5, 0, -0.5, -1));
rel.axes.flat[5].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[5].set(ylim=(-1.25, 1.25));
rel.axes.flat[5].set_yticks((1, 0.5, 0, -0.5, -1));
rel.axes.flat[8].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[8].set(ylim=(-1.25, 1.25));
rel.axes.flat[8].set_yticks((1, 0.5, 0, -0.5, -1));

#### Alignment Comparison

In [None]:
alignment_metrics_long['aligned'] = True
metrics_long['aligned'] = False

alignment_metrics_cmp = pd.concat([alignment_metrics_long, metrics_long])
alignment_metrics_cmp

In [None]:
rel = timeseries_rel(alignment_metrics_cmp, row='variable', col='stat', y='value', hue='aligned',
                     errorbar='sd', estimator=np.median, as_percent=True, aspect=1.5,
                     facet_kws={'sharey': False, 'sharex': True, 'margin_titles': True},
                     row_order=['ET', 'ETo', 'ETof'], title_template={"col_template":"{col_name}", "row_template":"{row_name}"},
                     title="Alignment Performance Comparison");
rel.axes.flat[2].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[2].set(ylim=(-1.25, 1.25));
rel.axes.flat[2].set_yticks((1, 0.5, 0, -0.5, -1));
rel.axes.flat[5].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[5].set(ylim=(-1.25, 1.25));
rel.axes.flat[5].set_yticks((1, 0.5, 0, -0.5, -1));
rel.axes.flat[8].yaxis.set_major_formatter('{x:.1f}');
rel.axes.flat[8].set(ylim=(-1.25, 1.25));
rel.axes.flat[8].set_yticks((1, 0.5, 0, -0.5, -1));
rel.savefig('../images/monterey/alignment_comparison.png')

### Spatial Plotting

#### Preparation

First merge the geo table created in the beginning with the metrics table.

Afterwards, get the average metrics amongst all forecasts.

In [None]:
# Define date range of seasons
winter = (datetime(year=2024, month=1, day=1), datetime(year=2024, month=3, day=20)) # Start winter at start of forecast range
spring = (datetime(year=2024, month=3, day=21), datetime(year=2024, month=6, day=20))
summer = (datetime(year=2024, month=6, day=21), datetime(year=2024, month=9, day=20))

# Create list of the forecast dates that fit the seasonal ranges
winter_forecasts = pd.Series([x for x in forecast_dates if x >= winter[0] and x <= winter[1]])
spring_forecasts = pd.Series([x for x in forecast_dates if x >= spring[0] and x <= spring[1]])
summer_forecasts = pd.Series([x for x in forecast_dates if x >= summer[0] and x <= summer[1]])

def assign_season(x):
    season = ""
    if winter_forecasts.isin([x['forecasting_date']]).any():
        season = "Winter"
    elif spring_forecasts.isin([x['forecasting_date']]).any():
        season = "Spring"
    elif summer_forecasts.isin([x['forecasting_date']]).any():
        season = "Summer"
    return pd.Series({'season': season})

In [None]:
# Create column that defines a row's season
seasonal_metrics = metrics_norm.reset_index().sort_values(by='forecasting_date').merge(metrics_norm.reset_index().apply(assign_season, axis=1), left_index=True, right_index=True)

# Combine all forecasts for each field by getting mean of forecast metrics.
seasonal_metrics = seasonal_metrics.groupby(['field_id', 'season', 'variable'])[['mae', 'rmse', 'bias', 'corr', 'skill_score']].agg('mean').round(2)

# Add positional columns
seasonal_metrics = seasonal_metrics.join(monterey_geo, how='left', on='field_id', validate='many_to_one')
# Add field size column for marker size
seasonal_metrics = seasonal_metrics.join(field_metadata, how='left', on='field_id', validate='many_to_one')
# Calculate marker size
seasonal_metrics['markersize'] = seasonal_metrics.apply(lambda x: np.max(np.divmod(x['hectares'] / 10, 10)[1], initial=2.0), axis=1)

seasonal_metrics.reset_index(inplace=True)
seasonal_metrics.dropna(inplace=True)
seasonal_metrics

#### Import regional polygon

In [None]:
moco_geo = gpd.read_file("../data/geo/MoCo_Boundary.geojson")
moco_geo

In [None]:
moco_geo.crs

#### FacetGrid Geo Helper

In [None]:
def facet_geoscatter(data, *, boundary_map, col, row=None, hue, palette="YlOrRd", size=8,
                     title, export_img: bool|str=None, height=4, aspect=1.2, double_legend=False,
                     row_order=None, col_order=None, title_template={}, as_percent=True,
                     normalize_cmap=False, background=False):
    g = sns.FacetGrid(data, col=col, row=row, height=height, aspect=aspect, despine=False, row_order=row_order, col_order=col_order)
    for ax in g.axes.flat:
        boundary_map.plot(color="lightgrey", edgecolor='k', alpha=0.3, ax=ax)
        # Add basemap
        if background:
            ax.tick_params(left=False, bottom=False)
            ax.set(xticklabels=[], yticklabels=[], xlabel=None, ylabel=None)
            cx.add_basemap(ax, crs=boundary_map.crs.to_string(), attribution=False)

    # Colorbar config
    norm = None
    if normalize_cmap:
        norm = mcolors.TwoSlopeNorm(vcenter=0, vmin=data[hue].min(), vmax=data[hue].max())
        c_mappable = cm.ScalarMappable(norm=norm, cmap=palette)
        c_mappable.set_array(data[hue])
    else: 
        c_mappable = plt.scatter([], [], c=[], vmin=data[hue].min(), vmax=data[hue].max(), cmap=palette);

    # Plot points
    g.map_dataframe(sns.scatterplot, x="longitude", y="latitude", hue=hue, hue_norm=norm, palette=palette, linewidths=0, size=size);
    g.set(xlabel=None, ylabel=None)
    g.set_titles(**title_template)
    plt.suptitle(title, y=1.02);

    # Add colorbar to right side
    g.figure.subplots_adjust(right=.92)
    cax = g.fig.add_axes([.94, .25, .02, .6])
    g.figure.colorbar(c_mappable, cax=cax)
    if as_percent:
        cax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    if double_legend is not False:
        g.figure.subplots_adjust(right=.90)
        dax = cax.twinx()
        if type(double_legend) is not bool:
            dax.set(ylim=(double_legend.min()['value'], double_legend.max()['value']))

    # Export image
    if type(export_img) is bool and save is True: g.savefig(f'../images/monterey/{title}.png')
    elif type(export_img) is str: g.savefig(f'../images/monterey/{export_img}.png')
    
    return g

In [None]:
plt.rcdefaults()

#### Seasonal Overview

##### ET Visualization

In [None]:
et_seasonal = seasonal_metrics[seasonal_metrics['variable'] == 'ET']

In [None]:
facet_geoscatter(et_seasonal, boundary_map=moco_geo, col='season', hue='skill_score', palette='Spectral', size='markersize',
                 as_percent=False, normalize_cmap=True, export_img="et_spatial_skill_score", 
                 col_order=['Winter', 'Spring', 'Summer'], title_template={"template":"{col_name}"},
                 title="Average ET Skill Score across Monterey County by Season", background=True);

In [None]:
facet_geoscatter(et_seasonal, boundary_map=moco_geo, col='season', hue='corr', export_img="et_spatial_corr", size='markersize',
                     col_order=['Winter', 'Spring', 'Summer'], normalize_cmap=True, title_template={"template":"{col_name}"},
                     title="Average ET Forecast Correlation across Monterey County by Season", palette='Spectral', background=True);

In [None]:
facet_geoscatter(et_seasonal, boundary_map=moco_geo, col='season', hue='mae',export_img="et_spatial_mae", size='markersize',
                     col_order=['Winter', 'Spring', 'Summer'], title_template={"template":"{col_name}"},
                     title="Average ET MAE across Monterey County by Season", background=True);

In [None]:
facet_geoscatter(et_seasonal, boundary_map=moco_geo, col='season', hue='rmse', export_img="et_spatial_rmse", size='markersize',
                     col_order=['Winter', 'Spring', 'Summer'], title_template={"template":"{col_name}"},
                     title="Average ET RMSE across Monterey County by Season", background=True);

In [None]:
facet_geoscatter(et_seasonal, boundary_map=moco_geo, col='season', hue='bias', palette='Spectral', export_img="et_spatial_bias", size='markersize',
                     col_order=['Winter', 'Spring', 'Summer'], normalize_cmap=True, title_template={"template":"{col_name}"},
                     title="Average ET Forecast Bias across Monterey County by Season", background=True);

##### ETo Visualization

In [None]:
eto_seasonal = seasonal_metrics[seasonal_metrics['variable'] == 'ETo']

In [None]:
facet_geoscatter(eto_seasonal, boundary_map=moco_geo, col='season', hue='skill_score', export_img="eto_spatial_skill_score", as_percent=False, size='markersize',
                     col_order=['Winter', 'Spring', 'Summer'], normalize_cmap=True, title_template={"template":"{col_name}"},
                     title="Average ETo Skill Score across Monterey County by Season", palette='Spectral', background=True);

In [None]:
facet_geoscatter(eto_seasonal, boundary_map=moco_geo, col='season', hue='corr', export_img="eto_spatial_corr", size='markersize',
                     col_order=['Winter', 'Spring', 'Summer'], normalize_cmap=True, title_template={"template":"{col_name}"},
                     title="Average ETo Forecast Correlation across Monterey County by Season", palette='Spectral', background=True);

In [None]:
facet_geoscatter(eto_seasonal, boundary_map=moco_geo, col='season', hue='mae',export_img="eto_spatial_mae", size='markersize',
                     col_order=['Winter', 'Spring', 'Summer'], title_template={"template":"{col_name}"},
                     title="Average ETo MAE across Monterey County by Season", background=True);

In [None]:
facet_geoscatter(eto_seasonal, boundary_map=moco_geo, col='season', hue='rmse', export_img="eto_spatial_rmse", size='markersize',
                     col_order=['Winter', 'Spring', 'Summer'], title_template={"template":"{col_name}"},
                     title="Average ETo RMSE across Monterey County by Season", background=True);

In [None]:
facet_geoscatter(eto_seasonal, boundary_map=moco_geo, col='season', hue='bias', palette='Spectral', export_img="eto_spatial_bias", size='markersize',
                     col_order=['Winter', 'Spring', 'Summer'], normalize_cmap=True, title_template={"template":"{col_name}"},
                     title="Average ETo Forecast Bias across Monterey County by Season", background=True);

##### EToF Visualization

In [None]:
etof_seasonal = seasonal_metrics[seasonal_metrics['variable'] == 'ETof']

In [None]:
facet_geoscatter(etof_seasonal, boundary_map=moco_geo, col='season', hue='skill_score', size='markersize', export_img="etof_spatial_skill_score", as_percent=False,
                     col_order=['Winter', 'Spring', 'Summer'], normalize_cmap=True, title_template={"template":"{col_name}"},
                     title="Average EToF Skill Score across Monterey County by Season", palette='Spectral', background=True);

In [None]:
facet_geoscatter(etof_seasonal, boundary_map=moco_geo, col='season', hue='corr', size='markersize', export_img="etof_spatial_corr",
                     col_order=['Winter', 'Spring', 'Summer'], normalize_cmap=True, title_template={"template":"{col_name}"},
                     title="Average EToF Forecast Correlation across Monterey County by Season", palette='Spectral', background=True);

In [None]:
facet_geoscatter(etof_seasonal, boundary_map=moco_geo, col='season', hue='mae', size='markersize', export_img="etof_spatial_mae",
                     col_order=['Winter', 'Spring', 'Summer'], title_template={"template":"{col_name}"},
                     title="Average EToF MAE across Monterey County by Season", background=True);

In [None]:
facet_geoscatter(etof_seasonal, boundary_map=moco_geo, col='season', hue='rmse', size='markersize', export_img="etof_spatial_rmse",
                     col_order=['Winter', 'Spring', 'Summer'], title_template={"template":"{col_name}"},
                     title="Average EToF RMSE across Monterey County by Season", background=True);

In [None]:
facet_geoscatter(etof_seasonal, boundary_map=moco_geo, col='season', hue='bias', size='markersize', palette='Spectral', export_img="etof_spatial_bias",
                     col_order=['Winter', 'Spring', 'Summer'], normalize_cmap=True, title_template={"template":"{col_name}"},
                     title="Average EToF Forecast Bias across Monterey County by Season", background=True);

#### Summer Overview

In [None]:
summer_metrics = seasonal_metrics[seasonal_metrics['season'] == 'Summer'][['field_id', 'variable', 'longitude', 'latitude', 'mae', 'bias', 'skill_score', 'markersize']]
summer_metrics = summer_metrics.melt(id_vars=['field_id', 'variable', 'longitude', 'latitude', 'markersize'], value_vars=['mae', 'bias', 'skill_score'], var_name='stat')
summer_metrics.head()

In [None]:
# Add subtitles into the data to insert into the plots
subtitle_dict = {
    'mae': 'Mean Absolute Error',
    'bias': 'Forecast Bias',
    'skill_score': 'Skill Score'
}
summer_metrics['name'] = summer_metrics['stat'].map(subtitle_dict)

In [None]:
# Keyword for double-sided legend
sm_skill_score = summer_metrics[summer_metrics['stat'] == 'skill_score']

In [None]:
# Create subplot grid for each metric
g = facet_geoscatter(summer_metrics[summer_metrics['variable'] == 'ET'], boundary_map=moco_geo, col='name', hue='value', size='markersize', normalize_cmap=True,
                     title='ET Summer Metric Overview for Monterey County', title_template={"template": "{col_name}"}, export_img="et_summer_spatial",
                     double_legend=sm_skill_score[sm_skill_score['variable'] == 'ET'], background=True);

In [None]:
# Create subplot grid for each metric
g = facet_geoscatter(summer_metrics[summer_metrics['variable'] == 'ETo'], boundary_map=moco_geo, col='name', hue='value', size='markersize', normalize_cmap=True,
                     title='ETo Summer Metric Overview for Monterey County', title_template={"template": "{col_name}"}, export_img="eto_summer_spatial",
                     double_legend=sm_skill_score[sm_skill_score['variable'] == 'ETo'], background=True);

In [None]:
# Create subplot grid for each metric
g = facet_geoscatter(summer_metrics[summer_metrics['variable'] == 'ETof'], boundary_map=moco_geo, col='name', hue='value', size='markersize', normalize_cmap=True,
                     title='EToF Summer Metric Overview for Monterey County', title_template={"template": "{col_name}"}, export_img="etof_summer_spatial",
                     double_legend=sm_skill_score[sm_skill_score['variable'] == 'ETof'], background=True);