In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import xarray as xr

%matplotlib widget
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm
from shapely.geometry import Point

# CSV holding the sampled river-reach IDs: a single column of COMID values,
# one per row, with no header line.
csv_file = 'data/grades_rivids_stratified_discharge_level2_sample.csv'

# Root folder of the input datasets; the cells below read from its
# 'GRADES_hydroDL' and 'MERIT_Basins' subfolders.
datasets = Path('/work/pi_kandread_umass_edu/Datasets/')

In [None]:
# Build a discharge-stratified random sample of river reaches and save the
# sampled COMIDs to `csv_file`.
#
# For every Pfafstetter region file (pfaf 1-9):
#   1. compute the time-mean of the GRADES dataset over `time_slice`,
#   2. join the mean Qout onto the MERIT reaches by reach ID,
#   3. keep reaches with mean Qout > 1 and bin them on log-spaced edges,
#   4. within each level-2 group (COMID // 1e6), draw up to `sample_size`
#      reaches from every discharge bin.
#
# The IDs are collected in memory and written once at the end, replacing any
# existing file, so re-running this cell can never append duplicate IDs.

SEED = 42  # fixed seed so the random sample is reproducible across runs
rng = np.random.default_rng(SEED)

time_slice = slice('2000-02-24', '2023-08-31')
sample_size = 100  # maximum number of reaches drawn from each stratum
bin_edges = np.logspace(0, 6, 7)  # decade bin edges: 1, 10, ..., 1e6
pfafs = range(1, 10)

sampled_ids = []
for pfaf in pfafs:
    print(f"Basins {pfaf}")
    grades_file = datasets / 'GRADES_hydroDL' / f'output_pfaf_{pfaf:02.0f}_1979_202308.nc'
    merit_file = datasets / 'MERIT_Basins' / f'riv_pfaf_{pfaf:1.0f}_MERIT_Hydro_v07_Basins_v01_bugfix1.shp'

    print('Reading GRADES')
    # Evaluate the lazy (dask-backed) mean while the file is still open, and
    # only once: the in-memory result feeds both the NetCDF export and the
    # join below instead of re-reading the full time series for each.
    with xr.open_dataset(grades_file, chunks={'rivid': 100000}) as grades:
        mean_values = grades.sel(time=time_slice).mean(dim='time').compute()

    mean_values.to_netcdf(f'data/grades_pfaf_{pfaf:02.0f}_meanQ.nc')
    mean_Q_series = mean_values['Qout'].to_series()

    print('Reading MERIT')
    merit_df = gpd.read_file(merit_file).set_index('COMID')

    print("Merging data")
    merged_df = merit_df.join(mean_Q_series)
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not act on a view of the joined frame.
    merged_df = merged_df[merged_df['Qout'] > 1].copy()
    merged_df['q_bins'] = pd.cut(merged_df['Qout'], bins=bin_edges, labels=False)

    print("Sampling data")
    merged_df['l2'] = merged_df.index // 1E6
    for _, l2_group in merged_df.groupby('l2'):
        # Stratified sampling based on the discretized variable
        stratified_sample = (
            l2_group.groupby('q_bins', group_keys=False)
            .apply(lambda x: x.sample(min(len(x), sample_size), random_state=rng),
                   include_groups=False)
        )
        # The frame is indexed by COMID, so the index holds the sampled IDs.
        sampled_ids.extend(stratified_sample.index)

# Single write, no header and no index: one COMID per row.
pd.Series(sampled_ids).to_csv(csv_file, index=False, header=False)
    

In [None]:
# Histogram of log10(Qout) for the reaches currently held in `merged_df`,
# using the log10 of the sampling bin edges as histogram bins.
plt.close('all')
log_bin_edges = np.log10(bin_edges)
log_mean_q = np.log10(merged_df['Qout'])
plt.hist(log_mean_q, bins=log_bin_edges, edgecolor='black')


In [None]:
# Load the sampled reach IDs and report how many level-2 groups
# (id // 1e6) and how many individual reaches the sample contains.
id_df = pd.read_csv(csv_file, header=None)
id_df = id_df.rename(columns={0: 'id'})

l2_codes = id_df['id'] // 1E6
n_l2_basins = len(l2_codes.unique())
n_reaches = len(id_df)

print(f"{n_l2_basins} L2 basins")
print(f"{n_reaches} River reaches")

In [None]:
# Read the sampled reach IDs, then pull the matching rows out of every
# MERIT river shapefile and stack them into one GeoDataFrame.
id_df = pd.read_csv(csv_file, header=None)
id_df = id_df.rename(columns={0: 'id'})

merit_dir = datasets / 'MERIT_Basins'
pfafs = range(1,10)
gdf_list = []
for pfaf in tqdm(pfafs):
    merit_file = merit_dir / f'riv_pfaf_{pfaf:1.0f}_MERIT_Hydro_v07_Basins_v01_bugfix1.shp'
    merit_df = gpd.read_file(merit_file)
    # Keep only the reaches whose COMID appears in the sample
    is_sampled = merit_df['COMID'].isin(id_df['id'])
    merit_df = merit_df[is_sampled]
    gdf_list.append(merit_df)

# Stack the per-region frames row-wise with a fresh integer index, then make
# sure the result is a GeoDataFrame using the 'geometry' column.
stacked = pd.concat(gdf_list, ignore_index=True)
gdf = gpd.GeoDataFrame(stacked, geometry='geometry')

# Simplify the geometries to a point instead of polyline
def centermost_point(line):
    """Return the point located halfway along ``line``.

    The halfway distance is half of ``line.length``; the point is whatever
    ``line.interpolate`` returns for that distance.
    """
    return line.interpolate(line.length / 2)

# Replace every reach geometry with the single point returned by
# centermost_point, so the layer stores points instead of polylines.
gdf['geometry'] = gdf['geometry'].apply(centermost_point)

# Index the sites by reach ID and name that index 'id'
gdf = gdf.set_index('COMID')
gdf.index.names = ['id']

output_shapefile = 'data/MERIT_sites/merit_stratified_discharge_level2_sample.shp'
# Create the output folder first so the write does not fail when the
# directory is missing (e.g. on a fresh checkout).
Path(output_shapefile).parent.mkdir(parents=True, exist_ok=True)
gdf.to_file(output_shapefile)

In [None]:
# Reload the sampled sites from the saved shapefile. The path is repeated
# here so this cell can run without the cell that wrote the file.
output_shapefile = 'data/MERIT_sites/merit_stratified_discharge_level2_sample.shp'
gdf = gpd.read_file(output_shapefile)

In [None]:
# Quick visual check: draw the geometries in `gdf` with small markers.
gdf.plot(markersize=2)