# Classify nested catchments

Author: Thiago Nascimento (thiago.nascimento@eawag.ch)

This notebook is part of the EStreams publication and was used to classify potential nested catchments within the dataset.

* Note that this code enables not only the replicability of the current database but also the extrapolation to new catchment areas. 
* Additionally, the user should download and insert the original raw-data in the folder of the same name prior to run this code. 
* The original third-party data used were not made available in this repository due to redistribution and storage-space reasons.  

## Requirements
**Python:**

* Python>=3.6
* Jupyter
* geopandas=0.10.2
* networkx
* os
* pandas
* shapely
* tqdm

Check the Github repository for an environment.yml (for conda environments) or requirements.txt (pip) file.

**Files:**

* data/estreams_gauging_stations.csv 
* data/estreams_catchments.shp 

**Directory:**

* Clone the GitHub directory locally
* Place any third-data variables in their respective directory.
* ONLY update the "PATH" variable in the section "Configurations", with their relative path to the EStreams directory. 

# Import modules

In [1]:
import pandas as pd
import numpy as np
import tqdm
import os
import geopandas as gpd
import networkx as nx
from shapely.geometry import Polygon, Point

# Configurations

In [3]:
# Only editable variable:
# Relative path to your local directory
PATH = "../../.."

* #### The users should NOT change anything in the code below here. 

In [4]:
# Set the directory:
os.chdir(PATH)

# Import data
## Catchment boundaries

In [5]:
catchment_boundaries = gpd.read_file('data/shapefiles/estreams_catchments.shp')
catchment_boundaries

Unnamed: 0,basin_id,gauge_id,gauge_coun,area,area_calc,area_flag,area_perc,start_date,end_date,gauge_hier,watershed_,geometry
0,AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2019-12-31,14,1,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ..."
1,AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2019-12-31,1,1,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349..."
2,AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2019-12-31,2,1,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122..."
3,AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2019-12-31,1,1,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404..."
4,AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,1,1,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
15042,UAGR0017,6682300,UA,321.0,325.370,0,-1.361371,1978-01-01,1987-12-31,1,1916,"POLYGON Z ((33.96791 44.63291 0.00000, 33.9679..."
15043,UAGR0018,6682500,UA,49.7,47.594,0,4.237425,1978-01-01,1987-12-31,1,1917,"POLYGON Z ((34.19958 44.58291 0.00000, 34.2029..."
15044,UAGR0019,6683010,UA,261.0,244.731,1,6.233333,1978-01-01,1987-12-31,1,1918,"POLYGON Z ((34.19624 44.88375 0.00000, 34.1962..."
15045,UAGR0020,6683200,UA,760.0,731.073,0,3.806184,1978-01-01,1987-12-31,1,1919,"POLYGON Z ((35.78708 47.28708 0.00000, 35.7870..."


## Network information

In [6]:
network_estreams = pd.read_csv('data/streamflow/estreams_gauging_stations.csv', encoding='utf-8')
network_estreams.set_index("basin_id", inplace = True)
network_estreams

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,start_date,end_date,num_years,num_months,num_days,num_days_gaps,num_continuous_days,duplicated_suspect,watershed_group,gauges_upstream
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,1996-01-01 00:00:00,2019-12-31 00:00:00,24,288,8766,0.0,8766,CH000197,1,13
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,1958-10-01 00:00:00,2019-12-31 00:00:00,62,735,22372,0.0,22372,CH000221,1,0
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,1985-01-02 00:00:00,2019-12-31 00:00:00,35,420,12782,0.0,12782,CH000215,1,1
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,1998-01-02 00:00:00,2019-12-31 00:00:00,22,264,8034,0.0,8034,CH000227,1,0
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,72.2,...,1990-01-01 00:00:00,2019-12-31 00:00:00,30,360,10957,0.0,10957,CH000214,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,321.0,...,1978-01-01 00:00:00,1987-12-31 00:00:00,10,120,3652,0.0,3652,,1916,0
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,49.7,...,1978-01-01 00:00:00,1987-12-31 00:00:00,10,120,3652,0.0,3652,,1917,0
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,261.0,...,1978-01-01 00:00:00,1987-12-31 00:00:00,10,120,3652,0.0,3652,,1918,0
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,760.0,...,1978-01-01 00:00:00,1987-12-31 00:00:00,10,120,3652,0.0,3652,,1919,0


## Subset of the catchments to be used

In [None]:
catchments = catchment_boundaries.copy()
network = network_estreams.copy()
catchments

In [7]:
catchments = catchment_boundaries.iloc[0:100,:]
network = network_estreams.iloc[0:100,:]
catchments

Unnamed: 0,basin_id,gauge_id,gauge_coun,area,area_calc,area_flag,area_perc,start_date,end_date,gauge_hier,watershed_,geometry
0,AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2019-12-31,14,1,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ..."
1,AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2019-12-31,1,1,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349..."
2,AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2019-12-31,2,1,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122..."
3,AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2019-12-31,1,1,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404..."
4,AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,1,1,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
95,AT000097,202135,AT,20.9,21.213,0,-1.497608,1987-01-01,2019-12-31,1,2,"POLYGON Z ((12.20085 47.51333 0.00000, 12.2028..."
96,AT000098,201897,AT,94.0,93.866,0,0.142553,1982-01-01,2019-12-31,2,2,"POLYGON Z ((12.24984 47.47782 0.00000, 12.2483..."
97,AT000099,230938,AT,34.0,34.090,0,-0.264706,2001-04-01,2019-12-31,1,2,"POLYGON Z ((12.27123 47.61193 0.00000, 12.2727..."
98,AT000100,202259,AT,6.2,6.211,0,-0.177419,1996-01-02,2019-12-31,1,2,"POLYGON Z ((12.09584 47.56637 0.00000, 12.0952..."


## Make a buffer around the catchments

In [8]:
# Frst we assign a tolerance to overcome problems of catchments with delineations 
# slightly outside the other catchment. 
# This code may take a while.
tolerance = 0.01
catchments_buffer = catchments.copy()
catchments_buffer['geometry'] = catchments['geometry'].buffer(tolerance)


  catchments_buffer['geometry'] = catchments['geometry'].buffer(tolerance)


# Processing
## Nested catchments count

* First we classifiy the catchments according to their possibility of being nested.
* At the end we have groups (main watershed) to where each sub-catchment is assigned. 

In [9]:
# Nested catchments:
# Initialize an empty list to store nested catchments
nested_catchments = []

# Iterate over each catchment
for index, catchment in tqdm.tqdm(catchments.iterrows()):
    # Get the geometry of the current catchment
    geom = catchment['geometry']
    
    # Iterate over other catchments to check if they are nested
    for index2, other_catchment in catchments_buffer.iterrows():
        # Skip the same catchment
        if index == index2:
            continue
        
        other_geom = other_catchment['geometry']
        
        # Check if the current catchment is completely within the other catchment
        if geom.within(other_geom):
            nested_catchments.append((catchment.basin_id, other_catchment.basin_id))

100it [00:00, 329.04it/s]


In [11]:
# Create the big-groups (main watershed):
# Initialize an empty graph
G = nx.Graph()

# Add nodes for each catchment
for index, catchment in catchments.iterrows():
    G.add_node(catchment['basin_id'])

# Add edges for nested catchments
for nested_pair in nested_catchments:
    G.add_edge(nested_pair[0], nested_pair[1])

# Find connected components
groups = list(nx.connected_components(G))

# Assign groups to catchments
group_assignment = {}
for i, group in enumerate(groups):
    for catchment_id in group:
        group_assignment[catchment_id] = i + 1  # Assigning group numbers starting from 1

# Update the catchments GeoDataFrame with the group assignments
catchments['watershed_group'] = catchments['basin_id'].map(group_assignment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [12]:
catchments.head(5)

Unnamed: 0,basin_id,gauge_id,gauge_coun,area,area_calc,area_flag,area_perc,start_date,end_date,gauge_hier,watershed_,geometry,watershed_main,watershed_group
0,AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2019-12-31,14,1,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ...",1,1
1,AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2019-12-31,1,1,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349...",1,1
2,AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2019-12-31,2,1,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122...",1,1
3,AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2019-12-31,1,1,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404...",1,1
4,AT000005,200097,AT,72.2,72.448,0,-0.34349,1990-01-01,2019-12-31,1,1,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ...",1,1


In [13]:
nested_catchments_df = pd.DataFrame(nested_catchments)
nested_catchments_df.columns = ["catchment_1", "catchment_2"]
nested_catchments_df

Unnamed: 0,catchment_1,catchment_2
0,AT000001,AT000013
1,AT000002,AT000003
2,AT000002,AT000007
3,AT000002,AT000009
4,AT000002,AT000013
...,...,...
311,AT000093,AT000096
312,AT000094,AT000095
313,AT000094,AT000096
314,AT000095,AT000096


In [None]:
nested_catchments_df.to_excel("results/extras/estreams_nested_catchments_list.xlsx")

## Gauges upstream:
* Here we count the number of gauges upstream the given catchment.
* A headwater catchment will have a number 0, while a downstream catchment that has two gauges within (not counting the outlet) has a number 2.

In [None]:
# Assign the index to the shapefile:
catchments.set_index("basin_id", inplace = True)

# Keep one field with the same name:
catchments["basin_id"] = catchments.index

In [None]:
# Create one field with the same name as the index:
network["basin_id"]= network.index
network

In [None]:
# Create a geometry column with Point objects for being used:
geometry = [Point(lon, lat) for lon, lat in zip(network['lon_snap'], network['lat_snap'])]

# Create a GeoDataFrame
network = gpd.GeoDataFrame(network, geometry=geometry)

# Optional: Set the coordinate reference system (CRS) if known
# For example, if your coordinates are in WGS84 (EPSG:4326)
network.crs = 'EPSG:4326'

### Apply the count taking into account some filters:
       - Points to pay attention:
* Outlet is seldom slightly outside the shapefile. 
* Catchment outlet has one duplicate within the shapefile.
* Catchments within the shapefile also have duplicates. 

       - Solution:
* We exclude the catchment own outlet from the count for all catchments. 
* We apply a filter to delete the catchment outlet to count duplicated_suspects that are within the catchment shapefile. 
* We count the number of duplicates, and when it is even, we simply divide per 2 and substract at the end count = count - (n/2). If it is odd, we do count = count - ((n - 1)/2 + 1). The reason is that when we have a two duplicates, they could delete each other.

In [None]:
# Spatial join to count geometries within the catchments shapefile
joined = gpd.sjoin(catchments, network, how='inner', op='intersects')

# Exclude geometries with the same "basin_id" as in the network GeoDataFrame (exclude the outlet):
joined_filtered = joined[joined['basin_id_left'] != joined['basin_id_right']]

# Here we create a function to deal with the duplicates of the outlet when they happen to be within:
# Parse the "duplicated_suspect" column to extract individual basin_ids
def parse_duplicated_suspect(suspect):
    if pd.isna(suspect):
        return []
    else:
        return suspect.split(', ')

joined_filtered['duplicated_suspect_ids'] = joined_filtered['duplicated_suspect'].apply(parse_duplicated_suspect)

# Exclude basin IDs from the count when there are duplicated suspects
def exclude_duplicated_suspects(row):
    if len(row['duplicated_suspect_ids']) > 0:
        return row['basin_id_left'] not in row['duplicated_suspect_ids']
    else:
        return True

joined_filtered = joined_filtered[joined_filtered.apply(exclude_duplicated_suspects, axis=1)]

# Count the number of geometries for each unique "basin_id" in the catchments shapefile
count_per_basin = joined_filtered['basin_id_left'].value_counts()

# Count the number of non-null values in the "duplicated_suspect" column for each basin ID
duplicates_count = joined_filtered.groupby('basin_id_left')['duplicated_suspect'].count()

# Adjust the count based on the number of duplicates within each catchment
for basin_id, count in duplicates_count.items():
    if count % 2 == 0:
        count_per_basin[basin_id] -= count // 2
    else:
        count_per_basin[basin_id] -= (count - 1) // 2
        count_per_basin[basin_id] += 1

network["gauges_upstream"] = np.nan      
network["gauges_upstream"] = count_per_basin

# Filter the potential NaNs:
network['gauges_upstream'] = network['gauges_upstream'].fillna(0)

network.head(10)

## Assign the new values to the network:

In [None]:
network_estreams['watershed_group'] = catchments['watershed_group']
network_estreams

In [None]:
network_estreams['gauges_upstream'] = network['gauges_upstream'].astype(int)
network_estreams

In [None]:
network_estreams[network_estreams.gauges_upstream== 1524]

### Shapefile

In [None]:
catchment_boundaries

In [None]:
catchment_boundaries.set_index("basin_id", inplace = True)
catchment_boundaries

In [None]:
# Retrieve the information needed:
catchment_boundaries["gauges_upstream"] = network['gauges_upstream'].astype(int)
catchment_boundaries['watershed_group'] = network['watershed_main'].astype(int)

# Adjust the columns order and names:
catchment_boundaries = catchment_boundaries[['gauge_id', 'gauge_coun', 'area', 'area_calc',
       'area_flag', 'area_perc', 'start_date', 'end_date', 'gauges_upstream', 'watershed_group', 'geometry']]

catchment_boundaries.columns = ['gauge_id', 'gauge_country', 'area', 'area_calc',
       'area_flag', 'area_perc', 'start_date', 'end_date', 'gauges_upstream', 'watershed_group', 'geometry']

In [None]:
catchment_boundaries

## Save the data

In [None]:
# Save the dataframe:
network_estreams.to_csv('results/extras/estreams_gauging_stations.csv', encoding='utf-8')
# Save the shapefile:
catchment_boundaries.to_file('results/extras/estreams_catchments.shp')

## End