# Complementary extra codes: Group basins, find nested catchments and number of gauges upstream

Author: Thiago Nascimento (thiago.nascimento@eawag.ch)

This notebook complements the EStreams publication. The code is divided into first assigning groups for each catchment based on their conectivity (e.g., Rhine, Danube, etc); creating a list of all the nested catchments within each basin; and computing the number of gauges upstream the given basin. 

* Note that this code enables not only the replicability of the current database but also the extrapolation to new catchment areas. 
* Additionally, the user should download and insert the original raw-data in the folder of the same name prior to run this code. 
* The original third-party data used were not made available in this repository due to redistribution and storage-space reasons.  

## Requirements
**Python:**

* Python>=3.6
* Jupyter
* geopandas=0.10.2
* numpy
* os
* pandas
* shapely
* networkx
* tqdm

Check the Github repository for an environment.yml (for conda environments) or requirements.txt (pip) file.

**Files:**

* results/estreams_catchments.shp 
* results/estreams_gauging_stations.csv

**Directory:**

* Clone the GitHub directory locally
* Place any third-data variables in their respective directory.
* ONLY update the "PATH" variable in the section "Configurations", with their relative path to the EStreams directory. 

# Import modules

In [1]:
import pandas as pd
import numpy as np
import tqdm as tqdm
import os
import geopandas as gpd
import networkx as nx
from shapely.geometry import Polygon, Point
import time

# Configurations

In [2]:
# Only editable variable:
# Relative path to your local directory
PATH = "../../.."

* #### The users should NOT change anything in the code below here. 

In [3]:
# Non-editable variables:
PATH_OUTPUT = "results/"

# Set the directory:
os.chdir(PATH)

# Import data
## Catchment boundaries

In [4]:
catchment_boundaries = gpd.read_file('results/estreams_catchments.shp')
catchment_boundaries

Unnamed: 0,basin_id,gauge_id,gauge_coun,area,area_calc,area_flag,area_perc,start_date,end_date,geometry
0,AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2021-12-31,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ..."
1,AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2021-12-31,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349..."
2,AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2021-12-31,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122..."
3,AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2021-12-31,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404..."
4,AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ..."
...,...,...,...,...,...,...,...,...,...,...
17125,UAGR0017,6682300,UA,321.0,325.370,0,-1.361371,1978-01-01,1987-12-31,"POLYGON Z ((33.96791 44.63291 0.00000, 33.9679..."
17126,UAGR0018,6682500,UA,49.7,47.594,0,4.237425,1978-01-01,1987-12-31,"POLYGON Z ((34.19958 44.58291 0.00000, 34.2029..."
17127,UAGR0019,6683010,UA,261.0,244.731,1,6.233333,1978-01-01,1987-12-31,"POLYGON Z ((34.19624 44.88375 0.00000, 34.1962..."
17128,UAGR0020,6683200,UA,760.0,731.073,0,3.806184,1978-01-01,1987-12-31,"POLYGON Z ((35.78708 47.28708 0.00000, 35.7870..."


## Network information

In [5]:
network_EU = pd.read_csv('results/estreams_gauging_stations_duplicates.csv', encoding='utf-8')
network_EU.set_index("basin_id", inplace = True)
network_EU

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,num_months,num_days,num_days_gaps,num_continuous_days,geometry,duplicated_suspect,watershed_group,gauges_upstream,gauge_downstream,nested_catchments
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,312,9497,0.0,9497,POINT (9.534835180274024 47.27374823144418),['CH000197'],1,14,AT000013,"['AT000001', 'CH000010', 'CH000046', 'CH000048..."
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,759,23103,0.0,23103,POINT (9.913676603393986 47.08030125096045),['CH000221'],1,1,CH000221,['AT000002']
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,444,13513,0.0,13513,POINT (9.847765104487962 47.13282061553542),['CH000215'],1,2,CH000215,"['AT000002', 'AT000003', 'CH000221']"
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,288,8765,0.0,8765,POINT (10.06184292326845 47.1289938468501),['CH000227'],1,1,CH000227,['AT000004']
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,72.2,...,360,10957,0.0,10957,POINT (9.802668269683238 47.15077039253907),['CH000214'],1,1,CH000214,"['AT000005', 'CH000214']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,321.0,...,120,3652,0.0,3652,POINT (33.9 44.683333),,1988,1,,['UAGR0017']
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,49.7,...,120,3652,0.0,3652,POINT (34.166667 44.5),,1989,1,,['UAGR0018']
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,261.0,...,120,3652,0.0,3652,POINT (34.2 44.883333),,1990,1,,['UAGR0019']
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,760.0,...,120,3652,0.0,3652,POINT (35.705833 47.251389),,1991,1,,['UAGR0020']


## Subset of the catchments to be used

In [6]:
catchments = catchment_boundaries.iloc[:, :]

network = network_EU.copy()
catchments

Unnamed: 0,basin_id,gauge_id,gauge_coun,area,area_calc,area_flag,area_perc,start_date,end_date,geometry
0,AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2021-12-31,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ..."
1,AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2021-12-31,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349..."
2,AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2021-12-31,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122..."
3,AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2021-12-31,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404..."
4,AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ..."
...,...,...,...,...,...,...,...,...,...,...
17125,UAGR0017,6682300,UA,321.0,325.370,0,-1.361371,1978-01-01,1987-12-31,"POLYGON Z ((33.96791 44.63291 0.00000, 33.9679..."
17126,UAGR0018,6682500,UA,49.7,47.594,0,4.237425,1978-01-01,1987-12-31,"POLYGON Z ((34.19958 44.58291 0.00000, 34.2029..."
17127,UAGR0019,6683010,UA,261.0,244.731,1,6.233333,1978-01-01,1987-12-31,"POLYGON Z ((34.19624 44.88375 0.00000, 34.1962..."
17128,UAGR0020,6683200,UA,760.0,731.073,0,3.806184,1978-01-01,1987-12-31,"POLYGON Z ((35.78708 47.28708 0.00000, 35.7870..."


## Make a buffer around the catchments
* We can either make the buffer here, or upload an already buffered version (made using QGIS) which is faster. 
* The buffering using Python may take a considerable while. Interestingly if one make the buffer first for a subset and then to the complete list, it processes faster.

In [7]:
# First we assign a tolerance to overcome problems of catchments with delineations 
# slightly outside the other catchment. 
# This code may take a while.

start_time = time.time()
tolerance = 0.01
catchments_buffer = catchments.copy()
catchments_buffer['geometry'] = catchments['geometry'].buffer(tolerance)
end_time = time.time()

# Print the total time elapsed
print("Elapsed time: {:.1f} seconds".format(end_time - start_time))


  catchments_buffer['geometry'] = catchments['geometry'].buffer(tolerance)


Elapsed time: 9543.9 seconds


# Processing
## Nested catchments groups

* First we classifiy the catchments according to their possibility of being nested.
* At the end we have groups (main watershed) to where each sub-catchment is assigned.
* For example, watershed_group == 1 corresponds to the Rhine.

In [8]:
# Nested catchments:
# Initialize an empty list to store nested catchments
nested_catchments = []

# Iterate over each catchment
for index, catchment in tqdm.tqdm(catchments.iterrows()):
    # Get the geometry of the current catchment
    geom = catchment['geometry']
    
    # Iterate over other catchments to check if they are nested
    for index2, other_catchment in catchments_buffer.iterrows():
        # Skip the same catchment
        if index == index2:
            continue
        
        other_geom = other_catchment['geometry']
        
        # Check if the current catchment is completely within the other catchment
        if geom.within(other_geom):
            nested_catchments.append((catchment.basin_id, other_catchment.basin_id))

17130it [2:48:25,  1.70it/s]


In [9]:
# Create the big-groups (main watershed):
# Initialize an empty graph
G = nx.Graph()

# Add nodes for each catchment
for index, catchment in catchments.iterrows():
    G.add_node(catchment['basin_id'])

# Add edges for nested catchments
for nested_pair in nested_catchments:
    G.add_edge(nested_pair[0], nested_pair[1])

# Find connected components
groups = list(nx.connected_components(G))

# Assign groups to catchments
group_assignment = {}
for i, group in enumerate(groups):
    for catchment_id in group:
        group_assignment[catchment_id] = i + 1  # Assigning group numbers starting from 1

# Update the catchments GeoDataFrame with the group assignments
catchments['watershed_group'] = catchments['basin_id'].map(group_assignment)

In [10]:
catchments.head(5)

Unnamed: 0,basin_id,gauge_id,gauge_coun,area,area_calc,area_flag,area_perc,start_date,end_date,geometry,watershed_group
0,AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2021-12-31,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ...",1
1,AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2021-12-31,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349...",1
2,AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2021-12-31,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122...",1
3,AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2021-12-31,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404...",1
4,AT000005,200097,AT,72.2,72.448,0,-0.34349,1990-01-01,2019-12-31,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ...",1


In [11]:
catchments[catchments.watershed_group == 1]

Unnamed: 0,basin_id,gauge_id,gauge_coun,area,area_calc,area_flag,area_perc,start_date,end_date,geometry,watershed_group
0,AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2021-12-31,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ...",1
1,AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2021-12-31,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349...",1
2,AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2021-12-31,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122...",1
3,AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2021-12-31,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404...",1
4,AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ...",1
...,...,...,...,...,...,...,...,...,...,...,...
14627,LU000016,3,LU,360.5,387.289,0,-7.431068,2002-01-01,2021-07-31,"POLYGON Z ((6.04625 49.39291 0.00000, 6.04625 ...",1
14628,LU000017,16,LU,4231.8,4255.524,0,-0.560613,2002-01-01,2021-07-31,"POLYGON Z ((6.12041 49.50791 0.00000, 6.12041 ...",1
14629,LU000018,5,LU,83.6,83.614,0,-0.016746,2002-01-01,2021-07-31,"POLYGON Z ((6.09374 49.72458 0.00000, 6.09874 ...",1
14630,LU000019,12,LU,641.3,638.434,0,0.446905,2002-01-01,2021-07-31,"POLYGON Z ((6.18958 49.99625 0.00000, 6.19041 ...",1


In [12]:
nested_catchments_df = pd.DataFrame(nested_catchments)
nested_catchments_df.columns = ["catchment_1", "catchment_2"]
nested_catchments_df

Unnamed: 0,catchment_1,catchment_2
0,AT000001,AT000013
1,AT000001,CH000026
2,AT000001,CH000042
3,AT000001,CH000092
4,AT000001,CH000185
...,...,...
265408,UAGR0011,UAGR0014
265409,UAGR0012,UAGR0014
265410,UAGR0013,UAGR0012
265411,UAGR0013,UAGR0014


In [13]:
nested_catchments_df.to_excel("results/extras/nested_catchments_assignment_one2one.xlsx")

## Nested catchments within 
* Here we provide the list of nested catchments within each catchment. 

In [14]:
# Create a geometry column with Point objects for being used:
geometry = [Point(lon, lat) for lon, lat in zip(network['lon_snap'], network['lat_snap'])]

# Create a GeoDataFrame
network = gpd.GeoDataFrame(network, geometry=geometry)

# Optional: Set the coordinate reference system (CRS) if known
# For example, if your coordinates are in WGS84 (EPSG:4326)
network.crs = 'EPSG:4326'

In [15]:
# List to store the results
catchments_nested = []

# Iterate through each catchments geometry
for i, catchment in tqdm.tqdm(catchments.iterrows()):
    # Find the network points located within the current catchments geometry
    network_in_catchment = network[network.within(catchment.geometry)]

    # Get the indices of the network points within the current catchments geometry
    indices = network_in_catchment.index.tolist()

    # Append the list of indices to the results list
    catchments_nested.append(indices)

17130it [44:53,  6.36it/s]


In [16]:
# Convert the list of lists to a pandas DataFrame
catchments_nested_df = pd.DataFrame({'nested_catchments': catchments_nested})

# Set the index of the DataFrame to be the index of the catchments GeoDataFrame
catchments_nested_df.index = catchments.basin_id

# Check each row and replace empty lists with the index value
# It may happen when the outlet is slightly outside (coordinates) the shapefile
for index, row in catchments_nested_df.iterrows():
    if not row['nested_catchments']:
        catchments_nested_df.at[index, 'nested_catchments'] = [index]  # Replace the empty list with the index as a list
          
catchments_nested_df

Unnamed: 0_level_0,nested_catchments
basin_id,Unnamed: 1_level_1
AT000001,"[AT000001, CH000010, CH000046, CH000048, CH000..."
AT000002,[AT000002]
AT000003,"[AT000002, AT000003, CH000221]"
AT000004,[AT000004]
AT000005,"[AT000005, CH000214]"
...,...
UAGR0017,[UAGR0017]
UAGR0018,[UAGR0018]
UAGR0019,[UAGR0019]
UAGR0020,[UAGR0020]


In [None]:
# Here we make sure that the outlet is within the list:
# Ensure that the basin_id is in the nested_catchments
for basin_id in catchments_nested_df.index:
    if basin_id not in catchments_nested_df.at[basin_id, 'nested_catchments']:
        catchments_nested_df.at[basin_id, 'nested_catchments'].append(basin_id)

In [17]:
network.loc[catchments_nested_df.loc["AT000001", "nested_catchments"]]

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,num_months,num_days,num_days_gaps,num_continuous_days,geometry,duplicated_suspect,watershed_group,gauges_upstream,gauge_downstream,nested_catchments
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,312,9497,0.0,9497,POINT (9.53484 47.27375),['CH000197'],1,14,AT000013,"['AT000001', 'CH000010', 'CH000046', 'CH000048..."
CH000010,2033,Ilanz,CH,CH_CAMELS,Vorderrhein,9.21,46.78,9.21,46.78,774.0,...,480,14610,0.0,14610,POINT (9.21000 46.78000),,1,2,CH000175,['CH000139']
CH000046,2141,Tiefencastel,CH,CH_CAMELS,Albula,9.57,46.66,9.57,46.66,529.0,...,480,14610,0.0,14610,POINT (9.57000 46.66000),,1,4,CH000125,"['CH000105', 'CH000113', 'CH000135']"
CH000048,2150,Felsenbach,CH,CH_CAMELS,Landquart,9.61,46.97,9.61,46.97,613.7,...,480,14610,0.0,14610,POINT (9.61000 46.97000),,1,1,AT000013,['CH000048']
CH000062,2185,Chur,CH,CH_CAMELS,Plessur,9.51,46.86,9.51,46.86,264.4,...,480,14610,0.0,14610,POINT (9.51000 46.86000),,1,1,AT000013,['CH000062']
CH000105,2327,Davos-Kriegsmatte,CH,CH_CAMELS,Dischmabach,9.88,46.78,9.88,46.78,42.9,...,480,14610,0.0,14610,POINT (9.88000 46.78000),,1,1,CH000113,['CH000105']
CH000113,2355,Davos-Frauenkirch,CH,CH_CAMELS,Landwasser,9.79,46.76,9.79,46.76,183.7,...,480,14610,0.0,14610,POINT (9.79000 46.76000),,1,2,CH000046,"['CH000105', 'CH000113']"
CH000125,2387,Fürstenau,CH,CH_CAMELS,Hinterrhein,9.45,46.72,9.45,46.72,1576.9,...,480,14610,0.0,14610,POINT (9.45000 46.72000),,1,6,CH000197,"['CH000046', 'CH000105', 'CH000113', 'CH000135..."
CH000129,2410,Ruggell,CH,CH_CAMELS,Liechtensteiner_Binnenkanal,9.52,47.24,9.52,47.24,114.5,...,480,14610,0.0,14610,POINT (9.52000 47.24000),,1,1,DEBW0185,['CH000129']
CH000135,2418,Tiefencastel,CH,CH_CAMELS,Julia,9.58,46.66,9.58,46.66,324.7,...,480,14610,0.0,14610,POINT (9.58000 46.66000),,1,1,CH000125,['CH000135']


In [19]:
catchments_nested_df.to_csv("results/extras/estreams_gauging_stations_nested_catchments.csv")

## Number of unique gauges upstream
* Here we comoute the number of gauges upstream.
* A headwater catchment will have a number 1, while a downstream catchment that has two gauges within (not counting the outlet) has a number 3.

In [20]:
# Assign the index to the shapefile:
catchments.set_index("basin_id", inplace = True)

# Keep one field with the same name:
catchments["basin_id"] = catchments.index

In [21]:
# Create one field with the same name as the index:
network["basin_id"]= network.index
network

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,num_days,num_days_gaps,num_continuous_days,geometry,duplicated_suspect,watershed_group,gauges_upstream,gauge_downstream,nested_catchments,basin_id
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,9497,0.0,9497,POINT (9.53484 47.27375),['CH000197'],1,14,AT000013,"['AT000001', 'CH000010', 'CH000046', 'CH000048...",AT000001
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,23103,0.0,23103,POINT (9.91368 47.08030),['CH000221'],1,1,CH000221,['AT000002'],AT000002
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,13513,0.0,13513,POINT (9.84777 47.13282),['CH000215'],1,2,CH000215,"['AT000002', 'AT000003', 'CH000221']",AT000003
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,8765,0.0,8765,POINT (10.06184 47.12899),['CH000227'],1,1,CH000227,['AT000004'],AT000004
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,72.2,...,10957,0.0,10957,POINT (9.80267 47.15077),['CH000214'],1,1,CH000214,"['AT000005', 'CH000214']",AT000005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,321.0,...,3652,0.0,3652,POINT (33.89474 44.69188),,1988,1,,['UAGR0017'],UAGR0017
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,49.7,...,3652,0.0,3652,POINT (34.16667 44.50000),,1989,1,,['UAGR0018'],UAGR0018
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,261.0,...,3652,0.0,3652,POINT (34.19984 44.88769),,1990,1,,['UAGR0019'],UAGR0019
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,760.0,...,3652,0.0,3652,POINT (35.70583 47.25139),,1991,1,,['UAGR0020'],UAGR0020


In [22]:
# Create a geometry column with Point objects for being used:
geometry = [Point(lon, lat) for lon, lat in zip(network['lon_snap'], network['lat_snap'])]

# Create a GeoDataFrame
network = gpd.GeoDataFrame(network, geometry=geometry)

# Optional: Set the coordinate reference system (CRS) if known
# For example, if your coordinates are in WGS84 (EPSG:4326)
network.crs = 'EPSG:4326'

### Apply the count taking into account some filters:
       - Points to pay attention:
* Outlet is seldom slightly outside the shapefile. 
* Catchment outlet has one duplicate within the shapefile.
* Catchments within the shapefile also have duplicates. 

       - Solution:
* We exclude the outlet from the count, and count + 1 at the end for all catchments. 
* We apply a filter to delete the catchment outlet to count duplicated_suspects that are within the catchment shapefile. 
* We count the number of duplicates, and when it is even, we simply divide per 2 and substract at the end count = count - (n/2). If it is odd, we do count = count - ((n - 1)/2 + 1). The reason is that when we have a two duplicates, they could delete each other.

In [23]:
# Spatial join to count geometries within the catchments shapefile
joined = gpd.sjoin(catchments, network, how='inner', op='intersects')

# Exclude geometries with the same "basin_id" as in the network GeoDataFrame (exclude the outlet):
joined_filtered = joined[joined['basin_id_left'] != joined['basin_id_right']]

# Here we create a function to deal with the duplicates of the outlet when they happen to be within:
# Parse the "duplicated_suspect" column to extract individual basin_ids
def parse_duplicated_suspect(suspect):
    if pd.isna(suspect):
        return []
    else:
        return suspect.split(', ')

joined_filtered['duplicated_suspect_ids'] = joined_filtered['duplicated_suspect'].apply(parse_duplicated_suspect)

# Exclude basin IDs from the count when there are duplicated suspects
def exclude_duplicated_suspects(row):
    if len(row['duplicated_suspect_ids']) > 0:
        return row['basin_id_left'] not in row['duplicated_suspect_ids']
    else:
        return True

joined_filtered = joined_filtered[joined_filtered.apply(exclude_duplicated_suspects, axis=1)]

# Count the number of geometries for each unique "basin_id" in the catchments shapefile
count_per_basin = joined_filtered['basin_id_left'].value_counts()

# Count the number of non-null values in the "duplicated_suspect" column for each basin ID
duplicates_count = joined_filtered.groupby('basin_id_left')['duplicated_suspect'].count()

# Adjust the count based on the number of duplicates within each catchment
for basin_id, count in duplicates_count.items():
    if count % 2 == 0:
        count_per_basin[basin_id] -= count // 2
    else:
        count_per_basin[basin_id] -= (count - 1) // 2
        count_per_basin[basin_id] += 1

# Here we add 1 station to include the outlet
count_per_basin += 1

network["gauges_upstream"] = np.nan      
network["gauges_upstream"] = count_per_basin

# Filter the potential NaNs:
network['gauges_upstream'] = network['gauges_upstream'].fillna(1)

network.head(10)

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,num_days,num_days_gaps,num_continuous_days,geometry,duplicated_suspect,watershed_group,gauges_upstream,gauge_downstream,nested_catchments,basin_id
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,9497,0.0,9497,POINT (9.53484 47.27375),['CH000197'],1,16.0,AT000013,"['AT000001', 'CH000010', 'CH000046', 'CH000048...",AT000001
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,23103,0.0,23103,POINT (9.91368 47.08030),['CH000221'],1,1.0,CH000221,['AT000002'],AT000002
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,13513,0.0,13513,POINT (9.84777 47.13282),['CH000215'],1,2.0,CH000215,"['AT000002', 'AT000003', 'CH000221']",AT000003
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,8765,0.0,8765,POINT (10.06184 47.12899),['CH000227'],1,1.0,CH000227,['AT000004'],AT000004
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.15077,9.802668,47.15077,72.2,...,10957,0.0,10957,POINT (9.80267 47.15077),['CH000214'],1,3.0,CH000214,"['AT000005', 'CH000214']",AT000005
AT000006,200105,Garsella,AT,AT_EHYD,Lutz,9.875898,47.226658,9.875898,47.226658,95.5,...,21185,0.0,21185,POINT (9.87590 47.22666),['CH000218'],1,3.0,CH000218,"['AT000006', 'CH000218']",AT000006
AT000007,231688,Beschling,AT,AT_EHYD,Ill,9.67885,47.200301,9.67885,47.200301,1118.6,...,13514,0.0,13514,POINT (9.67885 47.20030),['CH000205'],1,8.0,AT000009,"['AT000002', 'AT000003', 'AT000004', 'AT000005...",AT000007
AT000008,200501,Amerluegen,AT,AT_EHYD,Samina,9.614203,47.205978,9.614203,47.205978,70.0,...,11322,0.0,11322,POINT (9.61420 47.20598),['CH000201'],1,1.0,CH000201,['AT000008'],AT000008
AT000009,200147,Gisingen,AT,AT_EHYD,Ill,9.57888,47.260362,9.57888,47.260362,1281.0,...,25933,0.0,25933,POINT (9.57888 47.26036),['CH000199'],1,10.0,CH000199,"['AT000002', 'AT000003', 'AT000004', 'AT000005...",AT000009
AT000010,200154,Laterns,AT,AT_EHYD,Frutz,9.728853,47.256933,9.728853,47.256933,33.4,...,24104,3.0,10517,POINT (9.72885 47.25693),['CH000209'],1,1.0,CH000209,['AT000010'],AT000010


## Assign the new values to the network:

In [27]:
network_EU['watershed_group'] = catchments['watershed_group']
network_EU

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,num_months,num_days,num_days_gaps,num_continuous_days,geometry,duplicated_suspect,watershed_group,gauges_upstream,gauge_downstream,nested_catchments
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,312,9497,0.0,9497,POINT (9.534835180274024 47.27374823144418),['CH000197'],1,14,AT000013,"['AT000001', 'CH000010', 'CH000046', 'CH000048..."
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,759,23103,0.0,23103,POINT (9.913676603393986 47.08030125096045),['CH000221'],1,1,CH000221,['AT000002']
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,444,13513,0.0,13513,POINT (9.847765104487962 47.13282061553542),['CH000215'],1,2,CH000215,"['AT000002', 'AT000003', 'CH000221']"
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,288,8765,0.0,8765,POINT (10.06184292326845 47.1289938468501),['CH000227'],1,1,CH000227,['AT000004']
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,72.2,...,360,10957,0.0,10957,POINT (9.802668269683238 47.15077039253907),['CH000214'],1,1,CH000214,"['AT000005', 'CH000214']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,321.0,...,120,3652,0.0,3652,POINT (33.9 44.683333),,1988,1,,['UAGR0017']
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,49.7,...,120,3652,0.0,3652,POINT (34.166667 44.5),,1989,1,,['UAGR0018']
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,261.0,...,120,3652,0.0,3652,POINT (34.2 44.883333),,1990,1,,['UAGR0019']
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,760.0,...,120,3652,0.0,3652,POINT (35.705833 47.251389),,1991,1,,['UAGR0020']


In [28]:
network_EU['gauges_upstream'] = network['gauges_upstream'].astype(int)
network_EU

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,num_months,num_days,num_days_gaps,num_continuous_days,geometry,duplicated_suspect,watershed_group,gauges_upstream,gauge_downstream,nested_catchments
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,312,9497,0.0,9497,POINT (9.534835180274024 47.27374823144418),['CH000197'],1,16,AT000013,"['AT000001', 'CH000010', 'CH000046', 'CH000048..."
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,759,23103,0.0,23103,POINT (9.913676603393986 47.08030125096045),['CH000221'],1,1,CH000221,['AT000002']
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,444,13513,0.0,13513,POINT (9.847765104487962 47.13282061553542),['CH000215'],1,2,CH000215,"['AT000002', 'AT000003', 'CH000221']"
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,288,8765,0.0,8765,POINT (10.06184292326845 47.1289938468501),['CH000227'],1,1,CH000227,['AT000004']
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,72.2,...,360,10957,0.0,10957,POINT (9.802668269683238 47.15077039253907),['CH000214'],1,3,CH000214,"['AT000005', 'CH000214']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,321.0,...,120,3652,0.0,3652,POINT (33.9 44.683333),,1988,1,,['UAGR0017']
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,49.7,...,120,3652,0.0,3652,POINT (34.166667 44.5),,1989,1,,['UAGR0018']
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,261.0,...,120,3652,0.0,3652,POINT (34.2 44.883333),,1990,1,,['UAGR0019']
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,760.0,...,120,3652,0.0,3652,POINT (35.705833 47.251389),,1991,1,,['UAGR0020']


In [29]:
network_EU[network_EU.watershed_group== 1]

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,num_months,num_days,num_days_gaps,num_continuous_days,geometry,duplicated_suspect,watershed_group,gauges_upstream,gauge_downstream,nested_catchments
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,312,9497,0.0,9497,POINT (9.534835180274024 47.27374823144418),['CH000197'],1,16,AT000013,"['AT000001', 'CH000010', 'CH000046', 'CH000048..."
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,759,23103,0.0,23103,POINT (9.913676603393986 47.08030125096045),['CH000221'],1,1,CH000221,['AT000002']
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,444,13513,0.0,13513,POINT (9.847765104487962 47.13282061553542),['CH000215'],1,2,CH000215,"['AT000002', 'AT000003', 'CH000221']"
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,288,8765,0.0,8765,POINT (10.06184292326845 47.1289938468501),['CH000227'],1,1,CH000227,['AT000004']
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,72.2,...,360,10957,0.0,10957,POINT (9.802668269683238 47.15077039253907),['CH000214'],1,3,CH000214,"['AT000005', 'CH000214']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LU000016,3,Pfaffenthal,LU,LU_CONTACTFORM,Alzette,6.132266,49.620647,6.132266,49.620647,360.5,...,235,7152,0.0,7152,POINT (6.132266 49.620647),,1,5,LU000014,"['FR003289', 'FR003290', 'FR003291', 'LU000009..."
LU000017,16,Rosport,LU,LU_CONTACTFORM,Sure,6.509851,49.785883,6.509851,49.785883,4231.8,...,235,7152,0.0,7152,POINT (6.509851 49.785883),,1,35,DEBU1959,"['BEWA0066', 'BEWA0067', 'BEWA0087', 'BEWA0106..."
LU000018,5,Schoenfels,LU,LU_CONTACTFORM,Mamer,6.100795,49.723112,6.100795,49.723112,83.6,...,235,7152,0.0,7152,POINT (6.100795 49.723112),,1,1,LU000007,['LU000018']
LU000019,12,Vianden,LU,LU_CONTACTFORM,Our,6.204738,49.939224,6.204738,49.939224,641.3,...,235,7152,0.0,7152,POINT (6.204738 49.939224),,1,9,LU000017,"['BEWA0066', 'BEWA0067', 'BEWA0106', 'BEWA0107..."


In [31]:
network_EU['nested_catchments'] = catchments_nested_df['nested_catchments']
network_EU

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,num_months,num_days,num_days_gaps,num_continuous_days,geometry,duplicated_suspect,watershed_group,gauges_upstream,gauge_downstream,nested_catchments
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,312,9497,0.0,9497,POINT (9.534835180274024 47.27374823144418),['CH000197'],1,16,AT000013,"[AT000001, CH000010, CH000046, CH000048, CH000..."
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,759,23103,0.0,23103,POINT (9.913676603393986 47.08030125096045),['CH000221'],1,1,CH000221,[AT000002]
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,444,13513,0.0,13513,POINT (9.847765104487962 47.13282061553542),['CH000215'],1,2,CH000215,"[AT000002, AT000003, CH000221]"
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,288,8765,0.0,8765,POINT (10.06184292326845 47.1289938468501),['CH000227'],1,1,CH000227,[AT000004]
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,72.2,...,360,10957,0.0,10957,POINT (9.802668269683238 47.15077039253907),['CH000214'],1,3,CH000214,"[AT000005, CH000214]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,321.0,...,120,3652,0.0,3652,POINT (33.9 44.683333),,1988,1,,[UAGR0017]
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,49.7,...,120,3652,0.0,3652,POINT (34.166667 44.5),,1989,1,,[UAGR0018]
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,261.0,...,120,3652,0.0,3652,POINT (34.2 44.883333),,1990,1,,[UAGR0019]
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,760.0,...,120,3652,0.0,3652,POINT (35.705833 47.251389),,1991,1,,[UAGR0020]


## Save the data

In [32]:
# Save the dataframe:
network_EU.to_csv('results/extras/estreams_gauging_stations_nested.csv',  encoding='utf-8')

## End