# Streamflow FINAL adjusts shapefiles
Author: Thiago Nascimento (thiago.nascimento@eawag.ch)

This notebook is part of the EStreams publication and was used to filter the original collated streamflow time-series according to their original flags (when available from providers). The flags are translated as 2=no-flag from providers, 1=validated data, 0=suspect data, NaN=missing value. This approach follows the same framework employed by Chen et al. (2023)  


* Note that this code enables not only the replicability of the current database but also the extrapolation to new catchment areas. 
* Additionally, the user should download and insert the original raw-data in the folder of the same name prior to run this code. 
* The original third-party data used were not made available in this repository due to redistribution and storage-space reasons.  

## Requirements
**Python:**

* Python>=3.6
* Jupyter
* geopandas=0.10.2
* numpy
* os
* pandas=2.1.3
* scipy=1.9.0
* tqdm

Check the Github repository for an environment.yml (for conda environments) or requirements.txt (pip) file.

**Files:**

* data/streamflow/estreams_timeseries_discharge.csv
* data/streamflow/estreams_gauging_stations.csv
* data/shapefiles/estreams_catchments.shp

**Directory:**

* Clone the GitHub directory locally
* Place any third-data variables in their respective directory.
* ONLY update the "PATH" variable in the section "Configurations", with their relative path to the EStreams directory. 


## References
* Chen, X., Jiang, L., Luo, Y., and Liu, J.: A global streamflow indices time series dataset for large-sample hydrological analyses on streamflow regime (until 2022), Earth Syst. Sci. Data, 15, 4463–4479, https://doi.org/10.5194/essd-15-4463-2023, 2023.

# Import modules

In [1]:
import pandas as pd
import numpy as np
import tqdm as tqdm
import os
import warnings
import geopandas as gpd
from shapely.geometry import Point


# Configurations

In [2]:
# Only editable variable:
# Relative path to your local directory
PATH = ".."

* #### The users should NOT change anything in the code below here. 

In [3]:
# Non-editable variables:
PATH_OUTPUT = "results/staticattributes/"

# Set the directory:
os.chdir(PATH)

warnings.filterwarnings("ignore")

# Import data

## Streamflow gauges network

In [4]:
network_estreams = pd.read_csv('results/estreams_gauging_stations_v03.csv', encoding='utf-8')
network_estreams.set_index("basin_id", inplace = True)
network_estreams["end_date"] = pd.to_datetime(network_estreams["end_date"])
network_estreams["start_date"] = pd.to_datetime(network_estreams["start_date"])
network_estreams

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,elevation,...,num_continuous_days,num_days_gaps,num_days_reliable,num_days_noflag,num_days_suspect,gauge_flag,duplicated_suspect,watershed_group,gauges_upstream,nested_catchments
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,420,...,9497,0.0,0.0,9497.0,0.0,B,['CH000197'],1,16,"['AT000001', 'CH000010', 'CH000046', 'CH000048..."
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,673,...,23103,0.0,0.0,23103.0,0.0,B,['CH000221'],1,1,['AT000002']
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,579,...,13513,0.0,0.0,13513.0,0.0,B,['CH000215'],1,2,"['AT000002', 'AT000003', 'CH000221']"
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,1014,...,8765,0.0,0.0,8765.0,0.0,B,['CH000227'],1,1,['AT000004']
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,564,...,10957,0.0,0.0,10957.0,0.0,B,['CH000214'],1,3,"['AT000005', 'CH000214']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,,...,3652,0.0,0.0,3652.0,0.0,B,,1988,1,['UAGR0017']
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,16,...,3652,0.0,0.0,3652.0,0.0,B,,1989,1,['UAGR0018']
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,307,...,3652,0.0,0.0,3652.0,0.0,B,,1990,1,['UAGR0019']
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,,...,3652,0.0,0.0,3652.0,0.0,B,,1991,1,['UAGR0020']


In [5]:
# Rename the columns realted to area:
network_estreams.rename(columns={'area': 'area_official', 'area_calc': 'area_estreams',
                                 'area_perc': 'area_rel'}, inplace=True)

network_estreams

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,elevation,...,num_continuous_days,num_days_gaps,num_days_reliable,num_days_noflag,num_days_suspect,gauge_flag,duplicated_suspect,watershed_group,gauges_upstream,nested_catchments
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,420,...,9497,0.0,0.0,9497.0,0.0,B,['CH000197'],1,16,"['AT000001', 'CH000010', 'CH000046', 'CH000048..."
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,673,...,23103,0.0,0.0,23103.0,0.0,B,['CH000221'],1,1,['AT000002']
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,579,...,13513,0.0,0.0,13513.0,0.0,B,['CH000215'],1,2,"['AT000002', 'AT000003', 'CH000221']"
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,1014,...,8765,0.0,0.0,8765.0,0.0,B,['CH000227'],1,1,['AT000004']
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,564,...,10957,0.0,0.0,10957.0,0.0,B,['CH000214'],1,3,"['AT000005', 'CH000214']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,,...,3652,0.0,0.0,3652.0,0.0,B,,1988,1,['UAGR0017']
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,16,...,3652,0.0,0.0,3652.0,0.0,B,,1989,1,['UAGR0018']
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,307,...,3652,0.0,0.0,3652.0,0.0,B,,1990,1,['UAGR0019']
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,,...,3652,0.0,0.0,3652.0,0.0,B,,1991,1,['UAGR0020']


In [16]:
network_estreams[["area_official", "area_estreams", "area_rel"]].round(4)
network_estreams[["area_official", "area_estreams", "area_rel"]]

Unnamed: 0_level_0,area_official,area_estreams,area_rel
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AT000001,4647.9,4668.379,-0.440608
AT000002,102.0,102.287,-0.281373
AT000003,535.2,536.299,-0.205344
AT000004,66.6,66.286,0.471471
AT000005,72.2,72.448,-0.343490
...,...,...,...
UAGR0017,321.0,325.370,-1.361371
UAGR0018,49.7,47.594,4.237425
UAGR0019,261.0,244.731,6.233333
UAGR0020,760.0,731.073,3.806184


In [17]:
network_estreams.columns

Index(['gauge_id', 'gauge_name', 'gauge_country', 'gauge_provider', 'river',
       'lon_snap', 'lat_snap', 'lon', 'lat', 'elevation', 'area_official',
       'area_estreams', 'area_flag', 'area_rel', 'start_date', 'end_date',
       'num_years', 'num_months', 'num_days', 'num_continuous_days',
       'num_days_gaps', 'num_days_reliable', 'num_days_noflag',
       'num_days_suspect', 'gauge_flag', 'duplicated_suspect',
       'watershed_group', 'gauges_upstream', 'nested_catchments'],
      dtype='object')

## Catchment boundaries

In [18]:
catchment_boundaries = gpd.read_file('results/estreams_catchments.shp')
catchment_boundaries.set_index("basin_id", inplace=True)
catchment_boundaries

Unnamed: 0_level_0,gauge_id,country,area_offic,area_estre,area_flag,area_rel,start_date,end_date,gauge_flag,upstream,group,geometry
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2021-12-31,B,16,1,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ..."
AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2021-12-31,B,1,1,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349..."
AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2021-12-31,B,2,1,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122..."
AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2021-12-31,B,1,1,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404..."
AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,B,3,1,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,UA,321.0,325.370,0,-1.361371,1978-01-01,1987-12-31,B,1,1988,"POLYGON Z ((33.96791 44.63291 0.00000, 33.9679..."
UAGR0018,6682500,UA,49.7,47.594,0,4.237425,1978-01-01,1987-12-31,B,1,1989,"POLYGON Z ((34.19958 44.58291 0.00000, 34.2029..."
UAGR0019,6683010,UA,261.0,244.731,1,6.233333,1978-01-01,1987-12-31,B,1,1990,"POLYGON Z ((34.19624 44.88375 0.00000, 34.1962..."
UAGR0020,6683200,UA,760.0,731.073,0,3.806184,1978-01-01,1987-12-31,B,1,1991,"POLYGON Z ((35.78708 47.28708 0.00000, 35.7870..."


## Final adjusts shapefiles

### Catchment boundaries

In [19]:
catchment_boundaries["upstream"] = network_estreams["gauges_upstream"]
catchment_boundaries["group"] = network_estreams["watershed_group"]
catchment_boundaries["gauge_flag"] = network_estreams["gauge_flag"]
catchment_boundaries["country"] = network_estreams["gauge_country"]
catchment_boundaries

Unnamed: 0_level_0,gauge_id,country,area_offic,area_estre,area_flag,area_rel,start_date,end_date,gauge_flag,upstream,group,geometry
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2021-12-31,B,16,1,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ..."
AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2021-12-31,B,1,1,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349..."
AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2021-12-31,B,2,1,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122..."
AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2021-12-31,B,1,1,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404..."
AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,B,3,1,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,UA,321.0,325.370,0,-1.361371,1978-01-01,1987-12-31,B,1,1988,"POLYGON Z ((33.96791 44.63291 0.00000, 33.9679..."
UAGR0018,6682500,UA,49.7,47.594,0,4.237425,1978-01-01,1987-12-31,B,1,1989,"POLYGON Z ((34.19958 44.58291 0.00000, 34.2029..."
UAGR0019,6683010,UA,261.0,244.731,1,6.233333,1978-01-01,1987-12-31,B,1,1990,"POLYGON Z ((34.19624 44.88375 0.00000, 34.1962..."
UAGR0020,6683200,UA,760.0,731.073,0,3.806184,1978-01-01,1987-12-31,B,1,1991,"POLYGON Z ((35.78708 47.28708 0.00000, 35.7870..."


In [20]:
# Adjust the fields in the catchment boundaries layer
catchment_boundaries = catchment_boundaries[['gauge_id', 'country', 'area', 'area_calc',
       'area_flag', 'area_perc', 'start_date', 'end_date', 'gauge_flag', 'upstream', 'group', 'geometry']]

catchment_boundaries.start_date = catchment_boundaries.start_date.astype(str)
catchment_boundaries.end_date = catchment_boundaries.end_date.astype(str)

catchment_boundaries

KeyError: "['area', 'area_calc', 'area_perc'] not in index"

In [None]:
# Rename the columns related to area:
catchment_boundaries.rename(columns={'area': 'area_official', 'area_calc': 'area_estreams',
                                 'area_perc': 'area_rel'}, inplace=True)
catchment_boundaries

Unnamed: 0_level_0,gauge_id,country,area_official,area_estreams,area_flag,area_rel,start_date,end_date,gauge_flag,upstream,group,geometry
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2021-12-31,B,16,1,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ..."
AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2021-12-31,B,1,1,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349..."
AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2021-12-31,B,2,1,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122..."
AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2021-12-31,B,1,1,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404..."
AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,B,3,1,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,UA,321.0,325.370,0,-1.361371,1978-01-01,1987-12-31,B,1,1988,"POLYGON Z ((33.96791 44.63291 0.00000, 33.9679..."
UAGR0018,6682500,UA,49.7,47.594,0,4.237425,1978-01-01,1987-12-31,B,1,1989,"POLYGON Z ((34.19958 44.58291 0.00000, 34.2029..."
UAGR0019,6683010,UA,261.0,244.731,1,6.233333,1978-01-01,1987-12-31,B,1,1990,"POLYGON Z ((34.19624 44.88375 0.00000, 34.1962..."
UAGR0020,6683200,UA,760.0,731.073,0,3.806184,1978-01-01,1987-12-31,B,1,1991,"POLYGON Z ((35.78708 47.28708 0.00000, 35.7870..."


### Gauges network shapefile

In [23]:
# Create geometry column
geometry = [Point(xy) for xy in zip(network_estreams['lon_snap'], network_estreams['lat_snap'])]
network_estreams_gdf = gpd.GeoDataFrame(network_estreams, geometry=geometry)

network_estreams_gdf.start_date = network_estreams_gdf.start_date.astype(str)
network_estreams_gdf.end_date = network_estreams_gdf.end_date.astype(str)

network_estreams_gdf.drop("nested_catchments", axis=1, inplace=True)

network_estreams_gdf

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,elevation,...,num_continuous_days,num_days_gaps,num_days_reliable,num_days_noflag,num_days_suspect,gauge_flag,duplicated_suspect,watershed_group,gauges_upstream,geometry
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,420,...,9497,0.0,0.0,9497.0,0.0,B,['CH000197'],1,16,POINT (9.53484 47.27375)
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,673,...,23103,0.0,0.0,23103.0,0.0,B,['CH000221'],1,1,POINT (9.91368 47.08030)
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,579,...,13513,0.0,0.0,13513.0,0.0,B,['CH000215'],1,2,POINT (9.84777 47.13282)
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,1014,...,8765,0.0,0.0,8765.0,0.0,B,['CH000227'],1,1,POINT (10.06184 47.12899)
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,564,...,10957,0.0,0.0,10957.0,0.0,B,['CH000214'],1,3,POINT (9.80267 47.15077)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,,...,3652,0.0,0.0,3652.0,0.0,B,,1988,1,POINT (33.89474 44.69188)
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,16,...,3652,0.0,0.0,3652.0,0.0,B,,1989,1,POINT (34.16667 44.50000)
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,307,...,3652,0.0,0.0,3652.0,0.0,B,,1990,1,POINT (34.19984 44.88769)
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,,...,3652,0.0,0.0,3652.0,0.0,B,,1991,1,POINT (35.70583 47.25139)


## Export the data

In [24]:
# Export to csv (network)
network_estreams.to_csv('results/estreams_gauging_stations_v04.csv',  encoding='utf-8')

In [25]:
# Export to shapefile (network)
network_estreams_gdf.to_file('results/estreams_gauging_stations.shp')

In [26]:
# Export to shapefile (shapefile)
catchment_boundaries.to_file('results/estreams_catchments.shp')

# End