# Final adjustment of the CSV files and shapefiles

Author: Thiago Nascimento (thiago.nascimento@eawag.ch)

This notebook is part of the EStreams publication and was used to concatenate the final information for the gauging-station shapefiles and CSV files and, where necessary, to apply final edits to them.

* Note that this code enables not only the replicability of the current database but also the extrapolation to new catchment areas. 
* Additionally, the user should download the original raw data and place it in the folder of the same name prior to running this code.
* The original third-party data used were not made available in this repository due to redistribution and storage-space reasons.

## Requirements
**Python:**

* Python>=3.6
* Jupyter
* geopandas=0.10.2
* numpy
* os
* pandas
* shapely
* tqdm
* warnings

Check the Github repository for an environment.yml (for conda environments) or requirements.txt (pip) file.

**Files:**

* results/estreams_gauging_stations.csv
* results/estreams_gauging_stations_duplicates.csv
* results/estreams_gauging_stations_nested.csv
* results/estreams_catchments.shp

**Directory:**

* Clone the GitHub directory locally
* Place any third-party data variables in their respective directories.
* ONLY update the "PATH" variable in the section "Configurations", setting it to the relative path to your local EStreams directory.

# Import modules

In [16]:
import pandas as pd
import numpy as np
import datetime
import tqdm as tqdm
import os
import geopandas as gpd
from shapely.geometry import Polygon, Point

# Configurations

In [2]:
# Only editable variable:
# Relative path to your local EStreams directory. It is passed to os.chdir()
# in the next cell, so it is resolved against the current working directory.
PATH = ".."

* #### Users should NOT change anything in the code below this point.

In [3]:
# Non-editable variables:
# Name of the output folder.
# NOTE(review): not referenced by the cells visible in this notebook, which
# hard-code 'results/...' paths instead.
PATH_OUTPUT = "results/"

# Set the directory:
# Every 'results/...' path used below is relative, so it resolves against the
# directory set here. NOTE: with a relative PATH (the default is ".."),
# re-running this cell moves the working directory again.
os.chdir(PATH)

# Import data
## Catchment boundaries

In [7]:
# Load the catchment polygons and key the table by basin identifier.
catchment_boundaries = gpd.read_file('results/estreams_catchments.shp').set_index("basin_id")
catchment_boundaries

Unnamed: 0_level_0,gauge_id,gauge_coun,area,area_calc,area_flag,area_perc,start_date,end_date,gauges_ups,watershed_,geometry
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2019-12-31,13,1,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ..."
AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2019-12-31,0,1,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349..."
AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2019-12-31,1,1,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122..."
AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2019-12-31,0,1,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404..."
AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,0,1,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ..."
...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,UA,321.0,325.370,0,-1.361371,1978-01-01,1987-12-31,0,1916,"POLYGON Z ((33.96791 44.63291 0.00000, 33.9679..."
UAGR0018,6682500,UA,49.7,47.594,0,4.237425,1978-01-01,1987-12-31,0,1917,"POLYGON Z ((34.19958 44.58291 0.00000, 34.2029..."
UAGR0019,6683010,UA,261.0,244.731,1,6.233333,1978-01-01,1987-12-31,0,1918,"POLYGON Z ((34.19624 44.88375 0.00000, 34.1962..."
UAGR0020,6683200,UA,760.0,731.073,0,3.806184,1978-01-01,1987-12-31,0,1919,"POLYGON Z ((35.78708 47.28708 0.00000, 35.7870..."


## Gauging stations information
* Full network

In [12]:
# Full gauging-station table, keyed by basin identifier.
network_estreams = (
    pd.read_csv('results/estreams_gauging_stations.csv', encoding='utf-8')
    .set_index("basin_id")
)
network_estreams.head()

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,start_date,end_date,num_years,num_months,num_days,num_days_gaps,num_continuous_days,duplicated_suspect,watershed_group,gauges_upstream
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,1996-01-01,2019-12-31,24,288,8766,0.0,8766,CH000197,1,13
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,1958-10-01,2019-12-31,62,735,22372,0.0,22372,CH000221,1,0
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,1985-01-02,2019-12-31,35,420,12782,0.0,12782,CH000215,1,1
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,1998-01-02,2019-12-31,22,264,8034,0.0,8034,CH000227,1,0
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.15077,9.802668,47.15077,72.2,...,1990-01-01,2019-12-31,30,360,10957,0.0,10957,CH000214,1,0


* Duplicated gauges

In [11]:
# Station table carrying the duplicate-suspect information, keyed by basin identifier.
network_duplicated = (
    pd.read_csv('results/estreams_gauging_stations_duplicates.csv', encoding='utf-8')
    .set_index("basin_id")
)
network_duplicated.head()

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,start_date,end_date,num_years,num_months,num_days,num_days_gaps,num_continuous_days,duplicated_suspect,watershed_main,gauges_upstream
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,1996-01-01 00:00:00,2019-12-31 00:00:00,24,288,8766,0.0,8766,CH000197,1,13
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,1958-10-01 00:00:00,2019-12-31 00:00:00,62,735,22372,0.0,22372,CH000221,1,0
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,1985-01-02 00:00:00,2019-12-31 00:00:00,35,420,12782,0.0,12782,CH000215,1,1
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,1998-01-02 00:00:00,2019-12-31 00:00:00,22,264,8034,0.0,8034,CH000227,1,0
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.15077,9.802668,47.15077,72.2,...,1990-01-01 00:00:00,2019-12-31 00:00:00,30,360,10957,0.0,10957,CH000214,1,0


* Gauges upstream and nested

In [13]:
# Station table carrying the upstream/nested-gauge information, keyed by basin identifier.
network_nested = (
    pd.read_csv('results/estreams_gauging_stations_nested.csv', encoding='utf-8')
    .set_index("basin_id")
)
network_nested.head()

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,start_date,end_date,num_years,num_months,num_days,num_days_gaps,num_continuous_days,duplicated_suspect,watershed_main,gauges_upstream
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,1996-01-01 00:00:00,2019-12-31 00:00:00,24,288,8766,0.0,8766,CH000197,1,13
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,1958-10-01 00:00:00,2019-12-31 00:00:00,62,735,22372,0.0,22372,CH000221,1,0
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,1985-01-02 00:00:00,2019-12-31 00:00:00,35,420,12782,0.0,12782,CH000215,1,1
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,1998-01-02 00:00:00,2019-12-31 00:00:00,22,264,8034,0.0,8034,CH000227,1,0
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.15077,9.802668,47.15077,72.2,...,1990-01-01 00:00:00,2019-12-31 00:00:00,30,360,10957,0.0,10957,CH000214,1,0


## Assign the values and adjusts
* network estreams (CSV)

In [14]:
# Copy the final attributes into the full station table.
# All three tables are indexed by basin_id, so pandas matches rows on that
# index (not by position); a basin_id missing from the source table yields NaN.

# Duplicated information
network_estreams["duplicated_suspect"] = network_duplicated["duplicated_suspect"]

# Watershed group
# (the source column is called "watershed_main" in the nested table)
network_estreams["watershed_group"] = network_nested["watershed_main"] 

# Gauges upstream
network_estreams["gauges_upstream"] = network_nested["gauges_upstream"] 

# Display the updated table.
network_estreams

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,start_date,end_date,num_years,num_months,num_days,num_days_gaps,num_continuous_days,duplicated_suspect,watershed_group,gauges_upstream
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,1996-01-01,2019-12-31,24,288,8766,0.0,8766,CH000197,1,13
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,1958-10-01,2019-12-31,62,735,22372,0.0,22372,CH000221,1,0
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,1985-01-02,2019-12-31,35,420,12782,0.0,12782,CH000215,1,1
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,1998-01-02,2019-12-31,22,264,8034,0.0,8034,CH000227,1,0
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,72.2,...,1990-01-01,2019-12-31,30,360,10957,0.0,10957,CH000214,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,321.0,...,1978-01-01,1987-12-31,10,120,3652,0.0,3652,,1916,0
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,49.7,...,1978-01-01,1987-12-31,10,120,3652,0.0,3652,,1917,0
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,261.0,...,1978-01-01,1987-12-31,10,120,3652,0.0,3652,,1918,0
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,760.0,...,1978-01-01,1987-12-31,10,120,3652,0.0,3652,,1919,0


* network estreams (SHP)

In [17]:
# Build the point geometries from the snapped gauge coordinates in a single
# vectorized call (x = longitude, y = latitude) instead of a Python-level loop.
geometry = gpd.points_from_xy(network_estreams['lon_snap'], network_estreams['lat_snap'])

# Create the GeoDataFrame from a copy of the attribute table so that the plain
# table (written to CSV later) stays independent of the geometry-bearing frame.
# The geometry array is attached positionally, in the same row order.
network_estreams_gdf = gpd.GeoDataFrame(network_estreams.copy(), geometry=geometry, crs="EPSG:4326")  # CRS for WGS84 (latitude/longitude)

network_estreams_gdf

Unnamed: 0_level_0,gauge_id,gauge_name,gauge_country,gauge_provider,river,lon_snap,lat_snap,lon,lat,area,...,end_date,num_years,num_months,num_days,num_days_gaps,num_continuous_days,duplicated_suspect,watershed_group,gauges_upstream,geometry
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AT000001,200014,Bangs,AT,AT_EHYD,Rhein,9.534835,47.273748,9.534835,47.273748,4647.9,...,2019-12-31,24,288,8766,0.0,8766,CH000197,1,13,POINT (9.53484 47.27375)
AT000002,200048,Schruns (Vonbunweg),AT,AT_EHYD,Litz,9.913677,47.080301,9.913677,47.080301,102.0,...,2019-12-31,62,735,22372,0.0,22372,CH000221,1,0,POINT (9.91368 47.08030)
AT000003,231662,Loruens-Aeule,AT,AT_EHYD,Ill,9.847765,47.132821,9.847765,47.132821,535.2,...,2019-12-31,35,420,12782,0.0,12782,CH000215,1,1,POINT (9.84777 47.13282)
AT000004,200592,Kloesterle (OEBB),AT,AT_EHYD,Alfenz,10.061843,47.128994,10.061843,47.128994,66.6,...,2019-12-31,22,264,8034,0.0,8034,CH000227,1,0,POINT (10.06184 47.12899)
AT000005,200097,Buers (Bruecke L82),AT,AT_EHYD,Alvier,9.802668,47.150770,9.802668,47.150770,72.2,...,2019-12-31,30,360,10957,0.0,10957,CH000214,1,0,POINT (9.80267 47.15077)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,BASHTANOVKA,UA,UA_GRDC,KACHA,33.894739,44.691884,33.900000,44.683333,321.0,...,1987-12-31,10,120,3652,0.0,3652,,1916,0,POINT (33.89474 44.69188)
UAGR0018,6682500,YALTA,UA,UA_GRDC,DERE-KIOY,34.166667,44.500000,34.166667,44.500000,49.7,...,1987-12-31,10,120,3652,0.0,3652,,1917,0,POINT (34.16667 44.50000)
UAGR0019,6683010,PIONERSKOE,UA,UA_GRDC,SALHYR,34.199841,44.887685,34.200000,44.883333,261.0,...,1987-12-31,10,120,3652,0.0,3652,,1918,0,POINT (34.19984 44.88769)
UAGR0020,6683200,TOKMAK,UA,UA_GRDC,TOKMAK,35.705833,47.251389,35.705833,47.251389,760.0,...,1987-12-31,10,120,3652,0.0,3652,,1919,0,POINT (35.70583 47.25139)


* catchment boundaries

In [18]:
# Bring the upstream-gauge count and watershed group over from the station
# table (rows are matched on the shared basin_id index).
catchment_boundaries["gauges_upstream"] = network_estreams['gauges_upstream'].astype(int)
catchment_boundaries['watershed_group'] = network_estreams['watershed_group'].astype(int)

# Keep only the final columns, in their final order, and restore the full
# name of the country column.
catchment_boundaries = catchment_boundaries[
    ['gauge_id', 'gauge_coun', 'area', 'area_calc', 'area_flag', 'area_perc',
     'start_date', 'end_date', 'gauges_upstream', 'watershed_group', 'geometry']
].rename(columns={'gauge_coun': 'gauge_country'})

catchment_boundaries

Unnamed: 0_level_0,gauge_id,gauge_country,area,area_calc,area_flag,area_perc,start_date,end_date,gauges_upstream,watershed_group,geometry
basin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AT000001,200014,AT,4647.9,4668.379,0,-0.440608,1996-01-01,2019-12-31,13,1,"POLYGON Z ((9.69406 46.54322 0.00000, 9.69570 ..."
AT000002,200048,AT,102.0,102.287,0,-0.281373,1958-10-01,2019-12-31,0,1,"POLYGON Z ((10.13650 47.02949 0.00000, 10.1349..."
AT000003,231662,AT,535.2,536.299,0,-0.205344,1985-01-02,2019-12-31,1,1,"POLYGON Z ((10.11095 46.89437 0.00000, 10.1122..."
AT000004,200592,AT,66.6,66.286,0,0.471471,1998-01-02,2019-12-31,0,1,"POLYGON Z ((10.14189 47.09706 0.00000, 10.1404..."
AT000005,200097,AT,72.2,72.448,0,-0.343490,1990-01-01,2019-12-31,0,1,"POLYGON Z ((9.67851 47.06249 0.00000, 9.67888 ..."
...,...,...,...,...,...,...,...,...,...,...,...
UAGR0017,6682300,UA,321.0,325.370,0,-1.361371,1978-01-01,1987-12-31,0,1916,"POLYGON Z ((33.96791 44.63291 0.00000, 33.9679..."
UAGR0018,6682500,UA,49.7,47.594,0,4.237425,1978-01-01,1987-12-31,0,1917,"POLYGON Z ((34.19958 44.58291 0.00000, 34.2029..."
UAGR0019,6683010,UA,261.0,244.731,1,6.233333,1978-01-01,1987-12-31,0,1918,"POLYGON Z ((34.19624 44.88375 0.00000, 34.1962..."
UAGR0020,6683200,UA,760.0,731.073,0,3.806184,1978-01-01,1987-12-31,0,1919,"POLYGON Z ((35.78708 47.28708 0.00000, 35.7870..."


## Save the data

In [19]:
# network information
# NOTE: this is the same path the station table was read from earlier in the
# notebook, so the input CSV is overwritten in place.
network_estreams.to_csv('results/estreams_gauging_stations.csv', encoding='utf-8')

# shapefiles
# NOTE: the catchments file is likewise overwritten in place. The Shapefile
# format limits field names to 10 characters, so longer names are stored
# truncated (the catchments file read above already shows 'gauge_coun',
# 'gauges_ups' and 'watershed_').
catchment_boundaries.to_file('results/estreams_catchments.shp')
network_estreams_gdf.to_file('results/estreams_gauging_stations.shp')

  catchment_boundaries.to_file('results/estreams_catchments.shp')
  network_estreams_gdf.to_file('results/estreams_gauging_stations.shp')


## End