# Complementary extra codes: Concatenation of new catchments


Author: Thiago Nascimento (thiago.nascimento@eawag.ch)

This notebook complements the EStreams publication and can be used to concatenate catchment attributes derived from new basins into the original dataset. This might be useful for users who wish to extend their datasets to new areas, for example. 


* Note that this code enables not only the replicability of the current database but also the extrapolation to new catchment areas. 
* Additionally, the user should download the original raw data and place it in the folder of the same name prior to running this code. 
* The original third-party data used were not made available in this repository due to redistribution and storage-space reasons.  

## Requirements
**Python:**

* Python>=3.6
* Jupyter
* geopandas=0.10.2
* numpy
* os
* pandas
* tqdm
* warnings

Check the Github repository for an environment.yml (for conda environments) or requirements.txt (pip) file.

**Files:**

* data/update_old_basins/old_files/*.csv
* data/update_old_basins/new_files/*.csv

**Directory:**

* Clone the GitHub directory locally
* Place any third-party data in their respective directories.
* ONLY update the "PATH" variable in the section "Configurations" with the relative path to your local EStreams directory. 

## Observations

* This notebook assumes that all the "new" and "old" catchments have their attributes exported and stored correctly in their respective folders. 

# Import modules

In [None]:
import pandas as pd
import numpy as np
import tqdm as tqdm
import os
import warnings

# Configurations

In [None]:
# User-editable settings (this is the only cell that should be changed)
# PATH: relative path from this notebook to your local directory
PATH = "../../.."

# Silence every warning raised from here on:
warnings.filterwarnings(action="ignore")

In [None]:
# Fixed settings (do not edit):
# Output folder and the folders holding the old/new attribute files,
# all expressed relative to PATH.
PATH_OUTPUT = "results/"
path_old_files = "data/update_old_basins/old_files/"
path_new_files = "data/update_old_basins/new_files/"

# Make PATH the working directory so the relative paths above resolve.
# NOTE: PATH is a relative path, so running this cell a second time moves
# the working directory again, starting from wherever the first run left it.
os.chdir(PATH)

# Import data

## Static attributes
* Here we automate the update of the old datasets with the values from the new ones.
* Note that the update is done in pairs (old and new).

In [None]:
# Static datasets names:
datasets_static = [
    'estreams_hydrometeo_signatures.csv',
]

# Output filename options (one output path per static dataset).
# Built with os.path.join so the trailing "/" of PATH_OUTPUT does not
# produce a doubled separator ("results//staticattributes/...").
datasets_static_output_options = {
    'estreams_hydrometeo_signatures.csv': os.path.join(
        PATH_OUTPUT, "staticattributes", "estreams_hydrometeo_signatures.csv"
    ),
}

for chosen_dataset in tqdm.tqdm(datasets_static):

    # Get the old, new and output pathnames
    csv_file_old = path_old_files + chosen_dataset
    csv_file_new = path_new_files + chosen_dataset
    csv_output = datasets_static_output_options.get(chosen_dataset)

    # Without an output filename, to_csv(None) would return the CSV text
    # instead of writing a file, so nothing would be saved. Skip explicitly.
    if csv_output is None:
        print("No output filename is configured for the dataset", chosen_dataset)
        continue

    # If the file is not in the folder (or stored with a different name),
    # the run will skip this dataset. Only the reads are guarded, so any
    # other problem (e.g. a failing write) is raised instead of being
    # reported as a missing dataset.
    try:
        # Read the old and new datasets:
        estreams_old = pd.read_csv(csv_file_old, index_col=0)
        estreams_new = pd.read_csv(csv_file_new, index_col=0)
    except FileNotFoundError:
        print("The dataset", chosen_dataset, "is not avaialble in the folders.")
        continue

    # Make a copy to avoid overwriting original
    estreams_updated = estreams_old.copy()

    # Update only overlapping values: DataFrame.update aligns on the index
    # and columns of the old table and takes the non-NA values from the new
    # one; rows/columns that exist only in the new table are not added.
    estreams_updated.update(estreams_new)

    # Rows in old that have no match in new
    not_in_new = estreams_old.index.difference(estreams_new.index)

    # Rows in old that had a match but were unchanged
    common_indices = estreams_old.index.intersection(estreams_new.index)
    unchanged_common = [
        idx for idx in common_indices
        if estreams_old.loc[idx].equals(estreams_updated.loc[idx])
    ]

    # Combine both sets
    not_updated_indices = list(not_in_new) + unchanged_common

    # Index labels (not the full rows) of everything that was not updated
    not_updated_rows = estreams_old.loc[not_updated_indices].index.tolist()

    print("\nTotal rows in old that were NOT updated (no match or identical):", len(not_updated_rows))
    print(not_updated_rows)

    # Save the data (create the output folder first; to_csv does not create it):
    os.makedirs(os.path.dirname(csv_output), exist_ok=True)
    estreams_updated.to_csv(csv_output)

    # Prints useful to visualize what was done:
    print("Dataset:", chosen_dataset)
    print("Number of catchments in the old file:", len(estreams_old))
    print("Number of catchments in the new file:", len(estreams_new))

## Temporal attributes
* Here we automate the update of the old datasets with the values from the new ones.
* Note that the update is done in pairs (old and new).
* The meteorological time-series are not considered in this part. 

In [None]:
# Paths (old inputs, new inputs and where the updated files are written)
base_old = 'data/update_old_basins/old_files/streamflowindices'
base_new = 'data/update_old_basins/new_files/streamflowindices'
base_output = 'results/timeseries/streamflowindices'

# Subfolders to loop through
subfolders = ['monthly', 'seasonal', 'weekly', 'yearly']

for subfolder in subfolders:
    old_dir = os.path.join(base_old, subfolder)
    new_dir = os.path.join(base_new, subfolder)
    output_dir = os.path.join(base_output, subfolder)

    # Skip if the new subfolder doesn't exist
    if not os.path.isdir(new_dir):
        continue

    # Process each CSV in the new folder. os.listdir returns names in
    # arbitrary order, so sort them to keep the printed log reproducible.
    for file in sorted(os.listdir(new_dir)):
        if not file.endswith('.csv'):
            continue

        new_file_path = os.path.join(new_dir, file)
        old_file_path = os.path.join(old_dir, file)
        output_file_path = os.path.join(output_dir, file)

        # Only update if the file exists in old folder
        if not os.path.exists(old_file_path):
            print(f"Skipped: {subfolder}/{file}")
            continue

        # Load both DataFrames
        df_old = pd.read_csv(old_file_path, index_col=0, parse_dates=True)
        df_new = pd.read_csv(new_file_path, index_col=0, parse_dates=True)

        # Update the old DataFrame in place with the non-NA values of the
        # new one (aligned on index and columns)
        df_old.update(df_new)

        # Save the updated DataFrame to the output folder (the old input
        # file is left untouched). to_csv does not create missing folders,
        # so make sure the destination exists first.
        os.makedirs(output_dir, exist_ok=True)
        df_old.to_csv(output_file_path)

        print(f"Updated: {subfolder}/{file}")

## Small check

In [None]:
# Load the old, new and updated versions of the monthly mean streamflow
# table, each with its first column as a date-parsed index.
df_old, df_new, df_updated = (
    pd.read_csv(csv_path, index_col=0, parse_dates=True)
    for csv_path in (
        'data/update_old_basins/old_files/streamflowindices/monthly/monthly_streamflow_mean.csv',
        'data/update_old_basins/new_files/streamflowindices/monthly/monthly_streamflow_mean.csv',
        'results/timeseries/streamflowindices/monthly/monthly_streamflow_mean.csv',
    )
)

In [None]:
# Overlay the updated, new (semi-transparent) and old series of column "DENW1234"
df_updated["DENW1234"].plot()
df_new["DENW1234"].plot(alpha=0.5)
df_old["DENW1234"].plot()

In [None]:
# Overlay the updated and old (semi-transparent) series of column "AT000001"
df_updated["AT000001"].plot()
df_old["AT000001"].plot(alpha=0.5)

# End