# Complementary extra codes: Concatenation of new catchments


Author: Thiago Nascimento (thiago.nascimento@eawag.ch)

This notebook complements the EStreams publication and can be used to concatenate catchment attributes derived from new basins into the original dataset. This might be useful for users that wish to extend their datasets to new areas, for example. 


* Note that this code enables not only the replicability of the current database but also the extrapolation to new catchment areas. 
* Additionally, the user should download the original raw data and place it in the folder of the same name before running this code. 
* The original third-party data used were not made available in this repository due to redistribution and storage-space reasons.  

## Requirements
**Python:**

* Python>=3.6
* Jupyter
* geopandas=0.10.2
* numpy
* os
* pandas
* tqdm
* warnings

Check the Github repository for an environment.yml (for conda environments) or requirements.txt (pip) file.

**Files:**

* data/concatenation_new_basins/old_files/*.csv
* data/concatenation_new_basins/new_files/*.csv

**Directory:**

* Clone the GitHub directory locally
* Place any third-party data in their respective directories.
* ONLY update the "PATH" variable in the section "Configurations" with the relative path to the EStreams directory. 

## Observations

* This notebook assumes that all the "new" and "old" catchments have their attributes exported and stored correctly in their respective folders. 

# Import modules

In [None]:
import pandas as pd
import numpy as np
import tqdm as tqdm
import os
import warnings

# Configurations

In [None]:
# User-editable setting (the only one in this notebook):
# relative path from this notebook to your local EStreams directory.
PATH = "../../.."

# Silence every warning raised from here on.
warnings.filterwarnings(action="ignore")

In [None]:
# Fixed settings (not meant to be edited).
# Folders holding the "new" and the "old" attribute files:
path_new_files = 'data/concatenation_new_basins/new_files/'
path_old_files = 'data/concatenation_new_basins/old_files/'
# Folder that receives the concatenated files:
PATH_OUTPUT = "results/"

# Switch the working directory to PATH, so the relative folders above are
# resolved from there.
# NOTE: PATH is a relative path, so running this cell a second time changes
# the directory again, starting from the already-changed location.
os.chdir(PATH)

# Import data

## Static attributes
* Here we automate the concatenation of the new datasets into the old ones.
* Note that the concatenation is done in pairs (old and new).

In [None]:
# Static datasets names:
datasets_static = [
    'estreams_lithology_attributes.csv',
    'estreams_hydrology_attributes.csv',
    'estreams_landcover_attributes.csv',
    'estreams_meteorology_density.csv',
    'estreams_snowcover_attributes.csv',
    'estreams_soil_attributes.csv',
    'estreams_terrain_attributes.csv',
    'estreams_vegetation_attributes.csv',
]

# Output filename options: every static dataset keeps its file name and is
# written to the "staticattributes" sub-folder of PATH_OUTPUT.
datasets_static_output_options = {
    dataset: PATH_OUTPUT+"/staticattributes/"+dataset
    for dataset in datasets_static
}

# Lithology columns, in the order they are written to the output file.
# The first 16 are the per-class fraction columns; they are kept in a separate
# list because only these are created when absent and have their NaN replaced
# by 0.
lithology_fraction_columns = [
    "lit_fra_ev", "lit_fra_ig", "lit_fra_mt", "lit_fra_nd",
    "lit_fra_pa", "lit_fra_pb", "lit_fra_pi", "lit_fra_py",
    "lit_fra_sc", "lit_fra_sm", "lit_fra_ss", "lit_fra_su",
    "lit_fra_va", "lit_fra_vb", "lit_fra_vi", "lit_fra_wb",
]
lithology_columns = lithology_fraction_columns + ["lit_dom", "tot_area", "bedrk_dep"]

for chosen_dataset in tqdm.tqdm(datasets_static):

    # Get the old, new and output pathnames
    csv_file_old = path_old_files+chosen_dataset
    csv_file_new = path_new_files+chosen_dataset
    csv_output = datasets_static_output_options[chosen_dataset]

    # Read the old and new datasets. If a file is not in its folder (or is
    # stored with a different name), the run skips this dataset. Only the
    # reads are guarded here, so this message is never printed for an
    # unrelated failure.
    try:
        estreams_old = pd.read_csv(csv_file_old, index_col=0)
        estreams_new = pd.read_csv(csv_file_new, index_col=0)
    except FileNotFoundError:
        print("The dataset", chosen_dataset, "is not avaialble in the folders.")
        continue

    try:
        # Concatenate (one row per catchment) and sort it by index
        estreams_concatenated = pd.concat([estreams_old, estreams_new], axis=0)
        estreams_concatenated = estreams_concatenated.sort_index(axis=0)

        # Drop the rows whose index label contains "ISGR" or "RS00"
        estreams_concatenated = estreams_concatenated.filter(regex='^(?!.*(ISGR|RS00)).*$', axis=0)

        # Lithology adjustment: the original export only stored the classes
        # that were present, so a fraction column can be absent from both
        # files. Create it (as 0) instead of failing on the column selection.
        if chosen_dataset == "estreams_lithology_attributes.csv":
            for fraction_column in lithology_fraction_columns:
                if fraction_column not in estreams_concatenated.columns:
                    estreams_concatenated[fraction_column] = 0.0
            estreams_concatenated = estreams_concatenated[lithology_columns].copy()
            # Fill NaN values with 0 in the 16 fraction columns
            estreams_concatenated[lithology_fraction_columns] = (
                estreams_concatenated[lithology_fraction_columns].fillna(0)
            )

        # Hydrology adjustment: fill NaN values with 0 in the last two columns
        if chosen_dataset == "estreams_hydrology_attributes.csv":
            estreams_concatenated.iloc[:, -2:] = estreams_concatenated.iloc[:, -2:].fillna(0)

        # Save the data (the output folder is created when it is missing):
        os.makedirs(os.path.dirname(csv_output), exist_ok=True)
        estreams_concatenated.to_csv(csv_output)

    except Exception as error:
        # Keep the best-effort behaviour (the loop moves on to the next
        # dataset), but report the real cause instead of a misleading
        # "not available" message.
        print("The dataset", chosen_dataset, "could not be processed:", repr(error))
        continue

    # Prints useful to visualize what was done:
    print("Dataset:", chosen_dataset)
    print("Number of catchments in the old file:", len(estreams_old))
    print("Number of catchments in the new file:", len(estreams_new))
    

## Temporal attributes
* Here we automate the concatenation of the new datasets into the old ones.
* Note that the concatenation is done in pairs (old and new).
* The meteorological time-series are not considered in this part. 

In [None]:
# Temporal datasets names:
datasets_temporal = [
    'estreams_irrigation_yearly.csv',
    'estreams_LAI_monhtly.csv',
    'estreams_LAI_yearly.csv',
    'estreams_NDVI_monhtly.csv',
    'estreams_NDVI_yearly.csv',
    'estreams_snowcover_monhtly.csv',
    'estreams_snowcover_yearly.csv']

# Output filename options:
datasets_temporal_output_options = {
    'estreams_irrigation_yearly.csv': PATH_OUTPUT+"/timeseries/irrigation/estreams_irrigation_yearly.csv",
    'estreams_LAI_monhtly.csv': PATH_OUTPUT+"/timeseries/vegetationindices/estreams_LAI_monhtly.csv",
    'estreams_LAI_yearly.csv': PATH_OUTPUT+"/timeseries/vegetationindices/estreams_LAI_yearly.csv",
    'estreams_NDVI_monhtly.csv': PATH_OUTPUT+"/timeseries/vegetationindices/estreams_NDVI_monhtly.csv",
    'estreams_NDVI_yearly.csv': PATH_OUTPUT+"/timeseries/vegetationindices/estreams_NDVI_yearly.csv",
    'estreams_snowcover_monhtly.csv': PATH_OUTPUT+"/timeseries/snowcover/estreams_snowcover_monhtly.csv",
    'estreams_snowcover_yearly.csv': PATH_OUTPUT+"/timeseries/snowcover/estreams_snowcover_yearly.csv",
    }


def _to_datetime_index(index):
    """Return *index* converted to datetimes.

    The explicit day-first format '%d.%m.%Y' is tried first and the generic
    parser is only used when the labels do not match it. Trying the generic
    parser first is unsafe: a label such as "01.02.2000" can be read
    month-first without raising, so the explicit format would never be used.
    """
    try:
        return pd.to_datetime(index, format='%d.%m.%Y')
    except (ValueError, TypeError):
        return pd.to_datetime(index)


for chosen_dataset in tqdm.tqdm(datasets_temporal):

    # Get the old, new and output pathnames
    csv_file_old = path_old_files+chosen_dataset
    csv_file_new = path_new_files+chosen_dataset
    csv_output = datasets_temporal_output_options[chosen_dataset]

    # Read the old and new datasets. If a file is not in its folder (or is
    # stored with a different name), the run skips this dataset. Only the
    # reads are guarded here, so this message is never printed for an
    # unrelated failure.
    try:
        estreams_old = pd.read_csv(csv_file_old, index_col=0)
        estreams_new = pd.read_csv(csv_file_new, index_col=0)
    except FileNotFoundError:
        print("The dataset", chosen_dataset, "is not avaialble in the folders.")
        continue

    try:
        # Here we set the index to datetime (avoid problems during
        # concatenation). The irrigation file keeps its index as read.
        if chosen_dataset != 'estreams_irrigation_yearly.csv':
            estreams_old.index = _to_datetime_index(estreams_old.index)
            estreams_new.index = _to_datetime_index(estreams_new.index)

        # Concatenate (one column per catchment) and sort it by column label
        estreams_concatenated = pd.concat([estreams_old, estreams_new], axis=1)
        estreams_concatenated = estreams_concatenated.sort_index(axis=1)

        ## Delete some specific columns (in this case those containing ISGR or RS00)
        #estreams_concatenated = estreams_concatenated.filter(regex='^(?!.*(ISGR|RS00)).*$', axis=1)

        # Save the data (the output folder is created when it is missing):
        os.makedirs(os.path.dirname(csv_output), exist_ok=True)
        estreams_concatenated.to_csv(csv_output)

    except Exception as error:
        # Keep the best-effort behaviour (the loop moves on to the next
        # dataset), but report the real cause instead of a misleading
        # "not available" message.
        print("The dataset", chosen_dataset, "could not be processed:", repr(error))
        continue

    # Prints useful to visualize what was done:
    print("Dataset:", chosen_dataset)
    print("Number of catchments in the old file:", estreams_old.shape[1])
    print("Number of catchments in the new file:", estreams_new.shape[1])

## Meteorological csv-data (PET, P and T)

### PET

In [None]:
# Load the "old" and the "new" PET tables; the first CSV column becomes the index.
estreams_old = pd.read_csv(
    filepath_or_buffer=path_old_files+"estreams_meteorology_pet.csv",
    index_col=0,
)
estreams_new = pd.read_csv(
    filepath_or_buffer=path_new_files+"estreams_meteorology_pet.csv",
    index_col=0,
)

In [None]:
# Here we set the index to datetime (avoid problems during concatenation).
# The explicit day-first format is tried first and the generic parser is only
# used when the labels do not match it: tried first, the generic parser can
# read a label such as "01.02.2000" month-first without raising, so the
# explicit format would never be applied.
try:
    estreams_old.index = pd.to_datetime(estreams_old.index, format='%d.%m.%Y')
except (ValueError, TypeError):
    estreams_old.index = pd.to_datetime(estreams_old.index)
try:
    estreams_new.index = pd.to_datetime(estreams_new.index, format='%d.%m.%Y')
except (ValueError, TypeError):
    estreams_new.index = pd.to_datetime(estreams_new.index)

In [None]:
# Place the old and new tables side by side (joined along the columns) and
# order the resulting columns by label.
estreams_concatenated = pd.concat(
    [estreams_old, estreams_new], axis=1
).sort_index(axis=1)
# Display the result in the notebook:
estreams_concatenated

- (Extra) Example of how to delete specific basins from the analysis 

In [None]:
## Filter the DataFrame to keep columns that do not contain 'ISGR' or 'RS00' in their names
#estreams_concatenated = estreams_concatenated.filter(regex='^(?!.*(ISGR|RS00)).*$', axis=1)
#estreams_concatenated

In [None]:
# Write the concatenated PET table to the results folder.
estreams_concatenated.to_csv(
    path_or_buf="results/timeseries/meteorology/estreams_meteorology_pet.csv"
)

### P

In [None]:
# Load the "old" and the "new" precipitation tables; the first CSV column
# becomes the index.
estreams_old = pd.read_csv(
    filepath_or_buffer=path_old_files+"estreams_meteorology_precipitation.csv",
    index_col=0,
)
estreams_new = pd.read_csv(
    filepath_or_buffer=path_new_files+"estreams_meteorology_precipitation.csv",
    index_col=0,
)

In [None]:
# Here we set the index to datetime (avoid problems during concatenation).
# The explicit day-first format is tried first and the generic parser is only
# used when the labels do not match it: tried first, the generic parser can
# read a label such as "01.02.2000" month-first without raising, so the
# explicit format would never be applied.
try:
    estreams_old.index = pd.to_datetime(estreams_old.index, format='%d.%m.%Y')
except (ValueError, TypeError):
    estreams_old.index = pd.to_datetime(estreams_old.index)
try:
    estreams_new.index = pd.to_datetime(estreams_new.index, format='%d.%m.%Y')
except (ValueError, TypeError):
    estreams_new.index = pd.to_datetime(estreams_new.index)

In [None]:
# Place the old and new tables side by side (joined along the columns) and
# order the resulting columns by label.
estreams_concatenated = pd.concat(
    [estreams_old, estreams_new], axis=1
).sort_index(axis=1)
# Display the result in the notebook:
estreams_concatenated

- (Extra) Example of how to delete specific basins from the analysis 

In [None]:
## Filter the DataFrame to keep columns that do not contain 'ISGR' or 'RS00' in their names
#estreams_concatenated = estreams_concatenated.filter(regex='^(?!.*(ISGR|RS00)).*$', axis=1)
#estreams_concatenated

In [None]:
# Write the concatenated precipitation table to the results folder.
estreams_concatenated.to_csv(
    path_or_buf="results/timeseries/meteorology/estreams_meteorology_precipitation.csv"
)

### T

In [None]:
# Load the "old" and the "new" temperature tables; the first CSV column
# becomes the index.
estreams_old = pd.read_csv(
    filepath_or_buffer=path_old_files+"estreams_meteorology_temperature.csv",
    index_col=0,
)
estreams_new = pd.read_csv(
    filepath_or_buffer=path_new_files+"estreams_meteorology_temperature.csv",
    index_col=0,
)

In [None]:
# Here we set the index to datetime (avoid problems during concatenation).
# The explicit day-first format is tried first and the generic parser is only
# used when the labels do not match it: tried first, the generic parser can
# read a label such as "01.02.2000" month-first without raising, so the
# explicit format would never be applied.
try:
    estreams_old.index = pd.to_datetime(estreams_old.index, format='%d.%m.%Y')
except (ValueError, TypeError):
    estreams_old.index = pd.to_datetime(estreams_old.index)
try:
    estreams_new.index = pd.to_datetime(estreams_new.index, format='%d.%m.%Y')
except (ValueError, TypeError):
    estreams_new.index = pd.to_datetime(estreams_new.index)

In [None]:
# Place the old and new tables side by side (joined along the columns) and
# order the resulting columns by label.
estreams_concatenated = pd.concat(
    [estreams_old, estreams_new], axis=1
).sort_index(axis=1)
# Display the result in the notebook:
estreams_concatenated

- (Extra) Example of how to delete specific basins from the analysis 

In [None]:
## Filter the DataFrame to keep columns that do not contain 'ISGR' or 'RS00' in their names
#estreams_concatenated = estreams_concatenated.filter(regex='^(?!.*(ISGR|RS00)).*$', axis=1)
#estreams_concatenated

In [None]:
# Write the concatenated temperature table to the results folder.
estreams_concatenated.to_csv(
    path_or_buf="results/timeseries/meteorology/estreams_meteorology_temperature.csv"
)

# End