- Heroku allows a maximum upload of **1GB** per project

- split large datasets into smaller files
    - GitHub only allows data files **<100MB**

- four datasets are required
    - all of them must include the same set of stream gauges

- gauges
    - a selection of streams for trend analysis
    - at least 30 ADHs during 1979-2021

- ADHs
    - by streams
    
- hydrometrics
    - by hydrometrics

- MK-test outcome
    - by method and then by hydrometrics

In [13]:
import numpy as np
import pandas as pd

import os
import sys

import geopandas as gpd

import pickle
from tqdm.notebook import tqdm

sys.path.append('../../funclib')
from myclass import TensorFrame
from tranalysis import run_mktest_with_tframe

import warnings
warnings.simplefilter('ignore')

# Root of the data share. Alternative locations, kept for reference:
#   root_dir = '../External'
#   root_dir = '/Volumes/sambashare'
root_dir = '/media/weigangtang/Samba'

# Input folders, both located under the share root.
flow_dir = f'{root_dir}/Python/flow_data'  # CSV tables and MK-test pickles
gis_dir = f'{root_dir}/GIS Data'  # watershed shapefile

# ADHs

In [6]:
# Load the merged ADH table. The first two CSV columns become a two-level row
# index (its levels are addressed as 'SID' and 'Year' further down), and all
# values are rounded to 2 decimals.
df_hys = pd.read_csv(
    f'{flow_dir}/hydata_merged/hys_1979-2021_30y.csv',
    index_col=[0, 1],
).round(2)

In [7]:
# Split the ADH table into one CSV file per station.
adh_folder = '../data/ADHs'
# DataFrame.to_csv does not create missing folders, so create it up front
# (no-op when the folder already exists).
os.makedirs(adh_folder, exist_ok=True)

# np.unique returns the station IDs de-duplicated and sorted.
sid_list = np.unique(df_hys.index.get_level_values('SID'))

for target_sid in tqdm(sid_list):

    # Selecting on the first index level keeps the rows of a single station.
    df_hys_sel = df_hys.loc[target_sid]
    df_hys_sel.to_csv('{}/{}.csv'.format(adh_folder, target_sid))

  0%|          | 0/1781 [00:00<?, ?it/s]

# Gauges

In [110]:
# Station metadata table; the first CSV column is used as the row index.
gauges = pd.read_csv(
    f'{flow_dir}/Station_Info_ALL_merged.csv',
    index_col=0,
)

In [122]:
import glob  # required for the file listing below

# Keep only the gauges that have an exported ADH file, so the station table
# written here covers the same stations as ../data/ADHs.
hys_fpath_list = sorted(glob.glob('../data/ADHs/*.csv'))

# The station ID is the file name without its extension. splitext removes
# only the trailing '.csv', not every occurrence of that substring.
sel_sid = [
    os.path.splitext(os.path.basename(hys_fpath))[0]
    for hys_fpath in hys_fpath_list
]

gauges.loc[sel_sid].to_csv('../data/Station_Info_ALL.csv')

# Hydrometrics

In [None]:
# Merged hydrometric table. The first two CSV columns form the row index
# (addressed as 'SID' and 'Year' when pivoting); each remaining column is
# exported as one metric.
hym_fpath = f'{flow_dir}/hydata_merged/hym_1979-2021_30y.csv'
df_hym = pd.read_csv(hym_fpath, index_col=[0, 1])

In [None]:
out_folder = '../data/hydrometrics'
# makedirs also creates missing parent folders and does nothing when the
# folder already exists, so no separate existence check is needed.
os.makedirs(out_folder, exist_ok=True)

# Write one CSV per hydrometric, reshaped to a station-by-year table
# (rows: SID, columns: Year).
for target_metric in df_hym.columns:
    df = df_hym[target_metric].reset_index()
    df = df.pivot(index='SID', columns='Year', values=target_metric)
    df.to_csv(out_folder + '/{}.csv'.format(target_metric))

# MK-test Output

In [44]:
method_list = ['original', 'rao', 'yue', 'prewhiten', 'trendfree']

# Path template of the pickled MK-test results, one file per method.
mkout_fpath_frame = flow_dir + '/hydata_merged/mkout_1979-2021_30y_{}.pkl'

for method in method_list:

    mkout_fpath = mkout_fpath_frame.format(method)
    # NOTE: pickle.load can execute arbitrary code; only load pickles that
    # were produced by this project.
    with open(mkout_fpath, 'rb') as f:
        mkout_tframe = pickle.load(f)

    # makedirs (unlike mkdir) also creates '../data/mktest' when it is
    # missing, and does nothing when the folder already exists.
    out_folder = '../data/mktest/{}'.format(method)
    os.makedirs(out_folder, exist_ok=True)

    # One CSV per entry of the tensor frame's second axis (name2d).
    metric_list = mkout_tframe.name2d
    for target_metric in metric_list:
        df = mkout_tframe.to_frame(target_metric)
        df.to_csv(out_folder + '/{}.csv'.format(target_metric))

# MK-test Daily

In [8]:
# Flatten the (SID, Year) row index once; the result is the same for every
# day column, so it does not need to be recomputed inside the loop.
df_hys_flat = df_hys.reset_index()

# One station-by-year table (rows: SID, columns: Year) per column of df_hys.
df_hys_list = []
for target_day in df_hys.columns:
    df_hys_list.append(
        df_hys_flat.pivot(index='SID', columns='Year', values=target_day)
    )

# dstack stacks the tables along a new third axis -> (SID, Year, day);
# the transpose reorders the axes to (SID, day, Year).
tensor = np.dstack(df_hys_list)
tensor = tensor.transpose([0, 2, 1])

name1d = df_hys_list[0].index.tolist()  # SID
name2d = df_hys.columns.tolist()  # day columns of the ADH table
name3d = df_hys_list[0].columns.tolist()  # Year
hys_tframe = TensorFrame(tensor, name1d, name2d, name3d)

In [17]:
method_list = ['original', 'yue', 'rao', 'prewhiten', 'trendfree']

for method in tqdm(method_list):

    mkout_tframe = run_mktest_with_tframe(hys_tframe, method)

    # DataFrame.to_csv does not create missing folders, so make sure the
    # per-method output folder exists before writing into it.
    dly_folder = '../data/mktest_dly/{}'.format(method)
    os.makedirs(dly_folder, exist_ok=True)

    for target_sid in mkout_tframe.name1d:

        df = mkout_tframe.to_frame(target_sid)
        # Assumes to_frame returns exactly 365 rows (one per day) - TODO confirm;
        # pandas raises ValueError on a length mismatch.
        df.index = np.arange(365)
        # Midpoint of the 'init' and 'last' columns.
        df['mean'] = (df['init'] + df['last']) / 2
        df = df[['mean', 'slp', 'chg', 'pvalue']].round(3) # reduce size for heroku upload
        df.to_csv('{}/{}.csv'.format(dly_folder, target_sid))

  0%|          | 0/5 [00:00<?, ?it/s]

# Watershed Shapefile

In [20]:
# Read the watershed shapefile and use its 'SID' column as the row index.
gdf_wat = gpd.read_file(
    f'{gis_dir}/clean_watersheds/ALL_REF_watersheds.shp'
).set_index('SID')

In [30]:
# Currently disabled. When enabled, this writes one GeoJSON file per row of
# gdf_wat to ../data/watershed_shapefile/, named after the row's SID.
# for target_sid in tqdm(gdf_wat.index):
#     gdf_wat.loc[[target_sid]].to_file(
#         '../data/watershed_shapefile/{}.geojson'.format(target_sid),
#         driver='GeoJSON'
#     )

  0%|          | 0/1242 [00:00<?, ?it/s]