In [1]:
import pandas as pd
import numpy as np
import glob

In [5]:
df = pd.read_pickle('../data/Qmax7_final_dataset_seasonal4.pkl')

X = df
grid_size = 100
feature = 'ImperviousSurface'
min_interval = 0
quantiles = np.linspace(0.05, 0.95, grid_size + 1, endpoint=True)
# use customized quantile function to get the same result as
# type 1 R quantile (Inverse of empirical distribution function)
bins = np.quantile(X[feature], quantiles).tolist()
bins = np.unique(bins)
print(bins)

# merge small-interval bins
# Initialize the new list of bins with the first bin from the original list
new_bins = [bins[0]]
# Iterate through the rest of the bins
for i in range(1, len(bins)):
    # If the difference between the current bin and the last bin added to new_bins
    # is greater than or equal to the minimum required interval, add the current bin.
    if bins[i] - new_bins[-1] >= min_interval:
        new_bins.append(bins[i])
# Ensure the last original bin is included if it extends beyond the last new_bin,
# especially if the last interval would be very small.
# This part is optional but can be useful to retain the full range of the data.
if bins[-1] > new_bins[-1] and (bins[-1] - new_bins[-1] < min_interval):
    new_bins[-1] = bins[-1] # Replace the last bin to ensure the upper bound
bins = np.array(new_bins)
bins

[0.00000000e+00 6.00000000e-05 2.31000000e-04 5.45000000e-04
 1.07400000e-03 1.81900000e-03 2.67400000e-03 3.93406000e-03
 5.43400000e-03 7.15900000e-03 8.84600000e-03 1.07810000e-02
 1.32060000e-02 1.57130000e-02 1.86770000e-02 2.17231000e-02
 2.54170000e-02 2.90160000e-02 3.32460000e-02 3.72510000e-02
 4.15230000e-02 4.62370000e-02 5.12310200e-02 5.63350000e-02
 6.20930000e-02 6.79770000e-02 7.43141800e-02 8.13043200e-02
 8.85350000e-02 9.65492800e-02 1.05605100e-01 1.14616000e-01
 1.23674000e-01 1.33056000e-01 1.42664000e-01 1.52914000e-01
 1.64036000e-01 1.75954920e-01 1.88448390e-01 2.01622000e-01
 2.15552000e-01 2.30163000e-01 2.45000000e-01 2.60579000e-01
 2.75797810e-01 2.93171000e-01 3.10860300e-01 3.29902000e-01
 3.49118890e-01 3.69052000e-01 3.90046000e-01 4.12181000e-01
 4.35024000e-01 4.58287480e-01 4.82305000e-01 5.09292000e-01
 5.37040000e-01 5.66192000e-01 5.95893470e-01 6.27025660e-01
 6.59177000e-01 6.93979080e-01 7.28953390e-01 7.67045380e-01
 8.07199000e-01 8.481910

array([0.00000000e+00, 6.00000000e-05, 2.31000000e-04, 5.45000000e-04,
       1.07400000e-03, 1.81900000e-03, 2.67400000e-03, 3.93406000e-03,
       5.43400000e-03, 7.15900000e-03, 8.84600000e-03, 1.07810000e-02,
       1.32060000e-02, 1.57130000e-02, 1.86770000e-02, 2.17231000e-02,
       2.54170000e-02, 2.90160000e-02, 3.32460000e-02, 3.72510000e-02,
       4.15230000e-02, 4.62370000e-02, 5.12310200e-02, 5.63350000e-02,
       6.20930000e-02, 6.79770000e-02, 7.43141800e-02, 8.13043200e-02,
       8.85350000e-02, 9.65492800e-02, 1.05605100e-01, 1.14616000e-01,
       1.23674000e-01, 1.33056000e-01, 1.42664000e-01, 1.52914000e-01,
       1.64036000e-01, 1.75954920e-01, 1.88448390e-01, 2.01622000e-01,
       2.15552000e-01, 2.30163000e-01, 2.45000000e-01, 2.60579000e-01,
       2.75797810e-01, 2.93171000e-01, 3.10860300e-01, 3.29902000e-01,
       3.49118890e-01, 3.69052000e-01, 3.90046000e-01, 4.12181000e-01,
       4.35024000e-01, 4.58287480e-01, 4.82305000e-01, 5.09292000e-01,
      

In [12]:
import logging
import subprocess
import sys
import pandas as pd
from pathlib import Path
import numpy as np
from scipy.stats import t,kstest,uniform

LOGGER = logging.getLogger(__name__)

def LogTrans(y, darea = None, addition = 0.1, log = True):
    '''transform predictand from m3/s to log(mm/day)'''
    y = y + addition
    y = y / darea * 86.4
    if log:
        y = np.log(y)
    return y

def InvLogTrans(y, darea = None, addition = 0.1, log = True):
    '''transform predictand from log(mm/day) to m3/s'''
    if log:
        y = np.exp(y)
    y = y * darea / 86.4
    y = y - addition
    return y

def check_GPU():
    try:
        result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
        GPU = True
        if result.returncode != 0:
            GPU = False
    except FileNotFoundError:
        GPU = False
    return GPU

def quantile_ied(x_vec, q):
    """
    Inverse of empirical distribution function (quantile R type 1).

    More details in
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.mquantiles.html
    https://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
    https://en.wikipedia.org/wiki/Quantile

    Arguments:
    x_vec -- A pandas series containing the values to compute the quantile for
    q -- An array of probabilities (values between 0 and 1)
    """

    x_vec = x_vec.sort_values()
    n = len(x_vec) - 1
    m = 0
    j = (n * q + m).astype(int)  # location of the value
    g = n * q + m - j

    gamma = (g != 0).astype(int)
    quant_res = (1 - gamma) * x_vec.shift(1, fill_value=0).iloc[j] + gamma * x_vec.iloc[
        j
    ]
    quant_res.index = q
    # add min at quantile zero and max at quantile one (if needed)
    if 0 in q:
        quant_res.loc[0] = x_vec.min()
    if 1 in q:
        quant_res.loc[1] = x_vec.max()
    return quant_res

def check_uniform(data):
    # Define the parameters of the uniform distribution
    data_min = np.min(data)
    data_max = np.max(data)
    scale = data_max - data_min

    # Perform the Kolmogorov–Smirnov test
    # We need to standardize the uniform distribution to match the data
    ks_stat, p_value = kstest(data, 'uniform', args=(data_min, scale))
    return ks_stat, p_value

In [28]:
df = pd.read_pickle('../data/Qmax7_final_dataset_seasonal4.pkl')
feature = 'ImperviousSurface'
low = df[feature].quantile(.05)
upp = df[feature].quantile(.95)
# downsampling would be only conducted for dataframe within 5th-95th percentiles
df0 = df.loc[(df[feature]>=low)&(df[feature]<=upp),:].reset_index(drop=True)
df1 = df.loc[(df[feature]<low)|(df[feature]>upp),:].reset_index(drop=True)

ks_stat, p_value = check_uniform(df0.ImperviousSurface.values)
print(ks_stat, p_value)

0.634361095618726 0.0


In [29]:
df0['group'] = pd.cut(df0[feature], 10)
sample_num = df0.groupby('group').Q.count().min()
df0 = df0.groupby('group').apply(lambda x:x.sample(n=sample_num, replace = False)).reset_index(drop=True).drop(columns=['group'])
df = pd.concat([df0, df1])

ks_stat, p_value = check_uniform(df0.ImperviousSurface.values)
print(ks_stat, p_value)

  sample_num = df0.groupby('group').Q.count().min()
  df0 = df0.groupby('group').apply(lambda x:x.sample(n=sample_num, replace = False)).reset_index(drop=True).drop(columns=['group'])


0.042121066575319305 4.9388619730204945e-110


  df0 = df0.groupby('group').apply(lambda x:x.sample(n=sample_num, replace = False)).reset_index(drop=True).drop(columns=['group'])


In [31]:
s = ''
f'dfdaf_{s}.csv'

'dfdaf_.csv'

In [27]:
import matplotlib.pyplot as plt
df['group'] = pd.cut(df.ImperviousSurface.values, bins = [0,5,10,15,20,100])
df.groupby('group').Q.count()

  df.groupby('group').Q.count()


group
(0, 5]       43117
(5, 10]      39298
(10, 15]     19196
(15, 20]      9451
(20, 100]    20421
Name: Q, dtype: int64

In [10]:
import matplotlib.pyplot as plt
import pandas as pd
import cartopy.crs as ccrs
import seaborn as sns
import branca.colormap as cm

df1 = pd.read_csv('../results/run_Qmax7_onlyUrban_3005_1307_seed950105/xgb_noLULC_cv10_station_based_result.csv')
df2 = pd.read_csv('../results/run_Qmax7_onlyUrban_3005_1307_seed950105/xgb_onlyUrban_cv10_station_based_result.csv')

df = pd.merge(df1, df2, on = ['ohdb_id','ohdb_longitude','ohdb_latitude','climate_label'])
for name in ['KGE','r','alpha','beta','nRMSEminmax','nRMSEmean']:
    df[name+'_diff'] = df[name+'_y'] - df[name+'_x']

import folium
# Create a map centered on the mean of your coordinates
map_center = [df['ohdb_latitude'].mean(), df['ohdb_longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=4)

colormap = cm.LinearColormap(
    ['red', 'blue'],
    vmin=-1,
    vmax=1,
    caption='Data Value' # Title for the legend
)

for idx, row in df.iterrows():
    folium.CircleMarker(
        location=[row['ohdb_latitude'], row['ohdb_longitude']],
        radius=2,  # Size of the circle marker
        color=colormap(row['KGE_diff']),  # Border color (use colormap)
        fill=True,
        fill_color=colormap(row['KGE_diff']), # Fill color (use colormap)
        fill_opacity=0.7,
        tooltip=f"Value: {row['KGE_diff']:.2f}" # Hover tooltip
    ).add_to(m)

# Display the map in the notebook
m