In [1]:
# Reload edited modules (e.g. nex_gddp_utils) automatically so code changes are picked up live
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import math
import os

import numpy as np
import pandas as pd
import xarray as xr

import dask.bag as db
import dask.dataframe as dd
import dask_gateway
from dask.distributed import print as rprint

import nex_gddp_utils as model_funcs

In [3]:
# Names of the climate models evaluated in this notebook, kept in sorted order.
models = sorted(
    [
        "ACCESS-CM2",
        "ACCESS-ESM1-5",
        "CMCC-CM2-SR5",
        "CMCC-ESM2",
        "CNRM-CM6-1",
        "CNRM-ESM2-1",
        "CanESM5",
        "EC-Earth3",
        "EC-Earth3-Veg-LR",
        "FGOALS-g3",
        "GFDL-CM4",
        "GFDL-ESM4",
        "HadGEM3-GC31-LL",
        "HadGEM3-GC31-MM",
        "INM-CM4-8",
        "INM-CM5-0",
        "IPSL-CM6A-LR",
        "KACE-1-0-G",
        "KIOST-ESM",
        "MIROC-ES2L",
        "MIROC6",
        "MPI-ESM1-2-HR",
        "MPI-ESM1-2-LR",
        "MRI-ESM2-0",
        "NorESM2-LM",
        "NorESM2-MM",
        "TaiESM1",
        "UKESM1-0-LL",
    ]
)
models

# Maps each model name to the label of the model family it is grouped under.
# 'BCC-CSM2-MR' (family 'CCM') is intentionally left out, as in the original cell.
model_family = {
    "UKESM1-0-LL": "HadAM",
    "NorESM2-MM": "CCM",
    "NorESM2-LM": "CCM",
    "MRI-ESM2-0": "UCLA GCM",
    "MPI-ESM1-2-LR": "ECMWF",
    "MPI-ESM1-2-HR": "ECMWF",
    "MIROC6": "MIROC",
    "MIROC-ES2L": "MIROC",
    "KIOST-ESM": "GFDL",
    "KACE-1-0-G": "HadAM",
    "IPSL-CM6A-LR": "IPSL",
    "INM-CM5-0": "INM",
    "INM-CM4-8": "INM",
    "HadGEM3-GC31-MM": "HadAM",
    "HadGEM3-GC31-LL": "HadAM",
    "GFDL-ESM4": "GFDL",
    "GFDL-CM4_gr2": "GFDL",
    "GFDL-CM4": "GFDL",
    "FGOALS-g3": "CCM",
    "EC-Earth3-Veg-LR": "ECMWF",
    "EC-Earth3": "ECMWF",
    "CanESM5": "CanAM",
    "CNRM-ESM2-1": "ECMWF",
    "CNRM-CM6-1": "ECMWF",
    "CMCC-ESM2": "CCM",
    "CMCC-CM2-SR5": "CCM",
    "ACCESS-ESM1-5": "HadAM",
    "ACCESS-CM2": "HadAM",
    "TaiESM1": "CCM",
}

In [None]:
%%time

model_df = pd.read_csv(
    # f"s3://cities-climate-hazard/{variable}_era5.csv",
    f"s3://cities-climate-hazard/CanESM5_tasmin_1980-2014.csv",
    storage_options={
        "key": "AKIAUAAZPB7LT747PAX7",
        "secret": "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",
    }
)

Check whether reading the CSV in parallel (with Dask) is faster than reading it with plain pandas.

In [23]:
# Connect to the Dask Gateway and fetch the options object used to configure a new cluster
gateway = dask_gateway.Gateway()
cluster_options = gateway.cluster_options()

In [25]:
# Display the current cluster options
cluster_options

VBox(children=(HTML(value='<h2>Cluster Options</h2>'), GridBox(children=(HTML(value="<p style='font-weight: bo…

Options<worker_cores=1.0,
        worker_memory=16.0,
        image='pcccr.azurecr.io/public/planetary-computer/python:2023.6.22.0',
        gpu=False,
        environment={'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR',
         'GDAL_HTTP_MERGE_CONSECUTIVE_RANGES': 'YES',
         'GDAL_HTTP_MAX_RETRY': '5',
         'GDAL_HTTP_RETRY_DELAY': '3',
         'USE_PYGEOS': '0'}>


In [55]:
# Start a new cluster with the options fetched above and get a client connected to it
cluster = gateway.new_cluster(cluster_options)
# cluster = dask_gateway.GatewayCluster(public_address="https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.76e890ca286b43558f8cece0d48e0ff6/individual-scheduler-system")
client = cluster.get_client()

# Adaptive scaling with minimum=100 (presumably a floor of 100 workers; no maximum is set here)
cluster.adapt(minimum=100)



In [56]:
# Display the client summary (includes the dashboard link)
client

0,1
Connection method: Cluster object,Cluster type: dask_gateway.GatewayCluster
Dashboard: https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.a681021f56ba4f08a3244496dc513061/status,


In [None]:
# from distributed.diagnostics.plugin import UploadDirectory

# client.register_scheduler_plugin(UploadDirectory("./"))
# client.register_worker_plugin(UploadDirectory("./"))

In [None]:
%%time

model_df = dd.read_csv(
    # f"s3://cities-climate-hazard/{variable}_era5.csv",
    f"s3://cities-climate-hazard/CanESM5_tasmin_1980-2014.csv",
    storage_options={
        "key": "AKIAUAAZPB7LT747PAX7",
        "secret": "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",
    },
    blocksize=25e6
).compute()

Yes: reading the CSVs in parallel is much faster (about 3x on a 50-worker cluster with 1 vCPU and 8 GB per worker).

In [None]:
# Replace the index with one date per day from 1980-01-01 to 2014-12-31 (inclusive).
# The assignment only succeeds if model_df has exactly that many rows.
model_df.index = pd.date_range(start='1980-01-01', end='2014-12-31', freq='D')

In [None]:
# model_df = model_df.dropna(axis=1)

In [None]:
# Scratch experiment: build a small (model, time, city) xarray Dataset from the first
# rows of model_df. The second "model" ('foo') is just the same rows scaled by 0.9.
model_ds = xr.Dataset(
    {'min_temp': (['model', 'time', 'city'], [model_df.head(), model_df.head() * .9])},
    coords={'model': ['CanESM5', 'foo'], 'time': model_df.head().index, 'city': model_df.head().columns})

In [35]:
# Load the local ERA5 2 m air temperature CSV, indexed by its 'date' column
obs_df = pd.read_csv('../era5/air_temperature_at_2_metres.csv', index_col='date')

In [None]:
# Preview the first rows of the observations
obs_df.head()

In [None]:
%%time

# Run get_rmsd over model_df via DataFrame.aggregate, passing obs_df as the d2 argument
# (presumably one call per column; get_rmsd looks up the matching column via d1.name)
rmsd = model_df.aggregate(model_funcs.get_rmsd, d2=obs_df)

In [53]:
# Maps the short variable names used in model file names to the names of the
# observation datasets; rank_model uses it to build the observation parquet path.
variable_mapping = {
    'tasmin': 'air_temperature_at_2_metres_1hour_Minimum',
    'tasmax': 'air_temperature_at_2_metres_1hour_Maximum',
    'tas': 'air_temperature_at_2_metres',
    'pr': 'precipitation_amount_1hour_Accumulation',
}

2023-12-29 20:23:47,523 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


In [57]:
# S3 credentials are read from the environment (AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY) instead of being hard-coded in the notebook.
# Built when this cell runs; rank_model refers to it as a global, the same
# way it refers to variable_mapping.
S3_STORAGE_OPTIONS = {
    "key": os.environ["AWS_ACCESS_KEY_ID"],
    "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
}


def rank_model(model, variable, storage_options=None):
    """Compute the RMSD of one model against observations for one variable.

    Reads the observation parquet for ``variable`` (name looked up in
    ``variable_mapping``) and the ``{model}_{variable}_1980-2014`` parquet from
    the ``cities-climate-hazard`` bucket, re-indexes the model frame with daily
    dates for 1980-01-01..2014-12-31, and aggregates it with ``get_rmsd``.

    Parameters
    ----------
    model : str
        Model name used in the parquet file name.
    variable : str
        Key of ``variable_mapping`` (e.g. ``'pr'``).
    storage_options : dict, optional
        Options passed to ``dd.read_parquet``. Defaults to ``S3_STORAGE_OPTIONS``.

    Returns
    -------
    The result of ``model_df.aggregate(get_rmsd, d2=obs_df)``.

    NOTE(review): ``get_rmsd`` is referenced as a bare global; it is defined in
    a later cell of this notebook, which must have been run before this
    function is called.
    """
    if storage_options is None:
        storage_options = S3_STORAGE_OPTIONS

    obs_df = dd.read_parquet(
        f's3://cities-climate-hazard/{variable_mapping[variable]}.parquet',
        storage_options=storage_options,
        blocksize=25e6,
    ).compute()

    model_df = dd.read_parquet(
        f"s3://cities-climate-hazard/{model}_{variable}_1980-2014.parquet",
        storage_options=storage_options,
        blocksize=25e6,
    ).compute()

    # Progress trace: shape of the loaded frame and the model it belongs to.
    rprint(model_df.shape, model)
    model_df.index = pd.date_range(start='1980-01-01', end='2014-12-31', freq='D')

    return model_df.aggregate(get_rmsd, d2=obs_df)

In [58]:
# Wrap the model names in a Dask bag so rank_model can be mapped over them
models_bag = db.from_sequence(
    models,
    # ['MIROC6'],
    npartitions=56  # Number of partitions should match the number of workers
)

In [59]:
# Run rank_model for every model with variable='pr' and collect the results
rmsd = models_bag.map(rank_model, 'pr').compute()

(12784, 13135) EC-Earth3-Veg-LR
(12784, 13135) EC-Earth3
(12784, 13135) KIOST-ESM
(12784, 13135) CanESM5
(12784, 13135) CMCC-ESM2
(12784, 13135) INM-CM5-0
(12784, 13135) GFDL-ESM4
(12784, 13135) MIROC6
(12784, 13135) ACCESS-CM2
(12784, 13135) FGOALS-g3
(12784, 13135) MIROC-ES2L
(12784, 13135) CMCC-CM2-SR5
(12784, 13135) ACCESS-ESM1-5
(12784, 13135) GFDL-CM4
(12784, 13135) HadGEM3-GC31-LL
(12784, 13135) MRI-ESM2-0
(12784, 13135) TaiESM1
(12784, 13135) NorESM2-LM
(12784, 13135) MPI-ESM1-2-HR
(12784, 13135) NorESM2-MM
(12784, 13135) KACE-1-0-G
(12784, 13135) UKESM1-0-LL
(12784, 13135) CNRM-ESM2-1
(12784, 13135) CNRM-CM6-1
(12784, 13135) MPI-ESM1-2-LR
(12784, 13135) INM-CM4-8
(12784, 13135) HadGEM3-GC31-MM
(12784, 13135) IPSL-CM6A-LR


In [60]:
# One row per model; relies on the results being in the same order as `models`
rmsd_df = pd.DataFrame(rmsd, index=models)

In [None]:
# Models ordered by ascending RMSD for the KEN_Nairobi column
rmsd_df['KEN_Nairobi'].sort_values()

In [61]:
# Save the 'pr' RMSD table locally, with the index written under the column label 'model'
rmsd_df.to_csv('./rmsd_pr.csv', index_label='model')

In [72]:
# FIXME: this scratch cell originally ended in a dangling `+` (a SyntaxError);
# the intended right-hand operand was never written. Only the valid part is kept.
datetime.date(2023, 12, 30)

In [None]:
# Load the local ERA5 hourly-maximum 2 m air temperature CSV, indexed by 'date'.
# NOTE(review): this rebinds obs_df, which other cells also use for other variables.
obs_df = pd.read_csv(f'../era5/air_temperature_at_2_metres_1hour_Maximum.csv', index_col='date')

In [36]:
# Upload the observations currently held in obs_df to S3 as parquet.
# Credentials come from the environment (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY)
# instead of being hard-coded in the notebook.
# NOTE(review): the target is the plain 2 m air temperature file; make sure obs_df
# holds that dataset (not another variable loaded into the same name) before running.
obs_df.to_parquet(
    "s3://cities-climate-hazard/air_temperature_at_2_metres.parquet",
    storage_options={
        "key": os.environ["AWS_ACCESS_KEY_ID"],
        "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
    },
)

In [None]:
# Lazily open one model/variable parquet from S3 as a Dask DataFrame.
# Credentials come from the environment (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY)
# instead of being hard-coded in the notebook.
# NOTE(review): `model` and `variable` are not assigned in any visible top-level
# cell; they must be defined before this cell is run.
model_ddf = dd.read_parquet(
    f"s3://cities-climate-hazard/{model}_{variable}_1980-2014.parquet",
    storage_options={
        "key": os.environ["AWS_ACCESS_KEY_ID"],
        "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
    },
    blocksize=25e6,
)

In [22]:


# Year bounds that seasonal_means passes to quarters()
HIST_START = 1980
HIST_END = 2014

import calendar
import numpy as np

# Year bounds for the percentile helpers; they iterate over
# range(PERCENTILE_STARTYEAR, PERCENTILE_ENDYEAR), so the end year itself is excluded
PERCENTILE_STARTYEAR = 1980
PERCENTILE_ENDYEAR = 2019

def calendardate_percentiles(nex_varname, q, latlon, sh_hem=False):
    """Return the q-th percentile across years for each position in the yearly series.

    One series per year is fetched with ``get_observed_gee`` for every year in
    ``range(PERCENTILE_STARTYEAR, PERCENTILE_ENDYEAR)``; the series are stacked
    as rows and the percentile is taken down each column.

    When ``sh_hem`` is true the result is rotated so that it starts at
    position 152. ``get_observed_gee`` is always called with
    ``southern_hem=False``.
    """
    yearly_series = [
        get_observed_gee(nex_varname, latlon, start_year=yr, end_year=yr, southern_hem=False)
        for yr in range(PERCENTILE_STARTYEAR, PERCENTILE_ENDYEAR)
    ]
    per_position = np.percentile(np.vstack(yearly_series), q, axis=0)
    if sh_hem:
        # Move the first 152 positions to the end.
        return np.concatenate([per_position[152:], per_position[:152]])
    return per_position

def wholeyear_percentile(nex_varname, q, latlon):
    """Return a percentile over all values of all years pooled together.

    Data for every year in ``range(PERCENTILE_STARTYEAR, PERCENTILE_ENDYEAR)``
    is fetched with ``get_observed_gee``, concatenated and flattened.

    For ``nex_varname == 'ari'`` the ``'pr'`` data is fetched instead, passed
    through ``ari`` and the 95th percentile is returned; ``q`` is ignored in
    that case.
    """
    is_ari = nex_varname == 'ari'
    fetch_name = 'pr' if is_ari else nex_varname
    yearly_series = [
        get_observed_gee(fetch_name, latlon, start_year=yr, end_year=yr, southern_hem=False)
        for yr in range(PERCENTILE_STARTYEAR, PERCENTILE_ENDYEAR)
    ]
    pooled = np.concatenate(yearly_series).flatten()
    if is_ari:
        return np.percentile(ari(pooled), 95)
    return np.percentile(pooled, q)

def yearextreme_percentile(nex_varname, q, latlon, wantmax):
    """Return the q-th percentile of the yearly extremes.

    For every year in ``range(PERCENTILE_STARTYEAR, PERCENTILE_ENDYEAR)`` the
    data fetched with ``get_observed_gee`` is reduced to a single value, its
    maximum when ``wantmax`` is true and its minimum otherwise, and the
    percentile is taken over those per-year values.

    NOTE(review): the previous implementation ignored ``wantmax`` and took the
    percentile over every value of every year, which made it equivalent to
    ``wholeyear_percentile``. Results therefore change with this fix; confirm
    the per-year reduction matches the intended definition.
    """
    pick_extreme = np.max if wantmax else np.min
    yearly_extremes = [
        pick_extreme(
            get_observed_gee(nex_varname, latlon, start_year=yr, end_year=yr, southern_hem=False)
        )
        for yr in range(PERCENTILE_STARTYEAR, PERCENTILE_ENDYEAR)
    ]
    return np.percentile(np.array(yearly_extremes), q)

def d2j(datestring):
    """Convert a date to its day-of-year on a 365-day (no-leap) calendar.

    Parameters
    ----------
    datestring : str or datetime.date
        ISO-format date string (``'YYYY-MM-DD'``) or a date object.

    Returns
    -------
    int
        Day of year, 1-365. In leap years every day after day 59 is shifted
        back by one, so Feb 29 maps to 59 (the same as Feb 28) and Dec 31
        maps to 365.
    """
    if isinstance(datestring, datetime.date):
        d = datestring
    else:
        d = datetime.date.fromisoformat(datestring)
    jday = d.timetuple().tm_yday
    if calendar.isleap(d.year) and jday > 59:
        jday -= 1
    return jday

def removeLeapDays(arr, start_year, end_year, southern_hem):
    """Delete one element per leap year in ``start_year..end_year`` from ``arr``.

    The index removed for a leap year is ``(year - start_year) * 365 + 59``,
    plus the number of leap days already accounted for in earlier years, plus
    183 when ``southern_hem`` is true. ``np.delete`` is called without an
    axis, so the result is a flattened copy.
    """
    hemisphere_offset = [0, 183][int(southern_hem)]
    leap_years = [yr for yr in range(start_year, end_year + 1) if calendar.isleap(yr)]
    # `earlier` counts the leap days of preceding years still present in arr.
    drop_positions = [
        (yr - start_year) * 365 + hemisphere_offset + earlier + 59
        for earlier, yr in enumerate(leap_years)
    ]
    return np.delete(arr, drop_positions)

def get_rmsd(d1, d2):
    """Compare the seasonal means of ``d1`` with those of the matching column of ``d2``.

    ``d1`` must expose a ``name`` attribute; it selects the column ``d2[d1.name]``.
    Both are reduced with ``seasonal_means`` and the square root of the summed
    squared differences is returned.

    NOTE(review): ``np.sum`` is called without an axis, so the surrounding
    ``np.mean`` receives a single number and has no effect; the value is a
    root-sum-square rather than a root-mean-square. Left unchanged here.
    """
    model_means = seasonal_means(d1)
    reference_means = seasonal_means(d2[d1.name])
    squared_diff = (model_means - reference_means) ** 2
    total = np.sum(squared_diff)
    return np.sqrt(np.mean(total))

def count_runs(tf_array, min_runsize):
    """Count the runs of consecutive 1/True values at least ``min_runsize`` long.

    Parameters
    ----------
    tf_array : 1-D sequence of 0/1 or bool values
        Any 1-D array-like is accepted (lists as well as NumPy arrays).
    min_runsize : int
        Minimum run length for a run to be counted.

    Returns
    -------
    int
        Number of qualifying runs.
    """
    # Pad with zeros so runs touching either end still produce a start/end edge.
    padded = np.concatenate([[0], tf_array, [0]])
    edges = np.diff(padded)
    starts = np.nonzero(edges == 1)[0]
    ends = np.nonzero(edges == -1)[0]
    return int(np.count_nonzero(ends - starts >= min_runsize))

def longest_run(tf_array):
    """Return the length of the longest run of consecutive 1/True values.

    Parameters
    ----------
    tf_array : 1-D sequence of 0/1 or bool values
        Any 1-D array-like is accepted (lists as well as NumPy arrays).

    Returns
    -------
    int
        Length of the longest run, or 0 when the values sum to zero.
    """
    if np.sum(tf_array) == 0:
        return 0
    # Pad with zeros so runs touching either end still produce a start/end edge.
    padded = np.concatenate([[0], tf_array, [0]])
    edges = np.diff(padded)
    starts = np.nonzero(edges == 1)[0]
    ends = np.nonzero(edges == -1)[0]
    return int((ends - starts).max())
    
def quarters(d, start_year, end_year, southern_hem=False):
    """Reorganize a multi-year series into four stacks of fixed-offset slices.

    The series is walked in 365-element steps, one step per year in
    ``range(start_year, end_year)``, starting from offset 365. For each step
    with origin ``o`` the following slices are collected:

    * ``d[o+60 : o+152]``   -> first result  ("mam")
    * ``d[o+152 : o+244]``  -> second result ("jja")
    * ``d[o+244 : o+335]``  -> third result  ("son")
    * ``d[o-365 : o-305]`` joined with ``d[o+335 : o+365]`` -> fourth result ("djf")

    Each result is the ``np.vstack`` of its per-step slices.

    NOTE(review): ``southern_hem`` is accepted but has no effect. The original
    two branches collected and returned exactly the same slices in the same
    order. No leap-day adjustment is made to the 365-element step either.
    """
    mam_rows, jja_rows, son_rows, djf_rows = [], [], [], []
    for step, _year in enumerate(range(start_year, end_year)):
        origin = 365 * (step + 1)
        prev = origin - 365
        wrap_around = np.concatenate((d[prev : prev + 60], d[origin + 335 : origin + 365]), axis=0)
        djf_rows.append(wrap_around)
        mam_rows.append(d[origin + 60 : origin + 152])
        jja_rows.append(d[origin + 152 : origin + 244])
        son_rows.append(d[origin + 244 : origin + 335])
    mam_res = np.vstack(mam_rows)
    jja_res = np.vstack(jja_rows)
    son_res = np.vstack(son_rows)
    djf_res = np.vstack(djf_rows)
    return mam_res, jja_res, son_res, djf_res
    
def seasonal_means(d):
    """Return the row-wise means of the four ``quarters`` results as one array.

    ``quarters`` is called with ``HIST_START`` and ``HIST_END``; each of its
    four 2-D results is averaged along axis 1 and the four mean vectors are
    stacked in the order ``quarters`` returns them.
    """
    seasons = quarters(d, HIST_START, HIST_END)
    return np.array([np.mean(seasons[k], axis=1) for k in range(4)])

def calibration_function(hist_obs, hist_mod):
    """Build a calibration function from observed and modeled historical values.

    Calibration functions are P-P plots of historical and modeled values.

    Both inputs are flattened and sorted. For each sorted observed value (up
    to as many as there are modeled values) the position of the first sorted
    modeled value that is >= it is recorded, or the last position when the
    observed value exceeds every modeled value. Positions are divided by the
    number of observed values.

    If the observed values are all zero, evenly spaced fractions over the
    modeled size are returned; if the modeled values are all zero, evenly
    spaced fractions over the observed size are returned.
    """
    obs_sorted = np.sort(hist_obs.flatten())
    mod_sorted = np.sort(hist_mod.flatten())

    if np.max(obs_sorted) == 0 and np.min(obs_sorted) == 0:
        return np.arange(0, mod_sorted.size) / mod_sorted.size
    if np.max(mod_sorted) == 0 and np.min(mod_sorted) == 0:
        return np.arange(0, obs_sorted.size) / obs_sorted.size

    last_position = mod_sorted.size - 1
    positions = [
        last_position if obs_value > mod_sorted[-1] else np.argmax(mod_sorted >= obs_value)
        for obs_value in obs_sorted[:mod_sorted.size]
    ]
    return np.array(positions) / obs_sorted.size

def calibrate_component(uncalibrated_data, calibration_fxn):
    """Remap values by rank through a calibration function.

    The values are ranked in ascending order (ties broken by original
    position). For the value of rank ``j`` out of ``N``:

    1. ``X_j = j / (N + 1)`` is its rank fraction;
    2. ``calibration_fxn`` is sampled at ``floor(X_j * len(calibration_fxn))``
       to get the target fraction ``Y``;
    3. the value of rank ``floor(Y * (N + 1))`` (capped at the highest rank)
       is written to the original position of the rank-``j`` value.

    Parameters
    ----------
    uncalibrated_data : sequence
        Values to calibrate.
    calibration_fxn : sequence of float
        Calibration function samples, indexable by integer position.

    Returns
    -------
    list
        Calibrated values in the original order of ``uncalibrated_data``.

    Requires the ``math`` module to be imported at file level.
    """
    N = len(uncalibrated_data)
    # (value, original position) pairs in ascending order of value.
    ranked = sorted((value, pos) for pos, value in enumerate(uncalibrated_data))
    n_samples = len(calibration_fxn)
    result = [0] * N
    for j, (_, original_pos) in enumerate(ranked):
        X_j = j / (N + 1)
        Y_jprime = calibration_fxn[math.floor(X_j * n_samples)]
        jprime = math.floor(Y_jprime * (N + 1))
        result[original_pos] = ranked[min(N - 1, jprime)][0]

    return result

def calibrate(uncalibrated_data, calibration_fxn):
    """Calibrate a series season by season and reassemble it in original order.

    Each position ``idx`` is assigned to a group by ``idx % 365``:
    60-151 -> group 0 (MAM), 152-243 -> group 1 (JJA), 244-334 -> group 2 (SON),
    everything else -> group 3 (DJF). Each group's values are passed to
    ``calibrate_component`` with ``calibration_fxn[group]`` and the calibrated
    values are written back to their original positions.

    Returns the reassembled series as a NumPy array.
    """
    total = len(uncalibrated_data)

    # Positions belonging to each group, in MAM, JJA, SON, DJF order.
    positions_by_group = ([], [], [], [])
    for idx in range(total):
        day = idx % 365
        if 60 <= day < 152:
            group = 0
        elif 152 <= day < 244:
            group = 1
        elif 244 <= day < 335:
            group = 2
        else:
            group = 3
        positions_by_group[group].append(idx)

    result = [0] * total
    for group, positions in enumerate(positions_by_group):
        group_values = np.array([uncalibrated_data[pos] for pos in positions])
        calibrated = calibrate_component(group_values, calibration_fxn[group])
        for k, pos in enumerate(positions):
            result[pos] = calibrated[k]

    return np.array(result)

In [2]:
# Load the previously saved tasmax RMSD table, indexed by its 'model' column
tasmax_rmsd = pd.read_csv('./rmsd_tasmax.csv', index_col='model')

In [5]:
# Tag each model with its family; raises KeyError if a model is missing from model_family
tasmax_rmsd['model_family'] = [model_family[idx] for idx in tasmax_rmsd.index]

In [6]:
# Lowest-RMSD model per family for IND_Kolkata: sort ascending, then keep the first row of each family
tasmax_rmsd[['IND_Kolkata', 'model_family']].sort_values(by='IND_Kolkata').drop_duplicates(subset='model_family', keep='first')

Unnamed: 0_level_0,IND_Kolkata,model_family
model,Unnamed: 1_level_1,Unnamed: 2_level_1
EC-Earth3-Veg-LR,19.054157,ECMWF
MIROC-ES2L,19.272219,MIROC
ACCESS-ESM1-5,19.649043,HadAM
MRI-ESM2-0,20.251476,UCLA GCM
IPSL-CM6A-LR,21.605611,IPSL
NorESM2-MM,22.436936,CCM
KIOST-ESM,23.533962,GFDL
CanESM5,23.556727,CanAM
INM-CM5-0,24.285861,INM


In [None]:
# Read the MIROC6 'pr' parquet from S3 into a pandas DataFrame.
# Credentials come from the environment (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY)
# instead of being hard-coded in the notebook.
model_df = pd.read_parquet(
    "s3://cities-climate-hazard/MIROC6_pr_1980-2014.parquet",
    storage_options={
        "key": os.environ["AWS_ACCESS_KEY_ID"],
        "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
    },
)

In [4]:
# Load the local ERA5 hourly-maximum 2 m air temperature CSV, indexed by 'date'
tas_max = pd.read_csv('../era5/air_temperature_at_2_metres_1hour_Maximum.csv', index_col='date')

In [8]:
# Inspect the observed series for the IND_Kolkata column
tas_max['IND_Kolkata']

date
1980-01-01    299.5000
1980-01-02    299.0625
1980-01-03    294.8125
1980-01-04    296.8125
1980-01-05    296.1250
                ...   
2014-12-27    295.3750
2014-12-28    295.7500
2014-12-29    296.8750
2014-12-30    295.6250
2014-12-31    296.8750
Name: IND_Kolkata, Length: 12784, dtype: float64