In [2]:
from pathlib import Path
import os
import sys
import numpy as np
import pandas as pd
import concurrent.futures
import xarray as xr
import dask
import cftime
import timeit
import convert_ll2str as c2str
import concurrent.futures
import functools

In [3]:
HOME = str(Path.home())

BINS = {1: [2, 4, 7, 10, 14, 18, 24, 30, 40, 55, 70, 90, 110, 135],
        3: [2, 6, 10, 15, 20, 30, 40, 60, 80, 110, 140, 175, 215, 265],
        6: [2, 7, 13, 19, 28, 40, 55, 80, 115, 160, 210, 260, 320, 390]}

# what do these values represent? 
RPS = {1: [19, 24, 32, 36, 42], 3: [29, 35, 44, 49, 57],
       6: [35, 42, 53, 59, 67], 9: [39, 46, 59, 65, 75],
       12: [42, 50, 63, 70, 80], 24: [50, 58, 74, 83, 95]}

rp_years = [5, 10, 30, 50, 100]

global OUTPUT_PATH
global PROJECTION_ID

INPUT_PATH = '/mnt/c/Users/samhardy/OneDrive - JBA Group/2024s1475_RED_UP/ukcp_data/'
SAVE_PATH = '/mnt/c/Users/samhardy/OneDrive - JBA Group/2024s1475_RED_UP/'
NUMBER_OF_WC = 1 # 13

accum_duration_start = {1: 23, 3: 22, 6: 19}
MASK = None

remove_items = ['ensemble_member_id', 'grid_latitude_bnds', 'grid_longitude_bnds',
                'time_bnds','rotated_latitude_longitude', 'year', 'yyyymmddhh', 'ensemble_member']
squeeze_coords = ['bnds', 'ensemble_member']

## Run the notebook code

In [4]:
mask_nc_filename = "UKWC_Cleaned_land-cpm_uk_2.2km.nc"
projection_id = 1 # 2021-2040 time slice
member_id = 1
year = 1981
month = 7

def get_file_name(year: int, 
                  month: int , 
                  m: int
                  ) -> str:
    """ 
    Return string for UKCP file name specific to a month, year and ensemble member
    """
    start_date = f"{year:04d}{month:02d}01"
    file_main = f"pr_rcp85_land-cpm_uk_2.2km_{m:02d}_1hr_{start_date}"
    file_name = os.path.join(INPUT_PATH, file_main + f"-{year:04d}{month:02d}30.nc")

    return file_name


def get_start_year(year: int, 
                   month: int, 
                   hour: int):
    """ 
    This function provides a buffer around the selected date
    Starts the analysis on the 30th of the previous month
    (i.e. 30th June 1981 if the user chose July 1981)
    """
    if year != 1980:
        year1 = year #1981
        month1 = month - 1 #6
        if month == 1:
            year1 = year - 1
            month1 = 12
        start = cftime.Datetime360Day(year1, month1, 30, hour, 0, 0)
    else:
        month = 12
        start = cftime.Datetime360Day(year, month, 1, 0, 30, 0)

    return start


def rolling_window_sum(ds: xr.Dataset, 
                       window_size: int
                       ) -> xr.Dataset:
    """
    rolling window calculation for an xr.ds by defined window size (1-h, 3-h, 6-h, etc)
    """
    print(f"Starting calculation of rolling {str(window_size)}-h accumulated precip!")
    ds_window = ds.rolling(time=window_size, min_periods=window_size).construct("new").sum("new", skipna=True)
    print(f"Finished calculating rolling {str(window_size)}-h accumulated precip!")

    return ds_window


def run_notebook_functions(INPUT_PATH: str, 
                           mask_nc_filename: str, 
                           projection_id: int, 
                           member_id: int,
                           month: int,
                           year: int): 
    """ 
    
    """
    global PROJECTION_ID
    PROJECTION_ID = projection_id
    profile_selected_month = pd.read_csv(os.path.join(INPUT_PATH, "YearsMonths_byBinCounts_Rand_OtherYears.csv"))

    mask_nc = os.path.join(INPUT_PATH, mask_nc_filename)
    mask_orig = xr.open_dataset(mask_nc)
    mask_1D = mask_orig.stack(location=("grid_latitude", "grid_longitude"))

    proj_df = profile_selected_month[profile_selected_month['Projection_slice_ID']== projection_id]
    
    # call_main function 
    df_row = proj_df[(proj_df['Month'] == month) & (proj_df['Year'] == year)]
    if df_row.empty:
        print(f"No data found for Month: {month}, Year: {year}")
    
    year = int(df_row['Year'].iloc[0])
    month = int(df_row['Month'].iloc[0])

    year1 = year
    month1 = month - 1
    if month == 1:
        year1 = year - 1
        month1 = 12

    file_name = get_file_name(year, month, member_id)
    pre_file_name = get_file_name(year1, month1, member_id)

    if (year==1980 and month==12) or (year==2020 and month==12) or (year==2060 and month==12):
        infile = [file_name]
    else:
        infile = [pre_file_name, file_name]

    # main function 
    with xr.open_mfdataset(infile, engine='netcdf4',
                           combine='nested', concat_dim='time', parallel=True,
                           data_vars='minimal', coords='minimal', compat='override') as ds:

        ds = ds.stack(location=("grid_latitude", "grid_longitude"))
        ids = c2str.get_cell_ids(ds.location.values)
        ds.coords['location_id'] = ('location', ids)
        ds = ds.where(ds.bnds == 0, drop=True)
        for item in remove_items:
            del ds[item]
        for item in squeeze_coords:
            ds = ds.squeeze(item)

        ds = ds.where(mask_1D["WCID"] >= 0, drop=True)

        starttime = timeit.default_timer()

        with dask.config.set(**{'array.slicing.split_large_chunks': False}):
            for wcid in range(NUMBER_OF_WC):
                print(f"Working on water company {str(wcid)}")
                ds_mask = ds.where(mask_1D.WCID == wcid, drop=True)
                for duration, start_hour in accum_duration_start.items():
                    start = get_start_year(year, month, start_hour)
                    precip = ds_mask.where(ds['time'] >= start, drop=True)
                    print(f"Calculating {str(duration)}-h accumulated precip, starting at {str(start_hour)}Z")
                    if duration > 1:
                        da_window = rolling_window_sum(precip, duration)
                        da_window = da_window.rename({"pr": "pr_sum"})
                        da_window = da_window.assign(pr=precip.pr)
                        da_window['time'] = da_window["time"].dt.strftime("%Y-%m-%d %H:%M")
                        df_window = da_window.to_dataframe()

                        df_window.index = df_window.index.droplevel(['grid_latitude', 'grid_longitude'])
                        #get_pr_profile(df_window, member_id, month, duration, wcid)

    return da_window, df_window
    
da_window, df_window = run_notebook_functions(INPUT_PATH, mask_nc_filename, projection_id, member_id, month, year)

Working on water company 0
Calculating 1-h accumulated precip, starting at 23Z
Calculating 3-h accumulated precip, starting at 22Z
Starting calculation of rolling 3-h accumulated precip!
Finished calculating rolling 3-h accumulated precip!
Calculating 6-h accumulated precip, starting at 19Z
Starting calculation of rolling 6-h accumulated precip!
Finished calculating rolling 6-h accumulated precip!


### Understand the functionality of `get_pr_profile`

In [None]:
duration = 6
df_win_wid = df_window
OUTPUT_PATH = f"{SAVE_PATH}/precip_profiles/proj{PROJECTION_ID}/output_mem{member_id}"
if not os.path.isdir(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)
wcid=0

# numbers represent rainfall thresholds for each RP within the rolling window (1-h, 3-h, 6-h)
PR = list(RPS[duration])
df_win_wid.insert(0, 'Time', df_win_wid.index) 
df_win_wid.sort_values(by=['location_id', 'Time', 'pr_sum'], inplace=True) 

In [16]:
for i in range(1, len(PR) + 1):
    print(i,PR[i-1],f"Profile_{rp_years[i - 1]}y_{duration}h_ens{member_id}_proj{PROJECTION_ID}.csv")
    if i < len(PR):
        df_pr_threshold = df_win_wid.loc[(df_win_wid['pr_sum'] > PR[i - 1]) 
                                            & (df_win_wid['pr_sum'] <= PR[i])
                                            & (df_win_wid['month_number'] == month) ]
    # this code is the same as the code above, 
    # but there is no upper limit (PR[i]) since we have reached the end of the loop
    else:
        df_pr_threshold = df_win_wid.loc[ (df_win_wid['pr_sum'] > PR[i - 1]) 
                                            & (df_win_wid['month_number'] == month)]

1 35 Profile_5y_6h_ens1_proj1.csv
2 42 Profile_10y_6h_ens1_proj1.csv
3 53 Profile_30y_6h_ens1_proj1.csv
4 59 Profile_50y_6h_ens1_proj1.csv
5 67 Profile_100y_6h_ens1_proj1.csv


In [None]:
select_list = ['Time', 'location_id', 'longitude', 'latitude', 'pr', 'pr_sum']
df_pr_threshold = df_pr_threshold.loc[:, select_list]
location = df_pr_threshold['location_id'].tolist()
df_pr_threshold

Unnamed: 0_level_0,Time,location_id,longitude,latitude,pr,pr_sum
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1981-07-30 11:30,1981-07-30 11:30,UK_00308N_35825E,-5.588207,55.542648,9.184972e+00,73.842995
1981-07-30 12:30,1981-07-30 12:30,UK_00308N_35825E,-5.588207,55.542648,1.468702e-01,73.989868
1981-07-30 13:30,1981-07-30 13:30,UK_00308N_35825E,-5.588207,55.542648,1.330230e-06,73.989868
1981-07-30 14:30,1981-07-30 14:30,UK_00308N_35825E,-5.588207,55.542648,1.129252e-06,73.989868
1981-07-30 15:30,1981-07-30 15:30,UK_00308N_35825E,-5.588207,55.542648,2.625524e-06,73.979057
...,...,...,...,...,...,...
1981-07-30 15:30,1981-07-30 15:30,UK_00488N_35867E,-4.955935,57.357448,1.562098e+01,129.389130
1981-07-30 16:30,1981-07-30 16:30,UK_00488N_35867E,-4.955935,57.357448,3.071456e-06,129.389130
1981-07-30 17:30,1981-07-30 17:30,UK_00488N_35867E,-4.955935,57.357448,3.705551e-07,129.389130
1981-07-30 18:30,1981-07-30 18:30,UK_00488N_35867E,-4.955935,57.357448,1.068980e-06,129.389130


In [19]:
temp_df = df_win_wid[['Time', 'location_id', 'pr', 'pr_sum']]
temp_df = temp_df[temp_df['location_id'].isin(location)]
temp_time = np.array(temp_df['Time'].values)
temp_df1 = np.array(temp_df['pr'].values)
temp_sum = np.array(temp_df['pr_sum'].values)
temp_location = np.array(temp_df['location_id'].values)

In [54]:
index_num = 0
profile_list = []

for index, row in df_pr_threshold.iterrows():
    this_time = row['Time']
    this_location = row['location_id']
    this_sum = row['pr_sum']
    sum_index = np.where(temp_sum == this_sum)
    sum_index1 = sum_index[0][:]

    for sum_ind in sum_index1:
        data_profile = temp_df1[sum_ind - duration + 1: sum_ind + 1].tolist()
        print(temp_df1[sum_ind-duration+1:sum_ind+1])
        if temp_location[sum_ind] == this_location and temp_time[sum_ind] == this_time:
            print(f"Match found at index {sum_ind}")
            break

    profile_list.append(data_profile)
    index_num += 1

[7.1218625e-07 1.7598015e-06 6.8782782e-07 1.0808248e-02 6.4647209e+01
 9.1849718e+00]
Match found at index 712
[1.7598015e-06 6.8782782e-07 1.0808248e-02 6.4647209e+01 9.1849718e+00
 1.4687021e-01]
Match found at index 713
[1.7598015e-06 6.8782782e-07 1.0808248e-02 6.4647209e+01 9.1849718e+00
 1.4687021e-01]
[6.8782782e-07 1.0808248e-02 6.4647209e+01 9.1849718e+00 1.4687021e-01
 1.3302299e-06]
Match found at index 714
[1.7598015e-06 6.8782782e-07 1.0808248e-02 6.4647209e+01 9.1849718e+00
 1.4687021e-01]
[6.8782782e-07 1.0808248e-02 6.4647209e+01 9.1849718e+00 1.4687021e-01
 1.3302299e-06]
[1.0808248e-02 6.4647209e+01 9.1849718e+00 1.4687021e-01 1.3302299e-06
 1.1292523e-06]
Match found at index 715
[6.4647209e+01 9.1849718e+00 1.4687021e-01 1.3302299e-06 1.1292523e-06
 2.6255236e-06]
Match found at index 716
[4.3685830e-07 5.3365324e-07 4.5866202e-07 1.9713052e-07 2.5864997e-06
 7.0798523e+01]
Match found at index 1439
[5.3365324e-07 4.5866202e-07 1.9713052e-07 2.5864997e-06 7.0798523

In [None]:
# df_pr_threshold
# df_pr_threshold['Hyet'] = profile_list
# final_list = ['Time', 'longitude', 'latitude', 'pr_sum', 'Hyet']
# df_pr_threshold = df_pr_threshold.loc[:, final_list]
# df_pr_threshold.columns = ['end date', 'lon', 'lat', 'pr_sum', 'Hyet']

# df_pr_threshold.insert(0, 'WCID', wcid)
# df_pr_threshold.insert(0, 'Member', member_id)
# df_pr_threshold.insert(0, 'Projection_slice_ID', PROJECTION_ID)

### What does `df_pr_threshold` look like before input into `get_bin_counts`? 

In [75]:
df_pr_threshold

Unnamed: 0_level_0,Projection_slice_ID,Member,WCID,end date,lon,lat,pr_sum,Hyet
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1981-07-30 11:30,1,1,0,1981-07-30 11:30,-5.588207,55.542648,73.842995,"[7.121862495296227e-07, 1.7598015347175533e-06..."
1981-07-30 12:30,1,1,0,1981-07-30 12:30,-5.588207,55.542648,73.989868,"[1.7598015347175533e-06, 6.87827821366227e-07,..."
1981-07-30 13:30,1,1,0,1981-07-30 13:30,-5.588207,55.542648,73.989868,"[6.87827821366227e-07, 0.010808248072862625, 6..."
1981-07-30 14:30,1,1,0,1981-07-30 14:30,-5.588207,55.542648,73.989868,"[0.010808248072862625, 64.64720916748047, 9.18..."
1981-07-30 15:30,1,1,0,1981-07-30 15:30,-5.588207,55.542648,73.979057,"[64.64720916748047, 9.184971809387207, 0.14687..."
...,...,...,...,...,...,...,...,...
1981-07-30 15:30,1,1,0,1981-07-30 15:30,-4.955935,57.357448,129.389130,"[1.1898524689968326e-06, 2.0852899069723208e-0..."
1981-07-30 16:30,1,1,0,1981-07-30 16:30,-4.955935,57.357448,129.389130,"[2.0852899069723208e-06, 1.9266476556367707e-0..."
1981-07-30 17:30,1,1,0,1981-07-30 17:30,-4.955935,57.357448,129.389130,"[1.9266476556367707e-06, 2.924600601196289, 11..."
1981-07-30 18:30,1,1,0,1981-07-30 18:30,-4.955935,57.357448,129.389130,"[2.924600601196289, 110.84355163574219, 15.620..."


### `get_bin_counts`

In [None]:
duration = 6
bins = BINS[duration]
OUTPUT_PATH = f"{SAVE_PATH}/precip_profiles/proj{PROJECTION_ID}/output_mem{member_id}"

filename = os.path.join(OUTPUT_PATH, f"Rainfall_bin_counts_{duration}h_ens{member_id}_proj{PROJECTION_ID}.csv")
cols = ['Projection_slice_ID', 'Member', 'Year', 'Month', 'WCID', 'Bin counts']

total_count = []

for i in range(1, len(bins)):
    df_count = df_pr_threshold[(df_pr_threshold['pr_sum'] > bins[i - 1]) & (df_pr_threshold['pr_sum'] < bins[i])]
    total_count.append(df_count.shape[0])
data_list = [PROJECTION_ID, member_id, year, month, wcid, total_count]
total_count_df = pd.DataFrame([data_list], columns=cols)
# 'check_dir' function
directory = os.path.dirname(filename)
if not os.path.exists(directory):
    os.makedirs(directory)
# 'save' function
if os.path.isfile(filename):
    total_count_df.to_csv(filename, mode='a', header=False, index=False, float_format="%.2f")
else:
    total_count_df.to_csv(filename, mode='a', index=False, float_format="%.2f")

# data_list = None
# total_count = None
# total_count_df = None

# if month == 9:
#     df_prcp_copy = df_pr_threshold
#     df_prcp_copy.insert(0, 'Time', df_prcp_copy.index)
#     start_sept_date = cftime.Datetime360Day(year, month, 15, 0, 30, 0)
#     df_prcp_copy = df_prcp_copy[df_prcp_copy['Time'] <= start_sept_date]
#     for i in range(1, len(bins)):
#         df_count = df_prcp_copy[(df_prcp_copy['pr'] > bins[i - 1]) & (df_prcp_copy['pr'] < bins[i])]
#         total_count.append(df_count.shape[0])
#     data_list = [PROJECTION_ID, member_id, year, month, wcid, total_count]
#     total_count_df = pd.DataFrame([data_list], columns=cols)
#     # 'check_dir' function
#     directory = os.path.dirname(filename)
#     if not os.path.exists(directory):
#         os.makedirs(directory)
#     # 'save' function
#     if os.path.isfile(filename):
#         total_count_df.to_csv(filename, mode='a', header=False, index=False, float_format="%.2f")
#     else:
#         total_count_df.to_csv(filename, mode='a', index=False, float_format="%.2f")

#     data_list = None
#     total_count = None
#     total_count_df = None

2 7
7 13
13 19
19 28
28 40
40 55
55 80
80 115
115 160
160 210
210 260
260 320
320 390


Unnamed: 0_level_0,grid_latitude,grid_longitude,latitude,longitude,month_number,location_id,pr_sum,pr
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1981-06-30 19:30,2.840050,358.130737,55.297592,-5.780537,6,UK_00284N_35813E,0.000035,3.501138e-05
1981-06-30 19:30,2.840050,358.150757,55.298496,-5.745451,6,UK_00284N_35815E,0.002589,2.589190e-03
1981-06-30 19:30,2.840050,358.170746,55.299389,-5.710418,6,UK_00284N_35817E,0.016571,1.657110e-02
1981-06-30 19:30,2.860050,358.130737,55.317571,-5.782135,6,UK_00286N_35813E,0.000221,2.209407e-04
1981-06-30 19:30,2.860050,358.150757,55.318475,-5.747032,6,UK_00286N_35815E,0.013826,1.382616e-02
...,...,...,...,...,...,...,...,...
1981-07-30 23:30,8.062849,360.650757,60.556943,-1.189136,7,UK_00806N_36065E,0.055243,2.637622e-06
1981-07-30 23:30,8.062849,360.670746,60.556575,-1.148880,7,UK_00806N_36067E,0.057612,3.252323e-06
1981-07-30 23:30,8.062849,360.690735,60.556195,-1.108626,7,UK_00806N_36069E,0.070465,2.229595e-06
1981-07-30 23:30,8.062849,360.710754,60.555804,-1.068311,7,UK_00806N_36071E,0.081132,9.994269e-07
