### CMIP5 Data Processing

In [5]:
import matplotlib.pyplot as plt
import numpy as np
from numpy import ma
import xarray as xr
import geopandas as gpd
import pandas as pd
import dask

# requires cartopy to be installed
import cartopy.feature as cfeature
import cartopy.io.shapereader as shpreader

import cartopy.crs as ccrs # for projection
import cartopy.feature as cfeature # for map features
from cartopy.util import add_cyclic_point
from matplotlib.axes import Axes
from cartopy.mpl.geoaxes import GeoAxes
#from matplotlib.colors import TwoSlopeNorm
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
from collections import Counter
import sys
import os
import time

### Using Dask to process large data

In [2]:
from dask_jobqueue import SLURMCluster

# Each SLURM job hosts one single-threaded worker process with a 200 GB memory
# request and a four-hour wall-clock limit.
cluster = SLURMCluster(
    cores=1,
    processes=1,
    memory="200GB",
    walltime="04:00:00",
    # Scheduler address as host:port. Change the last 4 digits (the port) to
    # something else between 7000-8000.
    scheduler_options={"host": "172.22.179.3:7674"},
)

# Ask for six workers in total.
cluster.scale(6)

In [3]:
# Connect this notebook session to the SLURM-backed Dask cluster created above.
from dask.distributed import Client

client = Client(cluster)

# Last expression of the cell: renders the client/cluster summary
# (dashboard link, worker count, total memory).
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 6
Total threads: 6,Total memory: 1.09 TiB

0,1
Comm: tcp://172.22.179.3:7674,Workers: 6
Dashboard: /proxy/8787/status,Total threads: 6
Started: Just now,Total memory: 1.09 TiB

0,1
Comm: tcp://172.22.179.129:39921,Total threads: 1
Dashboard: /proxy/41152/status,Memory: 186.26 GiB
Nanny: tcp://172.22.179.129:44340,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-qu2tc4zp,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-qu2tc4zp

0,1
Comm: tcp://172.22.179.126:38293,Total threads: 1
Dashboard: /proxy/34922/status,Memory: 186.26 GiB
Nanny: tcp://172.22.179.126:43853,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-lmcv8ujn,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-lmcv8ujn

0,1
Comm: tcp://172.22.179.128:39566,Total threads: 1
Dashboard: /proxy/34563/status,Memory: 186.26 GiB
Nanny: tcp://172.22.179.128:40662,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-vb3eyaab,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-vb3eyaab

0,1
Comm: tcp://172.22.178.78:39183,Total threads: 1
Dashboard: /proxy/44304/status,Memory: 186.26 GiB
Nanny: tcp://172.22.178.78:34832,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-v2_5qaj6,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-v2_5qaj6

0,1
Comm: tcp://172.22.179.127:44060,Total threads: 1
Dashboard: /proxy/38237/status,Memory: 186.26 GiB
Nanny: tcp://172.22.179.127:39512,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-l5rjok8m,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-l5rjok8m

0,1
Comm: tcp://172.22.178.77:45849,Total threads: 1
Dashboard: /proxy/33109/status,Memory: 186.26 GiB
Nanny: tcp://172.22.178.77:36009,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-pks4i008,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-pks4i008


In [4]:
# Model names to process. Each name is inserted into the MACA FWI input file
# names and into the output file name by coarsened_all.
models = ["BNU-ESM", "CNRM-CM5", "CSIRO-Mk3-6-0", "CanESM2", "GFDL-ESM2G", "GFDL-ESM2M", "HadGEM2-CC365", "HadGEM2-ES365", "IPSL-CM5A-LR",
         "IPSL-CM5A-MR", "IPSL-CM5B-LR", "MIROC-ESM-CHEM", "MIROC-ESM", "MIROC5", "bcc-csm1-1-m", "MRI-CGCM3", "bcc-csm1-1", "inmcm4"]

In [5]:
# Reference timestamp for the elapsed-time prints inside coarsened_all
# (seconds are measured from the moment this cell is executed).
start_time = time.time()


def _maca_input_paths(model):
    """Return the input file paths for ``model`` in chronological order.

    The list covers the historical run (5-year files 1975-1979 ... 2000-2004
    plus the single-year 2005 file) followed by the rcp85 run (5-year files
    2006-2010 ... 2021-2025).
    """
    base_path_H = "/data/keeling/a/davidcl2/d/MACA/FWI_RHmin/historical/out/comp/macav2metdata_fwi_" + model + "_r1i1p1_historical_"
    base_path_85 = "/data/keeling/a/davidcl2/d/MACA/FWI_RHmin/rcp85/out/macav2metdata_fwi_" + model + "_r1i1p1_rcp85_"

    paths = [f"{base_path_H}{year}_{year + 4}_CONUS_daily.nc" for year in range(1975, 2004, 5)]
    paths.append(f"{base_path_H}2005_2005_CONUS_daily.nc")
    paths.extend(f"{base_path_85}{year}_{year + 4}_CONUS_daily.nc" for year in range(2006, 2025, 5))
    return paths


def coarsened_all(model, start_day, end_day, lon_max, lon_min, lat_max, lat_min):
    """Subset one model's daily record in time and space and save it to NetCDF.

    Steps:
      1. Open and concatenate the historical and rcp85 files along ``time``.
      2. Keep 1979-01-01 through 2022-10-31.
      3. Keep only days whose day-of-year lies in ``[start_day, end_day]``
         (both inclusive), then drop time steps that contain only NaN.
      4. Re-label longitudes above 180 as ``lon - 360`` and sort by longitude.
      5. Keep the grid cells inside the lon/lat box and write the result.

    Parameters
    ----------
    model : str
        Model name as it appears in the input file names.
    start_day, end_day : int
        Inclusive day-of-year bounds of the season to keep.
    lon_max, lon_min : float
        Longitude bounds, compared against longitudes after the ``lon - 360``
        re-labelling (note the order: max first, then min).
    lat_max, lat_min : float
        Latitude bounds (same order: max first, then min).

    Returns
    -------
    None
        The result is written to a fixed output path derived from ``model``.
        The file name always ends in ``_oregon.nc`` regardless of the box
        passed in.

    Notes
    -----
    Elapsed seconds since ``start_time`` are printed at three checkpoints:
    on entry, after concatenation, and after the seasonal selection.
    """
    print(time.time() - start_time)

    datasets = []
    try:
        for file_path in _maca_input_paths(model):
            datasets.append(xr.open_dataset(file_path))

        concatenated_ds = xr.concat(datasets, dim='time')
        print(time.time() - start_time)

        ds_1979_2022 = concatenated_ds.sel(time=slice('1979-01-01', '2022-10-31'))

        # Day-of-year is a property of each timestamp, so the season can be
        # selected with one boolean index on the time axis; no per-year
        # groupby and no NaN-masking of the rest of the record is needed.
        day_of_year = ds_1979_2022['time'].dt.dayofyear
        in_season = (day_of_year >= start_day) & (day_of_year <= end_day)
        selected_ds = ds_1979_2022.isel(time=in_season.values)

        # Drop time steps that hold no data at all.
        selected_ds = selected_ds.dropna(dim='time', how='all')
        print(time.time() - start_time)

        # Convert longitudes above 180 to negative values so that they can be
        # compared with negative bounds, then restore ascending order.
        wrapped_lon = np.where(selected_ds["lon"] > 180, selected_ds["lon"] - 360, selected_ds["lon"])
        selected_ds = selected_ds.assign_coords(lon=("lon", wrapped_lon)).sortby("lon")

        inside_box = (
            (selected_ds.lon >= lon_min) & (selected_ds.lon <= lon_max)
            & (selected_ds.lat >= lat_min) & (selected_ds.lat <= lat_max)
        )
        region_ds = selected_ds.where(inside_box, drop=True)

        output_path = "/data/keeling/a/ctavila2/7_bootstrapping/models_tmaxrhmin_smaller/macav2metdata_fwi_" + model + "_r1i1p1_rcp8.5_tmaxrhmin_1979_2022_CONUS_daily_DASK_oregon.nc"

        # Save the dataset to a .nc file
        region_ds.to_netcdf(output_path)
    finally:
        # Release the file handles even if a later step failed.
        for ds in datasets:
            ds.close()

    print("iteration " + model)
    





In [8]:
# Bounding box of the study region. Longitudes are negative, matching the
# lon - 360 re-labelling that coarsened_all applies before comparing.
# NOTE(review): units are presumably decimal degrees - confirm.
lon_max = -114.016667
lon_min = -124.766667
lat_min = 32.025
lat_max = 50

In [9]:
# Build one lazy task per model (all entries of `models`); nothing is executed
# until the tasks are passed to dask.compute.
# Day-of-year 152-304 is 1 June - 31 October in a non-leap year.
# The bounding box comes from the constants defined in the previous cell so
# that it is specified in exactly one place.
delayed = []
for model in models:
    out = dask.delayed(coarsened_all)(
        model=model,
        start_day=152,
        end_day=304,
        lon_max=lon_max,
        lon_min=lon_min,
        lat_max=lat_max,
        lat_min=lat_min,
    )
    delayed.append(out)

In [10]:
# Show the list of lazy Delayed tasks (one per model); they have not run yet.
delayed

[Delayed('coarsened_all-9ce16bd1-ff3b-4a73-9a93-cd53b477f50b'),
 Delayed('coarsened_all-2976797f-ffa7-4091-9059-ea86128f6900'),
 Delayed('coarsened_all-fea91c89-55ca-4a32-9485-21b24ea90c72'),
 Delayed('coarsened_all-4d99c506-2f29-43c3-ba4f-91e479fb15a9'),
 Delayed('coarsened_all-2d86a4c0-fedd-4b7f-b449-3e2c28c5352a'),
 Delayed('coarsened_all-e7455787-8524-42a9-85f8-e295d386f573'),
 Delayed('coarsened_all-7f66a4c5-66c3-4e07-a0ce-4ff0e16ad68e'),
 Delayed('coarsened_all-821e593f-6b27-4cde-9845-f9f7483cd275'),
 Delayed('coarsened_all-9f4b2c80-0787-4f69-abf4-9e1c4a262455'),
 Delayed('coarsened_all-16160c8b-b596-48b4-9ffe-f16fea12d436'),
 Delayed('coarsened_all-e3fd3946-e2c5-4541-9b28-0cd2926c4a6f'),
 Delayed('coarsened_all-04f8dd4c-51aa-468f-b9aa-936d6b7a66dd'),
 Delayed('coarsened_all-0d0858e9-f84a-4a1b-ad32-e1674a94c6a8'),
 Delayed('coarsened_all-17916641-3199-4207-84f9-0b7e6a129af4'),
 Delayed('coarsened_all-0872c973-2882-4559-b89a-34e761389cb2'),
 Delayed('coarsened_all-cd3c4679-eb47-47

In [11]:
# Run every task and block until all of them have finished.
# No scheduler argument is passed here.
# NOTE(review): presumably the work runs on the distributed Client created
# earlier, because creating a Client normally registers it as dask's default
# scheduler - confirm.
results = dask.compute(*delayed)  # one entry per task; coarsened_all has no return statement

### Check filesize

In [22]:
# File whose size is checked below.
# NOTE(review): this name contains "rcp4.5" and ends in "_DASK.nc", whereas
# coarsened_all writes "..._rcp8.5_..._DASK_oregon.nc" and the recorded output
# of this cell reports an "rcp8.5" file - confirm which file is intended.
file_path = "/data/keeling/a/ctavila2/7_bootstrapping/models_tmaxrhmin_smaller/macav2metdata_fwi_HadGEM2-CC365_r1i1p1_rcp4.5_tmaxrhmin_1979_2022_CONUS_daily_DASK.nc"



def get_file_size_gb(file_path):
    """Return the size of ``file_path`` in gigabytes, or ``None`` if it is missing.

    The size is the byte count divided by 1024**3. When the file does not
    exist, a message is printed and ``None`` is returned instead of raising.
    """
    bytes_per_gb = 1024**3
    try:
        n_bytes = os.path.getsize(file_path)
    except FileNotFoundError:
        print(f"The file '{file_path}' does not exist.")
        return None
    return n_bytes / bytes_per_gb

# Example usage: look up the size of the file named by `file_path`.
file_size_gb = get_file_size_gb(file_path)

# get_file_size_gb returns None for a missing file (and has already printed a
# message in that case), so only report when a size was obtained.
if file_size_gb is not None:
    print(f"The size of '{file_path}' is {file_size_gb:.2f} GB.")

The size of '/data/keeling/a/ctavila2/7_bootstrapping/models_tmaxrhmin_smaller/macav2metdata_fwi_HadGEM2-CC365_r1i1p1_rcp8.5_tmaxrhmin_1979_2022_CONUS_daily_DASK.nc' is 1.27 GB.


In [None]:
# Map of average FWI over the western US with county, state and coast outlines.
#
# Every statement is at top level: the original cell indented everything after
# the first line, which raises IndentationError before anything runs.
#
# NOTE(review): `spatial_aggregate` is not defined anywhere in the visible
# notebook; it must be a dataset with an `FWI` variable and has to be created
# before this cell is run.

# County outlines from the local shapefile.
reader = shpreader.Reader('/data/keeling/a/ctavila2/2_maps/county_shapefile/countyl010g.shp')
counties = list(reader.geometries())
COUNTIES = cfeature.ShapelyFeature(counties, ccrs.PlateCarree())

# Ocean polygons, drawn last in white to hide values plotted over the sea.
shapefile_path = '/data/keeling/a/ctavila2/4_Model_Trendline/shape/ne_10m_ocean.shp'
gdf_ocean = gpd.read_file(shapefile_path)

# State-line feature in red; it is defined here but not added to the axes below.
scale = '110m'
states110 = cfeature.NaturalEarthFeature(
    category='cultural',
    name='admin_1_states_provinces_lines',
    scale=scale,
    facecolor='none',
    edgecolor='r')

# Only the last of the original three colormap assignments took effect.
cmap = 'coolwarm'

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': ccrs.PlateCarree()})
spatial_aggregate.FWI.plot(ax=ax, cmap=cmap, vmin=0, vmax=65)

# Set the extent and add other map features as needed
ax.set_extent([-125, -113, 30, 45], crs=ccrs.PlateCarree())
ax.add_feature(cfeature.NaturalEarthFeature('cultural', 'admin_1_states_provinces_lines', '110m', edgecolor='gray', facecolor='none'))
ax.coastlines()
ax.add_feature(cfeature.LAKES)
ax.add_feature(COUNTIES, linewidth=0.8, alpha=0.5, facecolor='none', edgecolor='black')
ax.add_feature(cfeature.BORDERS, linewidth=2, edgecolor='white')

gl = ax.gridlines(draw_labels=True, color='black', alpha=0.5, linestyle='--')
gl.xlabel_style = {'size': 10, 'color': 'black'}
gl.ylabel_style = {'size': 10, 'color': 'black'}

gdf_ocean.plot(ax=ax, color='white', edgecolor='white', linewidth=1.0)

# Title and save through the explicit axes/figure objects so the result does
# not depend on which axes pyplot currently treats as "current".
# NOTE(review): the title and file name hard-code the model (GFDL-ESM2G) and
# the period "1979 to 2010" - confirm they match `spatial_aggregate`.
ax.set_title('CMIP5 GFDL-ESM2G Average FWI', fontsize=16)
fig.savefig('/data/keeling/a/ctavila2/5_Model_Data_FWI/MACA gridmet CMIP5/CMIP5_GFDL-ESM2G Average FWI 1979 to 2010 mxtmp rhavg')
    