# Data Cleaning: CMIP5
- This notebook contains the data-processing code for the 'norcal' and 'socal' regions.

### Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from numpy import ma
import xarray as xr
import geopandas as gpd
import pandas as pd
# requires cartopy to be installed
import cartopy.feature as cfeature
import cartopy.io.shapereader as shpreader

import cartopy.crs as ccrs # for projection
import cartopy.feature as cfeature # for map features
from cartopy.util import add_cyclic_point
from matplotlib.axes import Axes
from cartopy.mpl.geoaxes import GeoAxes
#from matplotlib.colors import TwoSlopeNorm
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import xesmf as xe 
import sys
import os
import dask

### Processing

In [2]:
# Climate models to process; each name is inserted into the input and
# output file names built by the processing function.
models = [
    "BNU-ESM",
    "CNRM-CM5",
    "CSIRO-Mk3-6-0",
    "CanESM2",
    "GFDL-ESM2G",
    "GFDL-ESM2M",
    "HadGEM2-CC365",
    "HadGEM2-ES365",
    "IPSL-CM5A-LR",
    "IPSL-CM5A-MR",
    "IPSL-CM5B-LR",
    "MIROC-ESM-CHEM",
    "MIROC-ESM",
    "MIROC5",
    "bcc-csm1-1-m",
    "MRI-CGCM3",
    "bcc-csm1-1",
    "inmcm4",
]

In [3]:
# Sanity check: how many models are in the list.
len(models)

18

In [4]:
from dask_jobqueue import SLURMCluster

# Port the dask scheduler listens on. If it is already in use, change it to
# another free port between 7000 and 8000.
scheduler_port = 7662

# Each SLURM job runs 3 single-threaded worker processes that share the
# job's 100GB of memory.
cluster = SLURMCluster(cores=3,
                       processes=3,
                       memory="100GB",
                       walltime="04:00:00",
                       scheduler_options={'host': f'172.22.179.3:{scheduler_port}'})

# Request one worker per model instead of a hard-coded 18, so the cluster
# size follows the `models` list if it is edited.
cluster.scale(len(models))

In [5]:
from dask.distributed import Client

# Connect this notebook session to the SLURM-backed cluster created above.
client = Client(cluster)

# Display the client summary (dashboard link, worker count, memory).
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 18
Total threads: 18,Total memory: 558.72 GiB

0,1
Comm: tcp://172.22.179.3:7662,Workers: 18
Dashboard: /proxy/8787/status,Total threads: 18
Started: Just now,Total memory: 558.72 GiB

0,1
Comm: tcp://172.22.178.80:42230,Total threads: 1
Dashboard: /proxy/40559/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.80:43214,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-g0c8bupq,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-g0c8bupq

0,1
Comm: tcp://172.22.178.80:43151,Total threads: 1
Dashboard: /proxy/33719/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.80:42770,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-we7_pbis,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-we7_pbis

0,1
Comm: tcp://172.22.178.80:35454,Total threads: 1
Dashboard: /proxy/45535/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.80:36929,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-zjzv_8as,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-zjzv_8as

0,1
Comm: tcp://172.22.178.81:44437,Total threads: 1
Dashboard: /proxy/33769/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.81:33367,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-_sn6lsqx,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-_sn6lsqx

0,1
Comm: tcp://172.22.178.81:33045,Total threads: 1
Dashboard: /proxy/44724/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.81:38789,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-24nvluhz,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-24nvluhz

0,1
Comm: tcp://172.22.178.81:45966,Total threads: 1
Dashboard: /proxy/39387/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.81:34445,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-z3svmyfd,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-z3svmyfd

0,1
Comm: tcp://172.22.178.74:33172,Total threads: 1
Dashboard: /proxy/45167/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.74:39705,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-fuc4lud_,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-fuc4lud_

0,1
Comm: tcp://172.22.178.74:36944,Total threads: 1
Dashboard: /proxy/33794/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.74:36792,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-tlg1gysj,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-tlg1gysj

0,1
Comm: tcp://172.22.178.74:45282,Total threads: 1
Dashboard: /proxy/36095/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.74:37653,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-gh7ugu1c,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-gh7ugu1c

0,1
Comm: tcp://172.22.178.84:33223,Total threads: 1
Dashboard: /proxy/40579/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.84:38018,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-w0o8djgx,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-w0o8djgx

0,1
Comm: tcp://172.22.178.84:37516,Total threads: 1
Dashboard: /proxy/44352/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.84:36186,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-whyiwo3u,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-whyiwo3u

0,1
Comm: tcp://172.22.178.84:46136,Total threads: 1
Dashboard: /proxy/41125/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.84:37105,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-nplqvr8p,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-nplqvr8p

0,1
Comm: tcp://172.22.178.69:43056,Total threads: 1
Dashboard: /proxy/42031/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.69:35516,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-kmq457p9,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-kmq457p9

0,1
Comm: tcp://172.22.178.69:43961,Total threads: 1
Dashboard: /proxy/46040/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.69:40139,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-3penjtu2,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-3penjtu2

0,1
Comm: tcp://172.22.178.69:39770,Total threads: 1
Dashboard: /proxy/38014/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.69:33289,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-mo5d35v4,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-mo5d35v4

0,1
Comm: tcp://172.22.178.82:45634,Total threads: 1
Dashboard: /proxy/37660/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.82:44905,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-duh9bf2b,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-duh9bf2b

0,1
Comm: tcp://172.22.178.82:44000,Total threads: 1
Dashboard: /proxy/42008/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.82:44493,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-mu816ci1,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-mu816ci1

0,1
Comm: tcp://172.22.178.82:40726,Total threads: 1
Dashboard: /proxy/36988/status,Memory: 31.04 GiB
Nanny: tcp://172.22.178.82:36564,
Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-srxnaep9,Local directory: /data/keeling/a/ctavila2/tmp/dask-scratch-space/worker-srxnaep9


In [6]:
def _box_window_yearly_mean(file_path, start_day, end_day, lon_max, lon_min, lat_max, lat_min):
    """Return per-year means of one file, restricted to a lon/lat box and a day-of-year window."""
    # The context manager closes the NetCDF file once the result is in memory.
    with xr.open_dataset(file_path) as raw:
        # Put longitudes above 180 on the -180..180 convention so they can be
        # compared with the box bounds, then restore ascending order.
        lon_180 = np.where(raw["lon"] > 180, raw["lon"] - 360, raw["lon"])
        shifted = raw.assign_coords(lon=("lon", lon_180)).sortby("lon")

        # Crop to the box BEFORE any time reduction so only the cells that
        # end up in the output are read and averaged.
        in_box = (
            (shifted.lon >= lon_min)
            & (shifted.lon <= lon_max)
            & (shifted.lat >= lat_min)
            & (shifted.lat <= lat_max)
        )
        boxed = shifted.where(in_box, drop=True)

        # Mask (set to NaN) every day outside the inclusive day-of-year
        # window; the yearly mean below then skips those days.
        day_of_year = boxed["time.dayofyear"]
        in_window = (day_of_year >= start_day) & (day_of_year <= end_day)
        windowed = boxed.where(in_window)

        # .load() guarantees nothing still refers to the file after it closes.
        return windowed.groupby("time.year").mean(dim="time").load()


def model_yearly_averages(model, start_day, end_day, lon_max, lon_min, lat_max, lat_min, region="socal"):
    """Write one model's yearly, box-averaged, seasonal-mean series to NetCDF.

    Reads the historical files (5-year files starting 1975, 1980, ..., 2000
    plus the single-year 2005 file) followed by the rcp45 files (5-year files
    starting 2006, 2011, ..., 2056). For each file the data is cropped to the
    lon/lat box, days outside the day-of-year window are masked, and the
    remaining days are averaged per calendar year. The yearly fields are then
    averaged over ``lat`` and over ``lon`` and written to disk.

    Parameters
    ----------
    model : str
        Model name inserted into the input and output file names.
    start_day, end_day : int
        Inclusive day-of-year window kept in each yearly mean.
    lon_max, lon_min : float
        Inclusive longitude bounds, compared after 360 has been subtracted
        from input longitudes above 180.
    lat_max, lat_min : float
        Inclusive latitude bounds.
    region : str, optional
        Suffix of the output file name. Defaults to ``"socal"``, the value
        that used to be hard-coded, so existing calls write the same file.

    Returns
    -------
    None
        The result is written to a NetCDF file; nothing is returned.
    """
    base_path_hist = "/data/keeling/a/davidcl2/d/MACA/FWI_RHmin/historical/out/comp/macav2metdata_fwi_" + model + "_r1i1p1_historical_"
    base_path_rcp45 = "/data/keeling/a/davidcl2/d/MACA/FWI_RHmin/rcp45/out/macav2metdata_fwi_" + model + "_r1i1p1_rcp45_"

    # Historical: 5-year files starting 1975, 1980, ..., 2000, then 2005 alone.
    file_paths = [f"{base_path_hist}{year}_{year + 4}_CONUS_daily.nc" for year in range(1975, 2004, 5)]
    file_paths.append(f"{base_path_hist}2005_2005_CONUS_daily.nc")
    # Scenario (rcp45): 5-year files starting 2006, 2011, ..., 2056.
    file_paths.extend(f"{base_path_rcp45}{year}_{year + 4}_CONUS_daily.nc" for year in range(2006, 2060, 5))

    yearly_means = [
        _box_window_yearly_mean(path, start_day, end_day, lon_max, lon_min, lat_max, lat_min)
        for path in file_paths
    ]

    concatenated = xr.concat(yearly_means, dim='year')
    # Drop years that have no valid value anywhere in the box (for example
    # a year in which no day fell inside the window).
    with_data = concatenated.dropna(dim='year', how='all')

    # Average over latitude first, then longitude (NaN cells are skipped).
    box_mean = with_data.mean(dim=['lat']).mean(dim=['lon'])

    # NOTE(review): the file name advertises 1979_2060 although the inputs
    # read above start in 1975 -- confirm which is intended.
    output_path = "/data/keeling/a/ctavila2/7_bootstrapping/models_yearly_points/macav2metdata_fwi_" + model + "_r1i1p1_rcp4.5_tmaxrhmin_1979_2060_yearly_points_" + region + ".nc"

    # Save the dataset to a .nc file
    box_mean.to_netcdf(output_path)

    print("iteration " + model)




In [7]:
# Build one lazy task per model in `models`; nothing runs until the tasks
# are passed to dask.compute.
lazy_yearly_averages = dask.delayed(model_yearly_averages)

delayed = []
for model in models:
    out = lazy_yearly_averages(
        model=model,
        start_day=152,
        end_day=304,
        lon_min=-118.5,
        lon_max=-116.5,
        lat_min=34,
        lat_max=36,
    )
    delayed.append(out)



In [8]:
# Inspect the list of pending Delayed tasks built in the previous cell.
delayed

[Delayed('model_yearly_averages-1ff52f2e-d836-4b15-b70d-b60b976a3cc2'),
 Delayed('model_yearly_averages-42858aa8-ce56-4348-9465-c2e9469fa6b3'),
 Delayed('model_yearly_averages-d8331295-bbbd-4c03-bd87-1aa2a358387b'),
 Delayed('model_yearly_averages-f339182f-2f0a-4e8b-8f4a-49356ddccc96'),
 Delayed('model_yearly_averages-66646887-0d1d-44bf-9e87-dc19c7342175'),
 Delayed('model_yearly_averages-677c149d-fe05-4c86-94d7-ee94953557ad'),
 Delayed('model_yearly_averages-09acd091-ccd6-4293-a342-2b940596e16a'),
 Delayed('model_yearly_averages-2fb61568-aabf-4b5a-8309-0771cb27fd18'),
 Delayed('model_yearly_averages-d47c7bbd-df5b-4dc6-b84e-1a0eb8d19e2c'),
 Delayed('model_yearly_averages-479b7913-af5a-47ca-9bdf-170489032236'),
 Delayed('model_yearly_averages-fce04674-fc81-4766-9803-471e4a38f8ca'),
 Delayed('model_yearly_averages-4d92f69f-ae77-4259-842e-02a3104dde8c'),
 Delayed('model_yearly_averages-4d821a19-fe1f-4810-af29-d7a1710af882'),
 Delayed('model_yearly_averages-902a991b-2c2b-4e14-9f7f-9f070e99

In [9]:
# Execute all per-model tasks. No scheduler argument is passed here.
# NOTE(review): this presumably relies on the Client created earlier being
# dask's active scheduler -- confirm.
# model_yearly_averages has no return statement, so `results` only holds
# None values; the useful output is the NetCDF files the function writes.
results = dask.compute(*delayed)