# Objective
The purpose of this notebook is to create a unified dataset of landfast ice extent for the Alaskan coastline from the Mahoney / InteRFACE data collection as described in the exploratory data analysis notebooks and summary documentation thereof.

The flow here is as follows:

 1. Create a DateTime index that encompasses the entire range of the data and gracefully handle that the atomic units of data here have variable ranges or "durations" on the order of 20 days. Also check that the turnover between calendar years is handled.
 2. Map the existing data collection to this index.
 3. For each item in the existing data collection constrain the valid array values and establish a common and well known spatial reference.
 4. Create a merged raster from Chukchi and Beaufort data (or lack thereof) for each date in the time index.

In [1]:
import os
import shutil
import re
import datetime
import pandas as pd
import rasterio as rio
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool
from tqdm import tqdm
from pathlib import Path
from rasterio.crs import CRS
from rasterio.merge import merge
from pandas import Timestamp
NCORES = 28

The initial step is to copy data from the source (SRC) to a pre-processing (DST) directory. We'll also set a destination directory for the ultimate processed data (MRG). Based on our exploratory work, we know that we just want the `slie` data and so we'll copy these data to our pre-processing directory.

In [2]:
# Register the three working locations as environment variables so that
# shell commands run from the notebook can refer to them as well.
os.environ["SRC"] = "/atlas_scratch/kmredilla/ardac/landfast_seaice/"
os.environ["DST"] = "/atlas_scratch/cparr4/landfast_seaice/mahoney_preprocess"
os.environ["MRG"] = "/atlas_scratch/cparr4/landfast_seaice/mahoney_preprocess/merged"

src_p = Path(os.environ["SRC"])
dst_p = Path(os.environ["DST"])
mrg_p = Path(os.environ["MRG"])

# Create the pre-processing and merged-output directories if they are missing.
dst_p.mkdir(parents=True, exist_ok=True)
mrg_p.mkdir(parents=True, exist_ok=True)

# Every SLIE GeoTIFF under the source tree, and the flat destination path for
# each one: the lower-cased name of the directory two levels up is prefixed
# to the original file name.
slie_fps = list(src_p.rglob("*_slie.tif"))
new_fps = [dst_p / (x.parent.parent.name.lower() + "_" + x.name) for x in slie_fps]

# uncomment the two lines below to actually copy the data over
# for src, dst in zip(slie_fps, new_fps):
#     shutil.copy(src, dst)

Next we'll write a few functions to parse file names and convert day-of-year (DOY) style dates to a more readable and DateTime friendly YYYY-MM-DD format. The file names are of the style `chukchi_r2007_326-350_slie.tif`

In [3]:
def get_doy(fp):
    """Fetch the DOY range from a file name.

    A hyphen-separated range (``DDD-DDD``) is looked for first; if none is
    present, an underscore-separated range (``DDD_DDD``) is accepted instead.
    Because the leading ``.*`` is greedy, the last such range in the name is
    the one returned. The result always uses a hyphen, e.g. ``"326-350"``.

    Args:
        fp: File name as a string, e.g. ``chukchi_r2007_326-350_slie.tif``.

    Returns:
        The DOY range as a ``"DDD-DDD"`` string.

    Raises:
        ValueError: If the name contains neither form of DOY range.
    """
    match = re.match(r'.*([0-3][0-9][0-9]-[0-3][0-9][0-9])', fp)
    if match is None:
        # Fall back to the underscore-separated variant of the range.
        match = re.match(r'.*([0-3][0-9][0-9]_[0-3][0-9][0-9])', fp)
    if match is None:
        raise ValueError(f"No DOY range found in file name: {fp!r}")
    return match.group(1).replace("_", "-")


def get_re_year(fp):
    """Return the four-digit year (first digit 1-3) found in a file name, as an int.

    The leading ``.*`` is greedy, so when several candidates exist the last
    one in the name is used.
    """
    found = re.match(r'.*([1-3][0-9]{3})', fp)
    return int(found.group(1))


def split_doy(doy_range):
    """Turn a ``"start-end"`` DOY range string into an (int, int) tuple."""
    first, last = doy_range.split("-")
    first_doy = int(first)
    last_doy = int(last)
    return first_doy, last_doy


def doy_date_to_YYYYMMDD(year, days):
    """Return the datetime for day-of-year ``days`` (1-based) of ``year``."""
    jan_first = datetime.datetime(year, 1, 1)
    # DOY 1 is January 1st, so the offset from it is one day less.
    offset = datetime.timedelta(days - 1)
    return jan_first + offset

Next we'll construct a lookup where we can see all the date information associated with each file.

In [4]:
# Lookup keyed by file name; each value holds the date information parsed
# from that name. End dates are computed with the start year here, so ranges
# that cross into the next calendar year get a negative duration.
di = {}
for fp in new_fps:
    k = fp.name
    rec = {"start_year": get_re_year(k), "doy_range": get_doy(k)}
    rec["doy_start"], rec["doy_end"] = split_doy(rec["doy_range"])
    rec["dt_start"] = doy_date_to_YYYYMMDD(rec["start_year"], rec["doy_start"])
    rec["dt_end"] = doy_date_to_YYYYMMDD(rec["start_year"], rec["doy_end"])
    rec["dt_duration"] = rec["dt_end"] - rec["dt_start"]
    di[k] = rec

But we need to be aware that negative durations are lurking where we jump to the next calendar year, like in the below example:

In [5]:
# Inspect an entry whose DOY range (349-003) crosses a calendar-year boundary.
di['beaufort_r2000_349-003_slie.tif']

{'start_year': 2000,
 'doy_range': '349-003',
 'doy_start': 349,
 'doy_end': 3,
 'dt_start': datetime.datetime(2000, 12, 14, 0, 0),
 'dt_end': datetime.datetime(2000, 1, 3, 0, 0),
 'dt_duration': datetime.timedelta(days=-346)}

We'll create a function to check for these instances and then bump the year as needed.

In [6]:
def set_end_year(di):
    """Record an ``end_year`` on every entry, correcting year rollovers.

    An entry with a negative ``dt_duration`` ends in the calendar year after
    it starts; for those entries ``dt_end`` and ``dt_duration`` are recomputed
    with the bumped year. Entries are modified in place.
    """
    for entry in di.values():
        if not entry["dt_duration"].days < 0:
            # Start and end fall within the same calendar year.
            entry["end_year"] = entry["start_year"]
            continue

        entry["end_year"] = entry["start_year"] + 1
        entry["dt_end"] = doy_date_to_YYYYMMDD(entry["end_year"], entry["doy_end"])
        entry["dt_duration"] = entry["dt_end"] - entry["dt_start"]


# Updates the entries of `di` in place; nothing is returned.
set_end_year(di)

In [7]:
# Inspect the year-crossing entry (DOY range 349-003) once more.
di['beaufort_r2000_349-003_slie.tif']

{'start_year': 2000,
 'doy_range': '349-003',
 'doy_start': 349,
 'doy_end': 3,
 'dt_start': datetime.datetime(2000, 12, 14, 0, 0),
 'dt_end': datetime.datetime(2001, 1, 3, 0, 0),
 'dt_duration': datetime.timedelta(days=20),
 'end_year': 2001}

Now that we have all the date information we can establish the start and end dates of the entire collection and create a DateTime index that encompasses all the data.

In [11]:
def get_first_dt(di):
    """Return the earliest ``dt_start`` value among the lookup entries."""
    ordered_starts = sorted(entry["dt_start"] for entry in di.values())
    return ordered_starts[0]


def get_last_dt(di):
    """Return the latest ``dt_end`` value among the lookup entries."""
    ordered_ends = sorted(entry["dt_end"] for entry in di.values())
    return ordered_ends[-1]

# Bounds of the whole collection, then a date index from the first start date
# to the last end date (both included).
start = get_first_dt(di)
end = get_last_dt(di)
dt_range = pd.date_range(start, end)

print(start, end, sep="\n")
dt_range

1996-10-17 00:00:00
2008-07-14 00:00:00


DatetimeIndex(['1996-10-17', '1996-10-18', '1996-10-19', '1996-10-20',
               '1996-10-21', '1996-10-22', '1996-10-23', '1996-10-24',
               '1996-10-25', '1996-10-26',
               ...
               '2008-07-05', '2008-07-06', '2008-07-07', '2008-07-08',
               '2008-07-09', '2008-07-10', '2008-07-11', '2008-07-12',
               '2008-07-13', '2008-07-14'],
              dtype='datetime64[ns]', length=4289, freq='D')

Because each file spans many days, a single calendar day can be represented by more than one raster. We need to map date to the representative rasters by checking if the date is in the range represented by the file. We'll want to know how many files match from the Beaufort region, how many from the Chukchi region, and how many match total. We also expect that some calendar days will have no matches at all (e.g., August, which is not in the seasonal ice cycle). We'll create a new lookup keyed by each date in the DateTime Index that stores the above information and the matching file names.

In [13]:
def time_in_range(start, end, x):
    """Tell whether ``x`` lies in the closed interval [start, end].

    When ``start`` is not <= ``end`` the interval is treated as wrapping
    around, so ``x`` matches if it is at or after ``start`` or at or before
    ``end``.
    """
    wraps_around = not (start <= end)
    if wraps_around:
        return start <= x or x <= end
    return start <= x <= end

# Lookup keyed by each date in the index: the files whose date range covers
# that date, plus per-region and total match counts.
dt_di = {}

for dt in dt_range:
    matches = [
        name
        for name in di
        if time_in_range(di[name]["dt_start"], di[name]["dt_end"], dt)
    ]
    beaufort_count = sum(1 for name in matches if "beaufort" in name.lower())
    chukchi_count = sum(1 for name in matches if "chukchi" in name.lower())

    dt_di[dt] = {
        # Dates without any file get a single "no data" placeholder.
        "matching data": matches if len(matches) != 0 else ["no data"],
        "beaufort_count": beaufort_count,
        "chukchi_count": chukchi_count,
        "match_count": chukchi_count + beaufort_count,
    }

We can see how matching files are distributed amongst the days by flipping this to a DataFrame. This isn't strictly necessary, but it is convenient.

In [14]:
# Transpose so that each date becomes a row and each lookup field a column.
df = pd.DataFrame.from_dict(dt_di).T
# Show the five dates with the highest match counts.
df.sort_values("match_count", ascending=False).head(5)

Unnamed: 0,matching data,beaufort_count,chukchi_count,match_count
2007-02-16,"[beaufort_r2007_029-048_slie.tif, beaufort_r20...",3,3,6
2006-03-23,"[beaufort_r2006_058-084_slie.tif, beaufort_r20...",3,3,6
2000-06-19,"[beaufort_r2000_147-171_slie.tif, beaufort_r20...",3,3,6
2006-03-18,"[beaufort_r2006_049-077_slie.tif, beaufort_r20...",3,3,6
2006-03-17,"[beaufort_r2006_049-077_slie.tif, beaufort_r20...",3,3,6


In [15]:
# Show the five dates with the lowest match counts.
df.sort_values("match_count").head(5)

Unnamed: 0,matching data,beaufort_count,chukchi_count,match_count
2002-08-31,[no data],0,0,0
2002-08-09,[no data],0,0,0
2002-08-08,[no data],0,0,0
2002-08-07,[no data],0,0,0
2002-08-06,[no data],0,0,0


Some individual days are represented by as many as six rasters across both regions! We'll return to this DataFrame to handle how to unify the Chukchi and Beaufort regions based on how many rasters from each represent each calendar date, if any.

The next step is to constrain our array values to a known range. We know from our EDA work the value schema in the existing collection is as follows:

 - 0: No landfast sea ice is present for this pixel. This means either water, or sea-ice that is NOT landfast.
 - 255: landfast sea ice is present for this pixel
 - 128: A landmask.
 - 63, 64, other oddball values: NoData

These values can be reduced to a binary set where `0` indicates the absence of landfast sea ice and `1` the presence of it.

We'll define a function that forces this array value mapping and writes a new GeoTIFF.

In [16]:
def constrain_arr_values(args):
    """Write a GeoTIFF in which 255 becomes 1 and every other value becomes 0.

    ``args`` is a ``(fp, profile, out_fp)`` tuple: the input raster path, the
    raster profile used to create the output, and the output path. Only the
    first band is read and written.
    """
    in_fp, out_profile, out_fp = args

    with rio.open(in_fp) as src:
        band = src.read(1)

    # Boolean test for the landfast-ice value, cast back to the input dtype.
    binary = (band == 255).astype(band.dtype)

    with rio.open(out_fp, 'w', **out_profile) as dst:
        dst.write(binary, 1)

At this stage we can also create cohesive spatial references and raster creation profiles. Ultimately we want to create a merged raster as well, so we'll create three new rasters and raster profiles:
1. An empty mask (all zeros) for the Chukchi Region
2. An empty mask (all zeros) for the Beaufort Region
3. An empty mask (all zeros) for the merged region

In [29]:
# Pick one representative file per region by name rather than by list
# position: `new_fps` is ordered by `rglob`, which does not guarantee that a
# Beaufort file comes first or that a Chukchi file comes last.
beauf_sample = next(fp for fp in new_fps if fp.name.startswith("beaufort"))
chuk_sample = next(fp for fp in reversed(new_fps) if fp.name.startswith("chukchi"))

# we want the mask paths for later
both_mask_fp = mrg_p / "both_region_mask.tif"
chuk_mask_fp = mrg_p / "chukchi_mask.tif"
beauf_mask_fp = mrg_p / "beaufort_mask.tif"

# Open each sample once, and keep both open only while the merge needs them
# so that no dataset handle is left dangling afterwards.
with rio.open(beauf_sample) as beauf_src, rio.open(chuk_sample) as chuk_src:
    beauf_profile = beauf_src.profile
    chuk_profile = chuk_src.profile
    # The merged array and transform define the extent covering both regions.
    arr_merge, aff_merge = merge([beauf_src, chuk_src])

# Give every profile the same spatial reference and compression.
beauf_profile["crs"] = CRS.from_epsg(3338)
beauf_profile.update(compress="lzw")

chuk_profile["crs"] = CRS.from_epsg(3338)
chuk_profile.update(compress="lzw")

new_profile = chuk_profile.copy()
new_profile["height"], new_profile["width"] = arr_merge[0].shape
new_profile["transform"] = aff_merge
new_profile["crs"] = CRS.from_epsg(3338)
new_profile.update(compress="lzw")

# Write the all-zero masks, creating each array with the dtype declared by
# the profile it is written with instead of NumPy's float64 default.
with rio.open(both_mask_fp, 'w', **new_profile) as dst:
    dst.write(np.zeros(arr_merge[0].shape, dtype=new_profile["dtype"]), 1)

with rio.open(beauf_mask_fp, 'w', **beauf_profile) as dst:
    dst.write(
        np.zeros(
            (beauf_profile["height"], beauf_profile["width"]),
            dtype=beauf_profile["dtype"],
        ),
        1,
    )

with rio.open(chuk_mask_fp, 'w', **chuk_profile) as dst:
    dst.write(
        np.zeros(
            (chuk_profile["height"], chuk_profile["width"]),
            dtype=chuk_profile["dtype"],
        ),
        1,
    )

Now that we have established cohesive raster profiles, we can generate the function arguments that will be used to consolidate the array values. The fixed output GeoTIFFs will just have an "arrfix_" prefix.

In [18]:
# Pre-processed inputs for each region, matched by file-name prefix.
chuk_fps = [*dst_p.glob("chukchi*")]
beauf_fps = [*dst_p.glob("beaufort*")]

# One (input path, region profile, output path) tuple per file; the output
# sits next to the input with "arrfix_" prepended to its name.
beauf_args = [
    (fp, beauf_profile, fp.with_name("arrfix_" + fp.name)) for fp in beauf_fps
]
chuk_args = [
    (fp, chuk_profile, fp.with_name("arrfix_" + fp.name)) for fp in chuk_fps
]

The following functions will execute the array correction for each region in parallel. This is a significant speedup. However, these functions can sometimes hang or "deadlock" at the end of their execution. Sometimes (not always), this hang results in a few of the fixed GeoTIFFs not being produced.

In [None]:
def do_beaufort():
    """Constrain the array values of every Beaufort GeoTIFF in parallel.

    Runs ``constrain_arr_values`` over ``beauf_args`` on a pool of ``NCORES``
    worker processes and shows a progress bar while results arrive.
    """
    with Pool(NCORES) as pool:
        progress = tqdm(
            pool.imap_unordered(constrain_arr_values, beauf_args),
            total=len(beauf_args),
            desc="Fixing Beaufort array values and writing to new GeoTIFFs...",
        )
        # Consume the iterator so that every task completes; the results are
        # not needed because each task writes its own output file.
        for _ in progress:
            pass
        # Shut down gracefully: close() stops new submissions and join()
        # waits for the workers to exit, rather than killing them with
        # terminate() immediately after close().
        # NOTE(review): confirm this removes the intermittent hang seen with
        # the previous close()/terminate() sequence.
        pool.close()
        pool.join()


do_beaufort()

In [None]:
def do_chukchi():
    """Constrain the array values of every Chukchi GeoTIFF in parallel.

    Runs ``constrain_arr_values`` over ``chuk_args`` on a pool of ``NCORES``
    worker processes and shows a progress bar while results arrive.
    """
    with Pool(NCORES) as pool:
        progress = tqdm(
            pool.imap_unordered(constrain_arr_values, chuk_args),
            total=len(chuk_args),
            desc="Fixing Chukchi array values and writing to new GeoTIFFs...",
        )
        # Consume the iterator so that every task completes; the results are
        # not needed because each task writes its own output file.
        for _ in progress:
            pass
        # Shut down gracefully: close() stops new submissions and join()
        # waits for the workers to exit, rather than killing them with
        # terminate() immediately after close().
        # NOTE(review): confirm this removes the intermittent hang seen with
        # the previous close()/terminate() sequence.
        pool.close()
        pool.join()


do_chukchi()

Verify the correct number of GeoTIFFs was generated. If not, you may need to restart the kernel and try again.

In [20]:
# Compare the number of "arrfix*" files in dst_p with the number of source SLIE files.
len(list(dst_p.glob("arrfix*"))) == len(slie_fps)

True

Now that we have constrained the data values and spatial reference and raster profiles, we can move to creating a daily GeoTIFF that includes all possible data. There are four possible cases of data availability for each date in our DateTime Index.

1. No data is available at all.
2. A single raster exists from only one region.
3. Multiple rasters exist from only one region.
4. Rasters exist from both regions (one or more per region).

We can handle these cases like so:

Case 1: Skip these dates for now, there is no data available. These dates can be brought back into a datacube at a later time if needed.

Case 2: Take the single raster and merge it with the mask for whichever region has no data.

Case 3: Take all the rasters and create a 3D array of values, and filter for the maximum of the time index. Then merge these data with the mask for whichever region has no data.

Case 4: Same as above, but do the stacking and filtering for both regions, and then merge those two outputs together.

We can map these conditions (cases 1 through 4) to our DataFrame for convenience.

In [21]:
# Building blocks for the four data-availability cases.
several_matches = df["match_count"] > 1
region_product = df["chukchi_count"] * df["beaufort_count"]

conditions = [
    df["match_count"] == 0,                    # case 1: nothing available
    df["match_count"] == 1,                    # case 2: a single raster
    several_matches & (region_product == 0),   # case 3: one region only
    several_matches & (region_product != 0),   # case 4: both regions
]
choices = [1, 2, 3, 4]
df["merge_case"] = np.select(conditions, choices)

In [25]:
# First row assigned to merge case 1.
df[df.merge_case == 1].head(1)

Unnamed: 0,matching data,beaufort_count,chukchi_count,match_count,merge_case
1997-07-09,[no data],0,0,0,1


In [26]:
# First row assigned to merge case 2.
df[df.merge_case == 2].head(1)

Unnamed: 0,matching data,beaufort_count,chukchi_count,match_count,merge_case
1996-10-17,[chukchi_r1996_291-309_slie.tif],0,1,1,2


In [27]:
# First row assigned to merge case 3.
df[df.merge_case == 3].head(1)

Unnamed: 0,matching data,beaufort_count,chukchi_count,match_count,merge_case
1996-10-24,"[chukchi_r1996_291-309_slie.tif, chukchi_r1996...",0,2,2,3


In [28]:
# First row assigned to merge case 4.
df[df.merge_case == 4].head(1)

Unnamed: 0,matching data,beaufort_count,chukchi_count,match_count,merge_case
1999-10-08,"[beaufort_r1999_276-297_slie.tif, chukchi_r199...",1,1,2,4


Now that we've defined how to handle each case, we'll create functions to execute that logic. When called, each function will create a single GeoTIFF that encompasses both regions. These functions are set up to take a TimeStamp object as the argument and then query the lookup for data associated with that date.

In [32]:
def use_fixed_one_arr_one_region(k):
    """Handle array-merge case #2: one raster from a single region.

    Merges the value-constrained ("arrfix_") raster matching date ``k`` with
    the all-zero mask of the other region and writes the result to the merged
    output directory.

    Args:
        k: Key into ``dt_di`` (a date from the DateTime index).
    """
    # The region with no data for this date is filled in by its mask.
    if dt_di[k]["chukchi_count"] > dt_di[k]["beaufort_count"]:
        mask_fp = beauf_mask_fp
    else:
        mask_fp = chuk_mask_fp

    fp = dst_p / dt_di[k]["matching data"][0]
    fixed_fp = fp.parent / ("arrfix_" + fp.name)

    # Context managers close both datasets even if the merge raises.
    with rio.open(fixed_fp) as data_src, rio.open(mask_fp) as mask_src:
        out_arr, _ = merge([data_src, mask_src])

    # str(k) is split on the space to keep only its first (date) part.
    out_merge_fp = mrg_p / ("ak_landfast_ice" + str(k).split(' ')[0] + ".tif")
    with rio.open(out_merge_fp, 'w', **new_profile) as dst:
        dst.write(out_arr[0], 1)


def use_fixed_many_arrs_one_region(k):
    """Handle array-merge case #3: several rasters from a single region.

    Stacks every value-constrained ("arrfix_") raster matching date ``k``,
    takes the per-pixel maximum across the stack, writes that to a temporary
    GeoTIFF, then merges it with the all-zero mask of the other region and
    writes the result to the merged output directory.

    Args:
        k: Key into ``dt_di`` (a date from the DateTime index).
    """
    # The region with no data for this date is filled in by its mask; the
    # temporary raster is written with the profile of the region with data.
    if dt_di[k]["chukchi_count"] > dt_di[k]["beaufort_count"]:
        mask_fp = beauf_mask_fp
        data_profile = chuk_profile
    else:
        mask_fp = chuk_mask_fp
        data_profile = beauf_profile

    data_arrs = []
    for src in dt_di[k]["matching data"]:
        fp = dst_p / src
        fixed_fp = fp.parent / ("arrfix_" + fp.name)
        with rio.open(fixed_fp) as data_src:
            data_arrs.append(data_src.read(1))

    # Per-pixel maximum over all rasters covering this date.
    time_max = np.nanmax(np.dstack(data_arrs), axis=2)

    # str(k) is split on the space to keep only its first (date) part.
    date_str = str(k).split(' ')[0]
    time_max_fp = mrg_p / ("tmp_max_of_" + date_str + ".tif")
    with rio.open(time_max_fp, 'w', **data_profile) as dst:
        dst.write(time_max, 1)

    # Context managers close both datasets even if the merge raises.
    with rio.open(time_max_fp) as time_max_data_src, rio.open(mask_fp) as mask_src:
        out_arr, _ = merge([time_max_data_src, mask_src])

    out_merge_fp = mrg_p / ("ak_landfast_ice" + date_str + ".tif")
    with rio.open(out_merge_fp, 'w', **new_profile) as dst:
        dst.write(out_arr[0], 1)
 
    
def _region_raster_for_date(srcs, profile, tmp_prefix, k):
    """Return the path of the one raster representing a region on date ``k``.

    With a single source the path of its "arrfix_" raster is returned as is.
    With several, every one of them is read, the per-pixel maximum is written
    to a temporary GeoTIFF named ``tmp_prefix`` + date, and that path is
    returned.
    """
    fixed_fps = []
    for src in srcs:
        fp = dst_p / src
        fixed_fps.append(fp.parent / ("arrfix_" + fp.name))

    if len(fixed_fps) == 1:
        return fixed_fps[0]

    # Read EVERY raster for the region; reading only the last one would make
    # the maximum below meaningless.
    data_arrs = []
    for fixed_fp in fixed_fps:
        with rio.open(fixed_fp) as data_src:
            data_arrs.append(data_src.read(1))

    time_max = np.nanmax(np.dstack(data_arrs), axis=2)
    # str(k) is split on the space to keep only its first (date) part.
    time_max_fp = mrg_p / (tmp_prefix + str(k).split(' ')[0] + ".tif")
    with rio.open(time_max_fp, 'w', **profile) as dst:
        dst.write(time_max, 1)
    return time_max_fp


def use_fixed_many_arrs_both_regions(k):
    """Handle array-merge case #4: rasters available from both regions.

    Reduces each region to a single raster (the raster itself when there is
    only one, otherwise the per-pixel maximum of all of them), merges the two
    regional rasters and writes the result to the merged output directory.

    Args:
        k: Key into ``dt_di`` (a date from the DateTime index).
    """
    beauf_srcs = [x for x in dt_di[k]["matching data"] if "beauf" in x]
    chuk_srcs = [x for x in dt_di[k]["matching data"] if "chuk" in x]

    beauf_to_merge_src = _region_raster_for_date(
        beauf_srcs, beauf_profile, "tmp_beauf_max_of_", k
    )
    chuk_to_merge_src = _region_raster_for_date(
        chuk_srcs, chuk_profile, "tmp_chuk_max_of_", k
    )

    # then merge chukchi_to_merge and beauf_to_merge
    with rio.open(chuk_to_merge_src) as a, rio.open(beauf_to_merge_src) as b:
        out_arr, _ = merge([a, b])

    out_merge_fp = mrg_p / ("ak_landfast_ice" + str(k).split(' ')[0] + ".tif")
    with rio.open(out_merge_fp, 'w', **new_profile) as dst:
        dst.write(out_arr[0], 1)

Now we'll generate the arguments (timestamps) for use in the above functions.

In [33]:
# Rows of the DataFrame belonging to each merge case that produces output.
merge_case_2 = df.loc[df["merge_case"] == 2]
merge_case_3 = df.loc[df["merge_case"] == 3]
merge_case_4 = df.loc[df["merge_case"] == 4]

# Transposing puts the dates on the columns, so they become the dict keys.
dt_arr_di_case2 = merge_case_2.transpose().to_dict()
dt_arr_di_case3 = merge_case_3.transpose().to_dict()
dt_arr_di_case4 = merge_case_4.transpose().to_dict()

# The merge functions only need the dates themselves.
merge_case2_args = list(dt_arr_di_case2)
merge_case3_args = list(dt_arr_di_case3)
merge_case4_args = list(dt_arr_di_case4)

As before, these multiprocessing functions may hang toward the end. You may need to restart the kernel and try again. This is getting the job done at the moment, but it may be worth trying to use `concurrent.futures.ProcessPoolExecutor` instead or just upgrading to Python 3.10.

In [None]:
def do_merge_case_two():
    """Create the merged GeoTIFF for every merge-case-2 date in parallel.

    Runs ``use_fixed_one_arr_one_region`` over ``merge_case2_args`` on a pool
    of ``NCORES`` worker processes and shows a progress bar.
    """
    with Pool(NCORES) as pool:
        progress = tqdm(
            pool.imap_unordered(use_fixed_one_arr_one_region, merge_case2_args),
            total=len(merge_case2_args),
            desc="Merge single raster/region (case 2) array values and writing to new GeoTIFFs...",
        )
        # Consume the iterator so that every task completes; the results are
        # not needed because each task writes its own output file.
        for _ in progress:
            pass
        # Shut down gracefully: close() stops new submissions and join()
        # waits for the workers to exit, rather than killing them with
        # terminate() immediately after close().
        # NOTE(review): confirm this removes the intermittent hang seen with
        # the previous close()/terminate() sequence.
        pool.close()
        pool.join()


do_merge_case_two()

In [None]:
def do_merge_case_three():
    """Create the merged GeoTIFF for every merge-case-3 date in parallel.

    Runs ``use_fixed_many_arrs_one_region`` over ``merge_case3_args`` on a
    pool of ``NCORES`` worker processes and shows a progress bar.
    """
    with Pool(NCORES) as pool:
        progress = tqdm(
            pool.imap_unordered(use_fixed_many_arrs_one_region, merge_case3_args),
            total=len(merge_case3_args),
            desc="Merge multiple raster / single region (case 3) array values and writing to new GeoTIFFs...",
        )
        # Consume the iterator so that every task completes; the results are
        # not needed because each task writes its own output file.
        for _ in progress:
            pass
        # Shut down gracefully: close() stops new submissions and join()
        # waits for the workers to exit, rather than killing them with
        # terminate() immediately after close().
        # NOTE(review): confirm this removes the intermittent hang seen with
        # the previous close()/terminate() sequence.
        pool.close()
        pool.join()


do_merge_case_three()

In [None]:
def do_merge_case_four():
    """Create the merged GeoTIFF for every merge-case-4 date in parallel.

    Runs ``use_fixed_many_arrs_both_regions`` over ``merge_case4_args`` on a
    pool of ``NCORES`` worker processes and shows a progress bar.
    """
    with Pool(NCORES) as pool:
        progress = tqdm(
            pool.imap_unordered(use_fixed_many_arrs_both_regions, merge_case4_args),
            total=len(merge_case4_args),
            desc="Merge multiple rasters from both regions (case 4) and write to new GeoTIFFs...",
        )
        # Consume the iterator so that every task completes; the results are
        # not needed because each task writes its own output file.
        for _ in progress:
            pass
        # Shut down gracefully: close() stops new submissions and join()
        # waits for the workers to exit, rather than killing them with
        # terminate() immediately after close().
        # NOTE(review): confirm this removes the intermittent hang seen with
        # the previous close()/terminate() sequence.
        pool.close()
        pool.join()


do_merge_case_four()

Verify that all data were created.

In [36]:
# Number of dates whose merge case is anything other than 1.
len(df[df.merge_case != 1])

3492

In [40]:
# Count the files in $MRG whose names start with "ak_landfast".
!ls $MRG/ak_landfast* | wc -l

3492
