In [1]:
import os
import pickle

from joblib import Parallel, delayed

import dl_helper as helper

## Load Regional Masks

In [None]:
# YOU-DO: pick the regional group whose mask pickles should be used.
# The pickles are looked up in ../masks/group_<group>/ relative to this notebook.
group = '7'
pkl_folder = 'group_' + group

path_to_pkls_folder = os.path.join(os.pardir, 'masks', pkl_folder)

In [None]:
# Split the .pkl files in path_to_pkls_folder into region masks ('masked*.pkl')
# and bounding-box files ('bounding*.pkl').
mask_filenames = [
    name
    for name in os.listdir(path_to_pkls_folder)
    if name.startswith('masked') and name.endswith('.pkl')
]
bounding_file = [
    name
    for name in os.listdir(path_to_pkls_folder)
    if name.startswith('bounding') and name.endswith('.pkl')
]
print("Masks: \t\t", mask_filenames)
print("Bounding box:\t", bounding_file)

# Only the first bounding-box file is used later on, hence the reminder.
print("\nNote: There should only be one bounding box file. If there are more, please organize your folder into groups.")

In [None]:
# NOTE: unpickling can execute arbitrary code -- only load pickles you created yourself.
# Each mask is keyed by the text after the last '_' of the file stem
# (e.g. 'masked_array_region_034.pkl' -> '034').
masked_arrays = {}
for file in mask_filenames:
    region_key = file.split('.')[-2].split('_')[-1]
    # `with` closes the file handle as soon as the pickle has been read.
    with open(os.path.join(path_to_pkls_folder, file), 'rb') as pkl:
        masked_arrays[region_key] = pickle.load(pkl)

# Only the first bounding-box file found in the folder is used.
with open(os.path.join(path_to_pkls_folder, bounding_file[0]), 'rb') as pkl:
    bounding_box = pickle.load(pkl)

In [None]:
# Ask the project helper for the lons/lats that go with the bounding box
# (presumably the longitude/latitude coordinates of the boxed grid -- see dl_helper).
lons, lats = helper.get_lons_lats(bounding_box)

# Check that everything is in order

In [None]:
# Hand the loaded masks and their coordinates to the project helper for a visual sanity check.
helper.visualize_masks(masked_arrays, lons, lats)

# Define your staging area and destination folder

Data is first written to a temporary (staging) location and is moved to the destination folder once writing is complete.

Don't use a cloud-synced drive (e.g. iCloud Drive) for the temp files.

In [None]:
# Preview the folder name that the next cell uses as the destination for this group.
print(f'chla_data_group_{group}')

In [None]:
destination_foldername = f'chla_data_group_{group}'

# Final home of the finished files; adjust the root to your own machine.
destination_path = os.path.join('/Volumes/Seagate 5TB/OceanColour Data/', 'regional_chla_data', destination_foldername)
# exist_ok=True replaces the separate exists() check, so there is no gap
# between checking for the folder and creating it.
os.makedirs(destination_path, exist_ok=True)

# Don't use an iCloud Drive folder for staging :(
temp_path = '/Users/tara/MacDocuments/MLML/data/staging'
os.makedirs(temp_path, exist_ok=True)

In [None]:
# Ask the helper which batches still have to be produced for this destination.
step, batches_remaining = helper.get_steps_remaining(destination_path)
print("Batches remaining: ", batches_remaining)
print("Count: ", len(batches_remaining))

# Up to five attempts. After a failure the list of outstanding batches is
# refreshed, so the next attempt is built from the refreshed list.
for attempt in range(5):
    try:
        Parallel(n_jobs=-1)(
            delayed(helper.process_batch)(
                batch, step, bounding_box, lons, lats, destination_path, temp_path
            )
            for batch in batches_remaining
        )
    except Exception as err:
        print(f"Attempt {attempt+1} failed with error: {err}")
        step, batches_remaining = helper.get_steps_remaining(destination_path)
    else:
        # The whole parallel run finished without raising: stop retrying.
        break
else:
    print("Operation failed after 5 attempts. Quitting.")


# Final status report based on a fresh look at the destination folder.
steps, batches_remaining = helper.get_steps_remaining(destination_path)
if len(batches_remaining) > 0:
    print(f"{len(batches_remaining)} batches are still pending.")
    print("Please run the cell again to process the remaining batches.")
else:
    print("All batches have been processed.")

In [7]:
def download_region(i, destination_root='/Volumes/Seagate 5TB/OceanColour Data/',
                    temp_path='/Users/tara/MacDocuments/MLML/data/staging',
                    max_attempts=5):
    """Run ``helper.process_batch`` for every outstanding batch of one regional group.

    The group's bounding box is read from ``../masks/group_<i>/bounding*.pkl``
    and results go to ``<destination_root>/regional_chla_data/chla_data_group_<i>``.
    Progress and failures are reported with ``print``; nothing is returned.

    Parameters
    ----------
    i : int or str
        Regional group identifier (converted with ``str``).
    destination_root : str, optional
        Root folder under which ``regional_chla_data/chla_data_group_<i>`` is created.
    temp_path : str, optional
        Staging folder handed to ``helper.process_batch``. Don't use an
        iCloud Drive folder here.
    max_attempts : int, optional
        How many times the parallel run is attempted before giving up.

    Raises
    ------
    IndexError
        If the group folder contains no ``bounding*.pkl`` file.
    """
    group = str(i)
    path_to_pkls_folder = os.path.join('..', 'masks', f'group_{group}')

    # Locate the bounding-box pickle; only the first match is used.
    bounding_file = [f for f in os.listdir(path_to_pkls_folder) if f.startswith('bounding') and f.endswith('.pkl')]
    print("Bounding box:\t", bounding_file)

    # NOTE: unpickling can execute arbitrary code -- only load pickles you created yourself.
    # `with` makes sure the file handle is closed after loading.
    with open(os.path.join(path_to_pkls_folder, bounding_file[0]), 'rb') as pkl:
        bounding_box = pickle.load(pkl)
    lons, lats = helper.get_lons_lats(bounding_box)

    destination_path = os.path.join(destination_root, 'regional_chla_data', f'chla_data_group_{group}')
    # exist_ok=True avoids the check-then-create gap of a separate exists() test.
    os.makedirs(destination_path, exist_ok=True)
    os.makedirs(temp_path, exist_ok=True)

    step, batches_remaining = helper.get_steps_remaining(destination_path)
    print("Batches remaining: ", batches_remaining)
    print("Count: ", len(batches_remaining))

    for attempt in range(max_attempts):
        try:
            # `batch` (not `i`) so the loop variable cannot be confused with the group id.
            Parallel(n_jobs=-1)(
                delayed(helper.process_batch)(batch, step, bounding_box, lons, lats, destination_path, temp_path)
                for batch in batches_remaining
            )
            break  # If the operation succeeds, break the loop
        except Exception as e:
            print(f"Attempt {attempt+1} failed with error: {e}")
            # Refresh the outstanding batches before the next attempt.
            step, batches_remaining = helper.get_steps_remaining(destination_path)
    else:
        print(f"Operation failed after {max_attempts} attempts. Quitting.")

    # Final status report based on a fresh look at the destination folder.
    _, batches_remaining = helper.get_steps_remaining(destination_path)
    if len(batches_remaining) == 0:
        print("All batches have been processed.")
    else:
        print(f"{len(batches_remaining)} batches are still pending.")
        print("Please run the cell again to process the remaining batches.")

    

In [10]:
# Process the regional groups one after another; range(1, 14) covers groups 1 through 13.
for i in range(1, 14):
    print(f"Beginning regional group {i}...")
    download_region(i)

Beginning group 1...
Masks: 		 ['masked_array_region_034.pkl', 'masked_array_region_000.pkl', 'masked_array_region_001.pkl']
Bounding box:	 ['bounding_box.pkl']
Batches remaining:  []
Count:  0
All batches have been processed.
Beginning group 2...
Masks: 		 ['masked_array_region_003.pkl', 'masked_array_region_002.pkl']
Bounding box:	 ['bounding_box.pkl']
Batches remaining:  []
Count:  0
All batches have been processed.
Beginning group 3...
Masks: 		 ['masked_array_region_006.pkl', 'masked_array_region_005.pkl', 'masked_array_region_004.pkl']
Bounding box:	 ['bounding_box.pkl']
Batches remaining:  []
Count:  0
All batches have been processed.
Beginning group 4...
Masks: 		 ['masked_array_region_009.pkl', 'masked_array_region_008.pkl', 'masked_array_region_007.pkl', 'masked_array_region_010.pkl']
Bounding box:	 ['bounding_box.pkl']
Batches remaining:  []
Count:  0
All batches have been processed.
Beginning group 5...
Masks: 		 ['masked_array_region_012.pkl', 'masked_array_region_011.pkl'

# debug


In [None]:
import os
import re
import shutil

import time as tm

import numpy as np

import pandas as pd
import netCDF4 as nc

import matplotlib.pyplot as plt
import matplotlib.patheffects as pe


import cartopy.crs as ccrs

from joblib import Parallel, delayed

import time as tm

def get_ds(firstday, lastday, bounding_box):
    """Open the remote CCI daily 1 km chlorophyll-a subset for a time and index window.

    Parameters
    ----------
    firstday, lastday : int or str
        First and last time index of the request (both ends are written
        into the ``[first:1:last]`` range of the URL).
    bounding_box : sequence of four ints or strs
        ``(minx_index, maxx_index, miny_index, maxy_index)`` grid indices.
        The latitude range is requested as ``[maxy_index:1:miny_index]``.

    Returns
    -------
    netCDF4.Dataset
        Dataset opened on the constructed THREDDS (dodsC) URL with the
        ``lat``, ``lon``, ``chlor_a`` and ``time`` variables.
    """
    minx_index, maxx_index, miny_index, maxy_index = bounding_box

    # f-strings accept both str and int indices; plain '+' concatenation
    # would raise TypeError for integers.
    time_range = f"[{firstday}:1:{lastday}]"
    lat_range = f"[{maxy_index}:1:{miny_index}]"
    lon_range = f"[{minx_index}:1:{maxx_index}]"

    url = (
        "https://www.oceancolour.org/thredds/dodsC/CCI_ALL-v6.0-1km-DAILY"
        f"?lat{lat_range},lon{lon_range},"
        f"chlor_a{time_range}{lat_range}{lon_range},"
        f"time{time_range}"
    )
    return nc.Dataset(url)

def get_mmddyy(timesinceepoch):
    """Turn a number of days since the Unix epoch into a UTC ``time.struct_time``.

    The value is scaled by 86400 (seconds per day) and passed to ``gmtime``.
    Despite the name, a ``struct_time`` is returned, not a formatted date string.
    """
    seconds_per_day = 86400
    epoch_seconds = timesinceepoch * seconds_per_day
    return tm.gmtime(epoch_seconds)

def run_process_batch(i, step, bounding_box, lons, lats, destination_path, staging_path):
    """Fetch one batch with ``get_ds`` and store it with ``save_batch``.

    The batch spans time indices ``i`` through ``i + step - 1`` (inclusive);
    both bounds are passed on as strings, and the first one also names the
    output file.
    """
    first_index, last_index = i, i + step - 1
    firstday, lastday = str(first_index), str(last_index)
    dataset = get_ds(firstday, lastday, bounding_box)
    save_batch(dataset, firstday, lons, lats, destination_path, staging_path)

def save_batch(ds, firstday, lons, lats, destination_path, staging_path):
    """Write one batch to a NetCDF file in the staging folder, then move it to the destination.

    The file is named ``chlor_a_data_<firstday>.nc`` and holds the ``time``,
    ``lat``, ``lon`` and ``chlor_a`` variables (all stored as ``f8``).

    Parameters
    ----------
    ds : dataset-like
        Open source dataset exposing ``variables['time']``,
        ``variables['chlor_a']`` and ``close()``. It is always closed by
        this function, even when reading or writing fails.
    firstday : str
        Batch identifier used in the output file name.
    lons, lats : sequence
        Values written to the ``lon`` / ``lat`` variables; their lengths
        define the corresponding dimensions.
    destination_path : str
        Folder the finished file is moved to.
    staging_path : str
        Folder the file is first written to.
    """
    filename = f'chlor_a_data_{firstday}.nc'
    fp = os.path.join(staging_path, filename)

    try:
        time_var = ds.variables['time'][:]

        year = get_mmddyy(time_var[0]).tm_year
        print(f"Working on year {year}...", end="\r")

        # get the current time in seconds
        start_time = tm.time()
        with nc.Dataset(fp, 'w') as f:
            f.createDimension('time', len(time_var))
            f.createDimension('lat', len(lats))
            f.createDimension('lon', len(lons))

            time_var_out = f.createVariable('time', 'f8', ('time',))
            lat_out = f.createVariable('lat', 'f8', ('lat',))
            lon_out = f.createVariable('lon', 'f8', ('lon',))
            chlor_a_out = f.createVariable('chlor_a', 'f8', ('time', 'lat', 'lon'))

            time_var_out[:] = time_var
            chlor_a_out[:] = ds.variables['chlor_a'][:]
            lat_out[:] = lats
            lon_out[:] = lons

        print(f"Writing took {tm.time() - start_time} seconds.")
    finally:
        # Release the source dataset on success and on failure alike.
        ds.close()

    movestart = tm.time()
    # Move only after the file is fully written, so the destination never
    # receives a file that is still being written.
    shutil.move(fp, os.path.join(destination_path, filename))
    print(f"Moving took {tm.time() - movestart} seconds.")

def get_steps_remaining(destination_path, start=0, stop=9600, step=50):
    """List the batch start indices that have no ``.nc`` file in the destination yet.

    A batch counts as completed when ``destination_path`` contains a
    ``.nc`` file whose stem ends in ``_<batch start>`` (for example
    ``chlor_a_data_150.nc``). ``.nc`` files without an integer suffix
    (such as macOS ``._*.nc`` companion files) are ignored.

    Parameters
    ----------
    destination_path : str
        Folder holding the finished batch files.
    start, stop, step : int, optional
        Define the full set of batch starts as ``range(start, stop, step)``.

    Returns
    -------
    tuple
        ``(step, steps_needed)`` where ``steps_needed`` is the ordered list
        of batch starts still missing.
    """
    completed_steps = set()
    for name in os.listdir(destination_path):
        if not name.endswith('.nc'):
            continue
        suffix = name.split('.')[0].split('_')[-1]
        try:
            completed_steps.add(int(suffix))
        except ValueError:
            # Not a batch file (no integer suffix); skip it instead of crashing.
            continue

    # Set membership keeps this linear in the number of batches.
    steps_needed = [i for i in range(start, stop, step) if i not in completed_steps]

    return step, steps_needed

In [None]:
# Debug variant: process the outstanding batches one at a time with the functions
# defined in this notebook, instead of running them in parallel via joblib.
step, batches_remaining = get_steps_remaining(destination_path)
print("Batches remaining: ", batches_remaining)
print("Count: ", len(batches_remaining))

for i in batches_remaining:
    print(f"Processing batch # {i}...")
    run_process_batch(i, step, bounding_box, lons, lats, destination_path, temp_path) 