In [3]:
import os
import pickle
import pandas as pd
import time as tm
import datetime

import numpy as np
import netCDF4 as nc
import matplotlib.pyplot as plt

import cartopy.crs as ccrs
from shapely.geometry import Point, shape

from joblib import Parallel, delayed


# ATTENTION

**This file has been replaced with a more general pipeline code: _retrive_opendap_gl_chla_means.ipynb_**

**This file is no longer used and will be removed in the future.**

This file calculates the daily mean chlorophyll-a over each regional polygon (mask) and stores these daily means as CSV files: one file per chunk of days, with one row per region and day.  

## Load Regional Masks

In [4]:
# Folder that holds the pickled regional masks, relative to this notebook.
path_to_pkls_folder = os.path.join('..', 'masks')

# Alphabetically sorted names of every .pkl file in that folder (presumably the masks).
pkl_files = sorted(
    name for name in os.listdir(path_to_pkls_folder) if name.endswith('.pkl')
)
pkl_files

[]

In [2]:
# Number of mask files to read; they are expected to be named
# masked_array0.pkl ... masked_array<num_pkls - 1>.pkl inside path_to_pkls_folder.
num_pkls = 6

# Load the masks in index order.
# NOTE(review): pickle.load can execute arbitrary code -- only load mask files from a trusted source.
masks = []
for mask_number in range(num_pkls):
    mask_path = os.path.join(path_to_pkls_folder, f'masked_array{mask_number}.pkl')
    with open(mask_path, 'rb') as mask_file:
        masks.append(pickle.load(mask_file))

In [4]:
# Index bounds of the spatial subset requested from the server.
minx_index, maxx_index, miny_index, maxy_index = '11800', '13029', '2894', '2325'   #Arrigo
# minx_index, maxx_index, miny_index, maxy_index = '11907', '12484', '2076', '1838' #Disko Bay

# A two-step time window is enough here: only the lat/lon axes are read below.
firstday = '0'
lastday = '1'

# OPeNDAP constraint expression: lat, lon, chlor_a[time][lat][lon] and time,
# each sliced with stride 1 between the index bounds above.
url = (
    "https://www.oceancolour.org/thredds/dodsC/CCI_ALL-v6.0-1km-DAILY"
    f"?lat[{maxy_index}:1:{miny_index}],lon[{minx_index}:1:{maxx_index}]"
    f",chlor_a[{firstday}:1:{lastday}][{maxy_index}:1:{miny_index}][{minx_index}:1:{maxx_index}]"
    f",time[{firstday}:1:{lastday}]"
)
ds = nc.Dataset(url)

# Coordinate axes of the subset.
lats = ds.variables['lat'][:]
lons = ds.variables['lon'][:]

In [5]:
# Index bounds of the spatial subset used when building the OPeNDAP request (Arrigo).
minx_index = '11800'
maxx_index = '13029'
miny_index = '2894'
maxy_index = '2325'
# Alternative bounds (Disko Bay):
# minx_index, maxx_index, miny_index, maxy_index = '11907', '12484', '2076', '1838'

def calculate_mean(mask_index, t, step, masks, time, max_chl=50):
    """Compute the per-time-step mean of ``chlor_a`` inside one regional mask.

    A fresh dataset handle is opened through ``get_url(t, step)`` for this call
    and is always closed before returning, so repeated calls do not accumulate
    open remote connections.

    Parameters
    ----------
    mask_index : int
        Position of the region's mask in ``masks``; also stored in the
        ``region`` column of the result.
    t : int
        First time index of the chunk (forwarded to ``get_url``).
    step : int
        Number of time steps in the chunk (forwarded to ``get_url``).
    masks : sequence
        Objects exposing a boolean ``.mask`` attribute; cells where it is True
        are excluded from the mean.
        # presumably numpy masked arrays on the same grid as the subset -- TODO confirm
    time : sequence
        Time values of the chunk; one output row is produced per entry.
    max_chl : float, default 50
        Values outside ``[0, max_chl]`` are masked out before averaging.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``time`` with columns ``region``, ``time``, ``mean``.
    """
    mask = masks[mask_index]
    data = []
    n_steps = len(time)

    ds = get_url(t, step)
    try:
        chlor_a_var = ds.variables['chlor_a']
        # A separate loop name keeps the chunk start ``t`` from being shadowed.
        for day in range(n_steps):

            # Only one worker reports progress so parallel output stays readable.
            if mask_index == 0:
                print(f"{day+1} of {n_steps}", end="\r")

            # Hide everything outside the region, then drop values outside [0, max_chl].
            chlor_a = np.ma.masked_where(mask.mask, chlor_a_var[day].data)
            chlor_a = np.ma.masked_outside(chlor_a, 0, max_chl)
            # NOTE(review): when every cell is masked the mean is presumably the numpy
            # masked constant, which is written to CSV as '--' -- confirm downstream handling.
            mean = chlor_a.mean()
            data.append({'region': mask_index, 'time': time[day], 'mean': mean})
    finally:
        # Release the remote handle even if a read fails part-way through.
        ds.close()
    print()

    return pd.DataFrame(data)

def get_url(i, step): 
    """Open the remote subset covering time indices ``i`` to ``i + step - 1``.

    The spatial bounds come from the module-level ``minx_index``, ``maxx_index``,
    ``miny_index`` and ``maxy_index`` strings.

    Returns the ``nc.Dataset`` opened on the resulting OPeNDAP URL.
    """
    # Slices are written as [start:stride:stop] in the constraint expression.
    day_slice = f"[{i}:1:{i + (step - 1)}]"
    lat_slice = f"[{maxy_index}:1:{miny_index}]"
    lon_slice = f"[{minx_index}:1:{maxx_index}]"
    query = f"lat{lat_slice},lon{lon_slice},chlor_a{day_slice}{lat_slice}{lon_slice},time{day_slice}"
    return nc.Dataset("https://www.oceancolour.org/thredds/dodsC/CCI_ALL-v6.0-1km-DAILY?" + query)

def process_mask_index(start, step, max_chla, destination_folder, stop=10000):
    """Download the data chunk by chunk and write one CSV of regional means per chunk.

    For every chunk start ``t`` in ``range(start, stop, step)`` the time axis is
    fetched, ``calculate_mean`` is run in parallel threads for every mask in the
    module-level ``masks`` list, and the combined result is written to
    ``<destination_folder>/mean_chl_<t>_<t+step-1>.csv``.

    Parameters
    ----------
    start : int
        First time index to process.
    step : int
        Number of time steps per chunk (and per output file).
    max_chla : float
        Upper bound forwarded to ``calculate_mean`` as ``max_chl``.
    destination_folder : str
        Folder for the CSV files; created if it does not exist.
    stop : int, default 10000
        Exclusive upper bound for chunk starts.

    Errors raised while opening or reading the remote dataset are not handled
    here and propagate to the caller.
    """
    # Create the output folder up front so a long download is not lost to a missing directory.
    os.makedirs(destination_folder, exist_ok=True)

    for t in range(start, stop, step):
        start_time = tm.time()
        print(f"Start time for t={t}: {datetime.datetime.fromtimestamp(start_time).strftime('%H:%M:%S')}")

        # Only the time axis is needed here; close the handle as soon as it has been read.
        ds = get_url(t, step)
        try:
            time = ds.variables['time'][:]
        finally:
            ds.close()

        # One task per mask; each task opens its own dataset handle.
        results = Parallel(n_jobs=-1, backend="threading")(
            delayed(calculate_mean)(i, t, step, masks, time, max_chla)
            for i in range(len(masks))
        )

        mid_time = tm.time()
        print(f"finished parallel step: {datetime.datetime.fromtimestamp(mid_time).strftime('%H:%M:%S')}")

        df = pd.concat(results)

        output_file = os.path.join(destination_folder,  f'mean_chl_{t}_{t+(step-1)}.csv')
        df.to_csv(output_file, index=False)

        end_time = tm.time()
        print(f"End time for t={t}: {datetime.datetime.fromtimestamp(end_time).strftime('%H:%M:%S')}")
        print()


In [5]:
# Run configuration: which time indices to process, how many per chunk,
# the chlorophyll cut-off, and where the per-chunk CSV files are written.
start = 9000
step = 500
# step = 9645-9500
max_chla = 100
destination_folder = os.path.join('..', 'means_100')

# A RuntimeError raised during the run is reported instead of propagated.
# NOTE(review): presumably this is how the run ends once a request goes past
# the last available time step -- confirm.
try:
    process_mask_index(start, step, max_chla, destination_folder)
except RuntimeError as err:
    print(f"Error: {err}")

In [None]:
# Inspect the data in the directory ../means
def inspect_data(directory='../means/'):
    """Combine every CSV file in ``directory`` into a single DataFrame.

    Files are read in alphabetical order of their names. The ``mean`` column is
    converted to float, with the placeholder ``'--'`` treated as missing; the
    ``region`` column becomes categorical; ``datetime`` is derived from ``time``
    interpreted as days since 1970-01-01; ``year`` is the year of ``datetime``.

    Parameters
    ----------
    directory : str, default '../means/'
        Folder containing the CSV files to combine.

    Returns
    -------
    pandas.DataFrame
        The concatenated rows with the added ``datetime`` and ``year`` columns.
        Row labels are kept as read from each file, so they may repeat.

    Raises
    ------
    FileNotFoundError
        If ``directory`` contains no ``.csv`` file.
    """
    # Sort the names so the row order does not depend on the file system.
    csv_names = sorted(name for name in os.listdir(directory) if name.endswith(".csv"))
    if not csv_names:
        raise FileNotFoundError(f"No .csv files found in {directory!r}")

    # Read everything first and concatenate once instead of growing a frame in a loop.
    df = pd.concat([pd.read_csv(os.path.join(directory, name)) for name in csv_names])

    df['mean'] = df['mean'].replace('--', np.nan)
    df['region'] = df['region'].astype('category')
    df['datetime'] = pd.to_datetime(df['time'], unit='D', origin='1970-01-01')
    df['year'] = df['datetime'].dt.year
    df['mean'] = df['mean'].astype(float)

    return df
   

# Print the combined DataFrame
# print(inspect_data('../means'))