In [1]:
import os
import pickle
import pandas as pd
import time as tm
import datetime

import numpy as np
import netCDF4 as nc
import matplotlib.pyplot as plt

import cartopy.crs as ccrs
from shapely.geometry import Point, shape

from joblib import Parallel, delayed


In [2]:
# Read the six pickled region masks (masked_array0.pkl ... masked_array5.pkl)
# from ../masks, keeping them in index order.
# NOTE: pickle.load can execute code embedded in the file, so these mask files
# must come from a trusted source.
masks = []
for x in range(6):
    mask_path = os.path.join('..', 'masks', 'masked_array' + str(x) + '.pkl')
    with open(mask_path, 'rb') as f:
        masks.append(pickle.load(f))

In [3]:
# Open a small index-based subset of the remote dataset and read its lat/lon axes.
# The bounds are strings because they are spliced directly into the URL query.
# NOTE(review): maxy_index is numerically smaller than miny_index; presumably the
# latitude axis is stored north-to-south -- confirm against the dataset.
minx_index, maxx_index = '11800', '13029'
miny_index, maxy_index = '2894', '2325'

firstday = '0'
lastday = '1'

# Index ranges exactly as they appear in the URL query.
lat_range = f"[{maxy_index}:1:{miny_index}]"
lon_range = f"[{minx_index}:1:{maxx_index}]"
day_range = f"[{firstday}:1:{lastday}]"

url = (
    "https://www.oceancolour.org/thredds/dodsC/CCI_ALL-v6.0-1km-DAILY"
    f"?lat{lat_range},lon{lon_range}"
    f",chlor_a{day_range}{lat_range}{lon_range}"
    f",time{day_range}"
)
ds = nc.Dataset(url)

lons = ds.variables['lon'][:]
lats = ds.variables['lat'][:]

In [7]:
# Index bounds of the spatial subset; get_url() reads these as module-level globals.
minx_index, maxx_index = '11800', '13029'
miny_index, maxy_index = '2894', '2325'

def calculate_mean(mask_index, t, step, masks, time):
    """Compute the mean ``chlor_a`` value per time step inside one masked region.

    The function opens its own dataset handle for the window that starts at
    time index ``t`` (through ``get_url``), reads one ``chlor_a`` slice per
    entry of ``time`` and averages the pixels that survive masking. The handle
    is closed before returning, even if reading fails.

    Parameters
    ----------
    mask_index : int
        Position of the region mask inside ``masks``; also written to the
        ``region`` column of the result.
    t : int
        First time index of the window, forwarded to ``get_url``.
    step : int
        Number of time indices in the window, forwarded to ``get_url``.
    masks : sequence
        Objects exposing a boolean ``.mask`` attribute. Pixels where the mask
        is True are excluded. (Assumed to match the shape of one ``chlor_a``
        slice -- TODO confirm.)
    time : sequence
        Time coordinate values of the window; one output row per entry.

    Returns
    -------
    pandas.DataFrame
        Columns ``region``, ``time`` and ``mean``. ``mean`` is NaN for a time
        step in which every pixel is masked out.
    """
    mask = masks[mask_index]
    rows = []

    ds = get_url(t, step)
    try:
        # `day` is the position inside the window, not the absolute index `t`.
        for day in range(len(time)):
            if mask_index == 0:
                # Only the first region reports progress, on a single console line.
                print(f"{day} of {len(time)}", end="\r")
            chlor_a = np.ma.masked_where(mask.mask, ds.variables['chlor_a'][day].data)
            # Discard values outside [0, 10] before averaging.
            chlor_a = np.ma.masked_outside(chlor_a, 0, 10)
            mean = chlor_a.mean()
            if np.ma.is_masked(mean):
                # A fully masked slice yields the `masked` constant, which would
                # turn the column into object dtype and be written to CSV as
                # '--'; store a real missing value instead.
                mean = np.nan
            rows.append({'region': mask_index, 'time': time[day], 'mean': mean})
    finally:
        ds.close()
    print()
    return pd.DataFrame(rows)

def get_url(i, step):
    """Open the remote ``chlor_a`` subset covering ``step`` time indices from ``i``.

    The spatial window is taken from the module-level ``minx_index``,
    ``maxx_index``, ``miny_index`` and ``maxy_index`` strings. Despite the
    name, the return value is the opened ``netCDF4.Dataset``, not the URL.
    """
    # Index ranges exactly as they appear in the URL query; the last time
    # index is inclusive, hence ``step - 1``.
    day_range = f"[{i}:1:{i + (step - 1)}]"
    lat_range = f"[{maxy_index}:1:{miny_index}]"
    lon_range = f"[{minx_index}:1:{maxx_index}]"

    endpoint = "https://www.oceancolour.org/thredds/dodsC/CCI_ALL-v6.0-1km-DAILY"
    query = (
        f"lat{lat_range},lon{lon_range}"
        f",chlor_a{day_range}{lat_range}{lon_range}"
        f",time{day_range}"
    )
    return nc.Dataset(endpoint + "?" + query)

def process_mask_index(step, start=9500, stop=10000):
    """Download ``chlor_a`` in windows of ``step`` time indices and save region means.

    For every window start ``t`` in ``range(start, stop, step)`` the time
    coordinate is read once, ``calculate_mean`` is run for each entry of the
    module-level ``masks`` list on a thread pool, and the concatenated result
    is written to ``../means/mean_chl_{t}_{t + step - 1}.csv``. Wall-clock
    progress messages are printed along the way.

    Parameters
    ----------
    step : int
        Number of time indices per window; must be at least 1.
    start, stop : int, optional
        Half-open range of window start indices. The defaults reproduce the
        range that used to be hard-coded.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.

    Notes
    -----
    Windows are not clipped: when ``step`` does not divide ``stop - start``
    the final window extends past ``stop``.
    """
    if step < 1:
        raise ValueError(f"step must be a positive integer, got {step!r}")

    def clock(timestamp):
        # Wall-clock time of day for a POSIX timestamp.
        return datetime.datetime.fromtimestamp(timestamp).strftime('%H:%M:%S')

    # Create the output directory up front so a long download is not lost to a
    # missing folder when the CSV is written.
    output_dir = os.path.join('..', 'means')
    os.makedirs(output_dir, exist_ok=True)

    for t in range(start, stop, step):
        start_time = tm.time()
        print(f"Start time for t={t}: {clock(start_time)}")

        # This handle is only needed for the time coordinate; every
        # calculate_mean call opens its own dataset, so release it right away.
        ds = get_url(t, step)
        try:
            time = ds.variables['time'][:]
        finally:
            ds.close()

        results = Parallel(n_jobs=-1, backend="threading")(
            delayed(calculate_mean)(i, t, step, masks, time) for i in range(len(masks))
        )

        mid_time = tm.time()
        print(f"finished parallel step: {clock(mid_time)}")

        df = pd.concat(results)

        mid_time = tm.time()
        print(f"finished concatenating results: {clock(mid_time)}")

        output_file = os.path.join(output_dir, f'mean_chl_{t}_{t+(step-1)}.csv')
        df.to_csv(output_file, index=False)

        end_time = tm.time()
        print("finished writing to csv...")
        print(f"End time for t={t}: {clock(end_time)}")

        # A duration is not a timestamp: passing it through fromtimestamp()
        # would add the local UTC offset. Format it arithmetically instead.
        hours, remainder = divmod(int(end_time - start_time), 3600)
        minutes, seconds = divmod(remainder, 60)
        print(f"Elapsed time for t={t}: {hours:02d}:{minutes:02d}:{seconds:02d}")
        print()

In [9]:
# Run the extraction with a window of 9646 - 9500 - 1 = 145 time indices.
# A RuntimeError is reported as a message rather than a traceback so the cell
# still completes; any other exception type propagates.
window = 9646 - 9500 - 1
try:
    process_mask_index(window)
except RuntimeError as err:
    print(f"Error: {err}")

In [10]:
# Inspect the data in a directory of per-window mean CSVs (../means by default)
def inspect_data(directory='../means/'):
    """Combine every ``.csv`` file in ``directory`` into one tidy DataFrame.

    Parameters
    ----------
    directory : str, optional
        Folder holding CSV files with ``region``, ``time`` and ``mean``
        columns. Defaults to ``'../means/'``.

    Returns
    -------
    pandas.DataFrame
        The rows of all files, read in sorted filename order, with:
        ``region`` as a categorical, ``mean`` as float (the ``--`` placeholder
        becomes NaN), ``datetime`` derived from ``time`` interpreted as days
        since 1970-01-01, and ``year`` taken from ``datetime``. The per-file
        row index is kept, so index labels repeat across files.

    Raises
    ------
    FileNotFoundError
        If ``directory`` contains no ``.csv`` files.
    """
    # Sort the names: os.listdir returns them in arbitrary order, which would
    # make the row order differ between runs.
    csv_names = sorted(name for name in os.listdir(directory) if name.endswith(".csv"))
    if not csv_names:
        raise FileNotFoundError(f"No .csv files found in {directory!r}")

    # '--' is how a fully masked mean was serialised; treat it as missing while
    # parsing. Read everything first and concatenate once instead of growing a
    # frame inside the loop.
    frames = [
        pd.read_csv(os.path.join(directory, name), na_values=['--'])
        for name in csv_names
    ]
    df = pd.concat(frames)

    df['region'] = df['region'].astype('category')
    df['datetime'] = pd.to_datetime(df['time'], unit='D', origin='1970-01-01')
    df['year'] = df['datetime'].dt.year
    df['mean'] = df['mean'].astype(float)

    return df
   

# Build the combined DataFrame from the CSVs in ../means and print it as text
means_df = inspect_data('../means')
print(means_df)

      region   time                mean
0          0  10630                  --
1          0  10631                  --
2          0  10632                  --
3          0  10633                  --
4          0  10634                  --
...      ...    ...                 ...
2995       5  19129   2.936336328209569
2996       5  19130  3.1125486965600864
2997       5  19131   2.809801807205972
2998       5  19132   1.908471051384421
2999       5  19133  2.8253495595011326

[57876 rows x 3 columns]
