### 1. Update the parameters dictionary

<div class="alert-warning">
Update the next cell.
</div>

In [None]:
# Run configuration; unpacked as keyword arguments into timesync_data_extraction().
config_params = dict(
    project_dir='s3://dev-nlcd-developer/junk3/timesync/',  # S3 location for the project
    plot_file='./PlotList.csv',  # CSV with project_id, plot_id, x, y columns
    region='CU',  # region code -- NOTE(review): valid values are not shown here
    chip_size=[255, 255],  # two-element chip size -- presumably pixels; confirm
)

### AWS authentication setup

Some of the AWS configuration can be simplified by setting environment variables with `%env` or `os.environ`, for example the requester-pays setting used in the next cell.

In [None]:
# Set AWS_REQUEST_PAYER in this kernel's environment so requester-pays S3 access
# is enabled -- presumably read by GDAL/rasterio; confirm.
%env AWS_REQUEST_PAYER=requester

In [None]:
#! aws s3 ls | grep dev

### 2. Import libraries and define functions

Run the following cell, which contains all library imports and locally defined functions for data extraction.

In [None]:
import os
import csv
import time
import random
import itertools
import configparser
from copy import copy
from dataclasses import dataclass
from datetime import datetime as dt
from functools import partial, reduce, wraps
from typing import List, Tuple, Optional, Any, Callable, Iterable

import s3fs
import tqdm
import boto3
import fsspec
import numpy as np
import pandas as pd
import pystac_client
import rasterio as rio
from dask.distributed import as_completed, worker_client, Client
from dask.distributed.client import Future
from fsspec.implementations.local import LocalFileSystem

In [None]:
from ts_process_group import stac_records_for_plot, group_records
from ts_process_group import process_group

In [None]:












def report_status(func: Callable) -> Callable:
    """
    Decorate a plot-processing function so it reports an outcome instead of raising

    The decorated function returns ``(plot, status)``: status is 'complete' when
    ``func`` finishes, otherwise the text of the exception it raised. The return
    value of ``func`` itself is discarded.
    """
    @wraps(func)
    def wrapper(plot: Tuple[Any, ...], *args, **kwargs) -> Tuple[Tuple[Any, ...], str]:
        try:
            func(plot, *args, **kwargs)
        except Exception as error:
            # Convert the failure into a status string rather than propagating it
            outcome = str(error)
        else:
            outcome = 'complete'
        return plot, outcome

    return wrapper


# NOTE: the report_status decorator is currently disabled, so this function
# returns None and lets exceptions propagate to the caller.
#@report_status
def process_plot(plot: Tuple[Any, ...], params: dict) -> None:
    """
    Process an individual plot

    Fetches the STAC records for the plot, groups them, and processes each
    group in turn.
    """
    print('called process_plot')
    records = stac_records_for_plot(plot, params)
    for record_group in group_records(records):
        process_group(record_group, plot, params)





def check_bit(value: int, bit: int) -> bool:
    """
    Report whether the bit at zero-based position ``bit`` is set in ``value``
    """
    mask = 1 << bit
    return bool(value & mask)


def passes_qa_check(qa: int, enable_cloud_filtering=False) -> bool:
    """
    Decide whether a QA value is acceptable

    A value with the fill bit set is always rejected. When cloud filtering is
    enabled, the value must additionally have the clear or the water bit set.
    QA_FILL, QA_CLEAR and QA_WATER are bit positions expected to be defined
    elsewhere in the notebook.
    """
    if check_bit(qa, QA_FILL):
        return False
    if not enable_cloud_filtering:
        return True
    return check_bit(qa, QA_CLEAR) or check_bit(qa, QA_WATER)

In [None]:





def format_plot_data(plot_file: str) -> pd.DataFrame:
    """
    Load the plot list csv, keeping only the identifier and coordinate columns

    Identifiers are read as strings and x/y as integers; any other columns in
    the file are ignored.
    """
    column_types = {'project_id': str, 'plot_id': str, 'x': int, 'y': int}
    return pd.read_csv(plot_file, usecols=list(column_types), dtype=column_types)


def format_log_data(log_file: str) -> pd.DataFrame:
    """
    Load the processing log csv written by previous run(s)

    Only the project_id, plot_id, time and status columns are read, all as
    strings.
    """
    log_columns = ['project_id', 'plot_id', 'time', 'status']
    return pd.read_csv(
        log_file,
        usecols=log_columns,
        dtype={name: str for name in log_columns})






In [None]:
def append_to_csv(entry: list, csv_file: str) -> None:
    """
    Add one row to the end of a csv file, creating the file if needed
    """
    # newline='' lets the csv module control line endings itself
    with open(csv_file, mode='a', newline='') as handle:
        csv.writer(handle).writerow(entry)


def log_plot_status(plot: Tuple[Any, ...], status: str, log_file: str) -> None:
    """
    Record the outcome for one plot in the log file

    A header row is written first when the log file is missing or empty. The
    entry holds the plot's project_id and plot_id, the current local time and
    the status text.
    """
    has_content = os.path.exists(log_file) and os.path.getsize(log_file) > 0
    if not has_content:
        append_to_csv(['project_id', 'plot_id', 'time', 'status'], log_file)

    row = [plot.project_id, plot.plot_id, dt.now(), status]
    append_to_csv(row, log_file)


def data_preparation(plot_file: str, log_file: str) -> Tuple[pd.DataFrame, int, int]:
    """
    Read in the plot geolocation information and prior processing history

    Returns a tuple ``(plots_to_process, n_completed, n_total)``:
    plots_to_process holds the plot rows whose latest logged status is not
    'complete' (with the plot file's columns only), n_completed is the number
    of plots already complete, and n_total is the number of plots in the file.
    """
    # Read in the plot data
    plots_df = format_plot_data(plot_file)

    if os.path.exists(log_file) and (os.path.getsize(log_file) > 0):
        log_df = format_log_data(log_file)

        # Get the most recent status from any previous processing run. The log is
        # append-only, so the last row per plot is the latest. De-duplicate on the
        # same key used for the merge: plot ids may repeat across projects, and
        # de-duplicating on plot_id alone would discard another project's entry.
        key_columns = ['project_id', 'plot_id']
        latest_status = log_df.drop_duplicates(subset=key_columns, keep='last')
        df = plots_df.merge(latest_status, how='left', on=key_columns)

    else:
        # No usable log: add an empty status column so the filters below still work
        df = plots_df.copy().reindex(columns=plots_df.columns.tolist() + ['status'])

    n_total, n_completed = len(df), len(df[df.status == 'complete'])
    plots_to_process = df.loc[df.status != 'complete', plots_df.columns]

    return plots_to_process, n_completed, n_total





def log_file_name(params: dict) -> str:
    """
    Build the log file path: the plot file path with its extension replaced by '.log'
    """
    stem, _extension = os.path.splitext(params['plot_file'])
    return stem + '.log'


def process_on_local(params: dict) -> None:
    """
    Local single-threaded processing

    Plots whose most recent logged status is 'complete' are skipped. Every plot
    attempted in this run has its outcome appended to the log file, so a later
    run can pick up where this one stopped. A failure on one plot is recorded
    as that plot's status and does not stop the remaining plots.
    """
    log_file = log_file_name(params)

    # Get input data
    plots_df, n_completed, n_total = data_preparation(params['plot_file'], log_file)
    if n_completed == n_total:
        print(f'All {n_total} plots processed successfully! Exiting...')
        return

    # Define the processing function
    processing_func = partial(process_plot, params=params)

    # Iterate over the plots
    for plot in tqdm.tqdm(plots_df.itertuples(), desc='Processing plots', initial=n_completed, total=n_total):
        try:
            outcome = processing_func(plot)
        except Exception as error:
            status = str(error)
        else:
            # process_plot returns None when undecorated and (plot, status) when
            # wrapped by report_status; accept either form.
            status = 'complete' if outcome is None else outcome[1]
        log_plot_status(plot, status, log_file)

In [None]:
def aws_setup() -> dict:
    """
    Extra setup for writing to an S3 bucket

    Returns a dict with 'fs', an fsspec S3 filesystem, and 'rio_env', the
    options for a rasterio environment.
    """
    # Non-anonymous, requester-pays access. An explicit key/secret variant
    # (fsspec.filesystem('s3', key=key, secret=secret)) was used previously.
    s3_filesystem = fsspec.filesystem('s3', anon=False, requester_pays=True)

    rasterio_options = {
        'session': rio.session.AWSSession(),
        'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR',
        'GDAL_PAM_ENABLED': 'NO',  # Set to 'YES' to write XML metadata
    }

    return {'fs': s3_filesystem, 'rio_env': rasterio_options}


In [None]:
def timesync_data_extraction(project_dir: str, plot_file: str, region: str, chip_size: List[int]) -> None:
    """
    Run TimeSync data extraction

    Collects the arguments into a parameter dict, adds the storage setup
    function under 'aws_s3', and runs local single-threaded processing.
    """
    # Storage back-ends; the setup function is stored uncalled
    storage = {
        'aws_s3': aws_setup,
    }
    print(storage)

    params = {
        'project_dir': project_dir,
        'plot_file': plot_file,
        'region': region,
        'chip_size': chip_size,
        **storage,
    }

    # Only local processing is wired in at present
    process_on_local(params)

In [None]:
# Local single-threaded run; docker and the cluster will not need dask
timesync_data_extraction(**config_params)

# Dask variant, disabled: timesync_data_extraction() has no `client` parameter,
# and `params` / `client` are not set up in the visible cells, so this call fails.
# timesync_data_extraction(**params, client=client)

In [None]:
# Optional: write the first 11 lines of the full plot list to a smaller file
# (presumably a header plus 10 plots -- confirm).
#! head -11 TxL2Test_PlotList.csv >10lines_PlotList.csv
# Print the plot list file to check its contents.
! cat PlotList.csv