# Data Acquisition Pipeline
This notebook documents the data setup pipeline using Prefect, orchestrating tasks for downloading, extracting, and sampling files from an AWS S3 bucket.

In [None]:
from prefect import Flow
from appvocai-discover.setup.file.config import FileSetupPipelineConfig
from appvocai-discover.setup.file.download import DownloadFileTask
from appvocai-discover.setup.file.extract import ExtractFileTask
from appvocai-discover.setup.file.sample import SampleFileTask


## Configuration
The FileSetupPipelineConfig class encapsulates configuration parameters for AWS credentials, local file paths, and sampling settings.

In [None]:
# S3 object this run of the pipeline targets; edit these two values to point
# the notebook at a different file.
AWS_FOLDER = "test"
AWS_S3_KEY = "test_file.txt"

# All remaining settings are supplied by FileSetupPipelineConfig itself.
config = FileSetupPipelineConfig(
    aws_folder=AWS_FOLDER,
    aws_s3_key=AWS_S3_KEY,
)

## Task Definitions
### Download Task
Downloads files from AWS S3 based on configured parameters.

In [None]:
# Keyword arguments for the download task, grouped by concern. Every value is
# read from `config`; nothing is hardcoded in this cell.
aws_credentials = dict(
    aws_access_key=config.aws.access_key,
    aws_secret_access_key=config.aws.secret_access_key,
    aws_region_name=config.aws.region_name,
)
s3_object = dict(
    aws_bucket_name=config.aws_file.bucket_name,
    aws_folder=config.aws_file.folder,
    aws_s3_key=config.aws_file.s3_key,
)
local_target = dict(
    local_download_folder=config.local_download_folder,
    local_download_filepath=config.local_download_filepath,
    force=config.force,
)

download_task = DownloadFileTask(**aws_credentials, **s3_object, **local_target)

### Extract Task
Extracts downloaded files to a specified local destination.

In [None]:
# The extract task reads from the local path the download step was configured
# to write to, and shares the same `force` flag.
extract_settings = {
    "source": config.local_download_filepath,
    "extract_destination": config.extract_destination,
    "force": config.force,
}
extract_task = ExtractFileTask(**extract_settings)

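### Sample Task
Samples the extracted files into a separate sample destination, using the fraction (`frac`) set in the configuration.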

In [None]:
# The sample task takes the extraction output location as its input location;
# `frac` and `force` are passed through unchanged from the configuration.
sample_settings = {
    "extract_destination": config.extract_destination,
    "sample_destination": config.sample_destination,
    "frac": config.frac,
    "force": config.force,
}
sample_task = SampleFileTask(**sample_settings)

## Prefect Flow Definition
Defines the Prefect flow `file-setup-pipeline` and adds the download, extract, and sample tasks to it. The flow is only defined here; it is run in the Execution section below.

In [None]:
# Build the flow graph: download -> extract -> sample.
#
# None of the tasks takes another task's result as an argument, so Prefect has
# no data dependency from which to infer ordering. The state dependencies are
# therefore declared explicitly with `upstream_tasks`; without them the three
# tasks are independent nodes and extraction could start before the download
# has finished.
with Flow("file-setup-pipeline") as flow:
    download_result = download_task()
    # Extraction must wait for the downloaded file to exist.
    extract_result = extract_task(upstream_tasks=[download_result])
    # Sampling must wait for the extracted files to exist.
    sample_result = sample_task(upstream_tasks=[extract_result])


## Execution
Run the Prefect flow to execute the data setup tasks in sequence.

In [None]:
# Execute the flow in-process and keep the returned state so it can be
# inspected in later cells; the bare name on the last line displays it as the
# cell output.
flow_state = flow.run()
flow_state