In [0]:
import os
import shutil

# Configuration: paths and batch size are read from notebook widgets.
landing_zone_path = dbutils.widgets.get("landing_zone_path")
raw_data_path = dbutils.widgets.get("raw_data_path")
BATCH_SIZE = int(dbutils.widgets.get("batch_size"))
if BATCH_SIZE < 0:
    # BATCH_SIZE is later used as a slice bound; a negative value would
    # silently select "all but the last N" files instead of a batch of N.
    raise ValueError(f"batch_size must be non-negative, got {BATCH_SIZE}")

display(f"checking for new files in landing zone: {landing_zone_path}")
# Create the raw directory up front so it can be listed even on a first run.
os.makedirs(raw_data_path, exist_ok=True)

# List the regular files in the landing zone (sub-directories are skipped).
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make batch selection deterministic from run to run.
files_in_landing_zone = sorted(
    f
    for f in os.listdir(landing_zone_path)
    if os.path.isfile(os.path.join(landing_zone_path, f))
)

# A file counts as already ingested when an entry with the same name exists
# in the raw directory.
files_in_raw = set(os.listdir(raw_data_path))
new_files = [f for f in files_in_landing_zone if f not in files_in_raw]

# Exit when there is nothing *new*, not merely when the landing zone is empty:
# ingested files are copied rather than removed, so the landing zone stays
# populated after the first run and an emptiness check alone never fires again.
if not new_files:
    display("no new files found in the landing zone.")
    dbutils.notebook.exit("no new files to ingest.")

# Select a batch of files to process.
files_to_ingest = new_files[:BATCH_SIZE]
display(f"found {len(files_in_landing_zone)} files. ingesting a batch of {len(files_to_ingest)}.")

# Copy each selected file into the raw data directory. Files are copied, not
# moved, so they stay in the landing zone for later runs.
for file_name in files_to_ingest:
    source_path = os.path.join(landing_zone_path, file_name)
    destination_path = os.path.join(raw_data_path, file_name)

    display(f"  copying {source_path} to {destination_path}")
    try:
        shutil.copy(source_path, destination_path)
    except shutil.SameFileError:
        # Source and destination are the same file: nothing was written, and
        # deleting the destination would delete the source.
        raise
    except OSError:
        # A failed copy can leave a truncated file behind. Because ingestion
        # state is tracked by file name only, that partial file would be
        # treated as already ingested on the next run, so remove it.
        try:
            os.remove(destination_path)
        except FileNotFoundError:
            pass
        raise

display(f"\ningestion complete. {len(files_to_ingest)} files copied to raw data directory.")
