# In [None]:
# Cell 1: Setup
import sagemaker
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput
import boto3

# Create the SageMaker session first; the region is read from it, while the
# execution role is looked up independently through the SDK helper.
session = sagemaker.Session()
region = session.boto_region_name
role = sagemaker.get_execution_role()

# Echo both values so the notebook output records which role/region were used.
print(f"Using SageMaker role: {role}", f"Using AWS Region: {region}", sep="\n")

# Cell 2: Configuration
# S3 location of the raw news files: the "gdelt/daily/" prefix of the bucket.
raw_news_bucket = "ai-trading-copilot-raw"
s3_input_path = f"s3://{raw_news_bucket}/gdelt/daily/"

# Bounds of the three-year window handed to the processing script as
# YYYY-MM-DD strings; the end bound was the current date when this was written.
date_from, date_to = "2022-10-16", "2025-10-16"

# Echo the effective configuration into the notebook output.
print(
    f"Input S3 Path: {s3_input_path}",
    f"Date Range: {date_from} to {date_to}",
    sep="\n",
)

# Cell 3: Initialize Processor
# Collect the constructor arguments in one mapping so the job settings can be
# read at a glance, then build the processor from it.
_processor_config = dict(
    framework_version="1.2-1",
    role=role,
    instance_type="ml.t3.medium",
    instance_count=1,
    base_job_name="news-to-dynamodb",
    # Hand the notebook's region to the processing script via the environment.
    env={"AWS_REGION": region},
    # Runtime cap of three days, expressed in seconds (86400 s per day).
    max_runtime_in_seconds=3 * 86400,
)
processor = SKLearnProcessor(**_processor_config)

print("Processor initialized.")

# Cell 4: Run Job and Stream Logs
# Submit the processing job and block until it finishes, streaming the
# container logs into the notebook output. Any failure (submission error or a
# job that ends unsuccessfully) is reported and then re-raised so the cell is
# marked as failed and the full traceback stays available.
try:
    processor.run(
        code="processing/news_processor.py",
        # Pass the real prefix as the base. The previous call used an empty
        # base, which makes the generated part start with a hyphen and left a
        # double hyphen ("news-processor-range--...") in the job name. Giving
        # the helper the whole prefix also lets it apply its length cap to the
        # complete name.
        job_name=sagemaker.utils.unique_name_from_base("news-processor-range"),
        # Command-line arguments forwarded to news_processor.py.
        arguments=[
            "--symbols", "AAPL,MSFT,AMZN",
            "--date-from", date_from,
            "--date-to", date_to,
        ],
        inputs=[
            ProcessingInput(
                source=s3_input_path,  # Broad path for SageMaker download
                destination="/opt/ml/processing/input/raw-news",
            )
        ],
        wait=True,  # Wait for job completion
        logs=True,  # Stream logs to the notebook output
    )
except Exception as e:
    # Keep the short, readable message, but do not swallow the error:
    # re-raising preserves the traceback and fails the cell / "Run All".
    print(f"Error submitting or running job: {e}")
    raise
else:
    # Only reached when run() returned without raising.
    print("\n Processing job complete.")