In [0]:

import json
import os
import re
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional

import boto3
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, concat, current_timestamp, input_file_name, lit, regexp_extract
from pyspark.sql.types import ArrayType, DoubleType, LongType, MapType, StringType, StructField, StructType, TimestampType

# ---------------------------------------------------------------------------
# Notebook configuration
# ---------------------------------------------------------------------------

# Unity Catalog names used to build fully qualified table identifiers.
CATALOG_NAME = "tulip_sandbox"
SCHEMA_NAME = "sitewise_test"
METADATA_TABLE = f"{CATALOG_NAME}.{SCHEMA_NAME}.file_metadata"

# S3 location to read from (modify as needed): each value can be overridden
# through the BUCKET_NAME / BASE_PREFIX environment variables.
S3_BUCKET = os.environ.get('BUCKET_NAME', 'hannover-messe-tulip')
BASE_PREFIX = os.environ.get('BASE_PREFIX', 'iot-sitewise/')

# Create the schema up front; the statement is a no-op when it already exists.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{SCHEMA_NAME}")

def get_aws_credentials():
    """Read the AWS key pair from the process environment.

    Returns:
        A ``(access_key_id, secret_access_key)`` tuple taken from the
        ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY`` variables.

    Raises:
        ValueError: if either variable is unset or empty.
    """
    key_id = os.environ.get('AWS_ACCESS_KEY_ID')
    secret = os.environ.get('AWS_SECRET_ACCESS_KEY')

    # Both halves of the key pair must be present and non-empty.
    if key_id and secret:
        return key_id, secret

    raise ValueError("AWS credentials not found in environment variables")

In [0]:
@dataclass
class S3Config:
    """Settings needed to build an S3 client: bucket, key pair, region, prefix.

    The secret access key is excluded from the generated ``__repr__`` so that
    displaying or logging a config object (easy to do by accident in a
    notebook cell) does not write the secret into the output.
    """
    # Name of the S3 bucket.
    bucket: str
    # Public half of the AWS key pair.
    aws_access_key_id: str
    # Secret half of the key pair; repr=False keeps it out of repr()/display output.
    aws_secret_access_key: str = field(repr=False)
    # Default is resolved once, when the class is defined: AWS_REGION or 'us-east-1'.
    region: str = os.getenv('AWS_REGION', 'us-east-1')
    # Key prefix inside the bucket; defaults to the module-level BASE_PREFIX.
    base_prefix: str = BASE_PREFIX

    @classmethod
    def from_env(cls) -> 'S3Config':
        """Build a config from the module-level bucket name and env credentials.

        Returns:
            An ``S3Config`` using ``S3_BUCKET`` and the key pair returned by
            ``get_aws_credentials()``; region and prefix take their defaults.

        Raises:
            ValueError: propagated from ``get_aws_credentials()`` when the
                credentials are missing from the environment.
        """
        access_key, secret_key = get_aws_credentials()
        return cls(
            bucket=S3_BUCKET,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key
        )


def get_s3_client(config: S3Config) -> boto3.client:
    """Build a boto3 S3 client from the key pair and region held in ``config``.

    Args:
        config: supplies ``aws_access_key_id``, ``aws_secret_access_key``
            and ``region``.

    Returns:
        The object returned by ``boto3.client('s3', ...)``.
    """
    # Credentials are passed explicitly, taken from the config object.
    client_kwargs = {
        'aws_access_key_id': config.aws_access_key_id,
        'aws_secret_access_key': config.aws_secret_access_key,
        'region_name': config.region,
    }
    return boto3.client('s3', **client_kwargs)

    

In [0]:

# Load the Avro files matched by `wildcard_uri`, then tag every row with the
# path of the file it was read from so it can be traced back to its source.
# NOTE(review): `wildcard_uri` is not assigned in any cell visible here —
# confirm it is defined before this cell runs on a fresh kernel.
df = spark.read.format("avro").load(wildcard_uri)
df = df.withColumn("file_name", input_file_name())

# Show the loaded rows for inspection.
df.display()


In [0]:
# Append the rows in `df` to the Delta table named by `catalog_table`,
# creating the table on first run. `DeltaTable` comes from the import cell at
# the top of the notebook.
# NOTE(review): `catalog_table` is not assigned in any cell visible here —
# confirm it is defined before this cell runs on a fresh kernel.

# Use the public catalog API for the existence check instead of reaching
# through the private `spark._jsparkSession` JVM handle.
if spark.catalog.tableExists(catalog_table):
    delta_table = DeltaTable.forName(spark, catalog_table)

    # Composite key that identifies a row: series, timestamp and source file.
    merge_condition = """
        tgt.seriesId = src.seriesId AND
        tgt.timeInSeconds = src.timeInSeconds AND
        tgt.file_name = src.file_name
    """

    # Insert-only merge: source rows whose key is already in the target are
    # skipped (there is no matched/update clause); all other rows are inserted.
    (
        delta_table.alias("tgt")
        .merge(df.alias("src"), merge_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )

else:
    # The table does not exist yet, so create it from the current DataFrame.
    df.write.format("delta").mode("overwrite").saveAsTable(catalog_table)


✅ Use this format for the S3 path:

```
s3://bucket-name/path/to/files/
```
For your case, it should be:

```python
s3_path = "s3://bucket-name/sitewise/raw/startYear=*/startMonth=*/startDay=*/*.avro"
```

PySpark's `spark.read.format("avro").load(s3_path)` accepts wildcard (glob) paths like this one.