In [None]:
import datetime
import logging
import os
from io import StringIO, BytesIO

import pyspark
from minio import Minio
from minio.error import S3Error
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, unix_timestamp, hour, to_date, row_number, trim
from pyspark.sql.window import Window

In [None]:

# --- Connection settings ---------------------------------------------------
# Every endpoint is declared once here and reused below, so changing a value
# in this cell is enough to repoint both Spark and the MinIO client.
HIVE_URI = 'thrift://hive-metastore:9083'

# SECURITY: credentials are taken from the environment when available.
# The literal fallbacks only keep existing runs working unchanged; they are
# still committed secrets and should be rotated and removed from the notebook.
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY', 'FrmF5fXO0bxpBepjVUSX')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY', '5RgPTIToiUPa16HAWnRv3KcsE7y21Oo3RPA3QXTb')

MINIO_ENDPOINT = "minio:9000"  # host:port, no scheme (the form the Minio client is given)
MINIO_BUCKET = "logs"


## Start Spark Session
# `spark_catalog` is configured as an Iceberg SparkCatalog of type "hive"
# pointing at HIVE_URI, with its warehouse on s3a://warehouse/ served by MinIO.
spark = (
    SparkSession.builder
    .appName("IcebergAsDefaultCatalog")
    .config('spark.jars.packages',
            'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0,'
            'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1,'
            'software.amazon.awssdk:bundle:2.17.178,'
            'software.amazon.awssdk:url-connection-client:2.17.178')
    .config('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalog.spark_catalog.uri", HIVE_URI)
    .config("spark.sql.catalog.spark_catalog.warehouse", "s3a://warehouse/")
    # The S3A connector needs a full URL, so the scheme is prepended here.
    .config("spark.hadoop.fs.s3a.endpoint", f"http://{MINIO_ENDPOINT}")
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.hive.metastore.sasl.enabled", "false")
    .getOrCreate()
)

print("Spark Running")


In [None]:
# Runtime (session-level) Iceberg option, applied after the session exists.
# NOTE(review): presumably required so Spark can read/write Iceberg columns
# typed as timestamp-without-zone — confirm against the Iceberg Spark docs.
spark.conf.set(
    "spark.sql.iceberg.handle-timestamp-without-timezone",
    "true",
)

In [None]:
# In-memory log buffer
log_buffer = StringIO()

# Handler that appends every formatted record to the in-memory buffer.
log_handler = logging.StreamHandler(log_buffer)
log_handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s: %(message)s'))

# Configure logger
logger = logging.getLogger("SilverLoader")
logger.setLevel(logging.INFO)

# logging.getLogger() hands back the same logger object for a given name, so
# re-running this cell would otherwise stack another pair of handlers on top
# of the old ones: each message would be emitted several times and handlers
# bound to previous (now orphaned) buffers would stay attached. Start clean.
while logger.handlers:
    logger.removeHandler(logger.handlers[0])

logger.addHandler(log_handler)
# Optional: also echo records to the console. StreamHandler() with no
# argument writes to sys.stderr and uses the default (message-only) format.
logger.addHandler(logging.StreamHandler())

In [None]:
# Upload logs from buffer to MinIO
def flush_logs_to_minio(object_name):
    """Upload the buffered log text to MinIO and then empty the buffer.

    The current contents of the module-level ``log_buffer`` are stored as a
    ``text/plain`` object called ``object_name`` inside ``MINIO_BUCKET``; the
    bucket is created first when it does not exist yet.

    Args:
        object_name: Key of the object to write inside ``MINIO_BUCKET``.

    Returns:
        None. An ``S3Error`` raised by any MinIO call is logged through
        ``logger`` instead of being re-raised, and the buffer is not cleared
        in that case. Other exception types are not handled here.
    """
    try:
        minio_client = Minio(
            MINIO_ENDPOINT,
            access_key=MINIO_ACCESS_KEY,
            secret_key=MINIO_SECRET_KEY,
            secure=False,  # Set to True if using HTTPS
        )

        bucket_missing = not minio_client.bucket_exists(MINIO_BUCKET)
        if bucket_missing:
            minio_client.make_bucket(MINIO_BUCKET)

        # Snapshot the buffer as bytes; put_object needs a stream plus its length.
        payload = log_buffer.getvalue().encode("utf-8")
        minio_client.put_object(
            MINIO_BUCKET,
            object_name,
            data=BytesIO(payload),
            length=len(payload),
            content_type='text/plain',
        )
    except S3Error as err:
        logger.error(f"Failed to upload log: {err}")
    else:
        # Logged before the reset, so this confirmation line is wiped from the
        # buffer together with the content that was just uploaded.
        logger.info(f"Uploaded log to MinIO at {MINIO_BUCKET}/{object_name}")

        # Reset the buffer: rewind and drop everything that was uploaded.
        log_buffer.seek(0)
        log_buffer.truncate(0)