In [1]:
import os
import yaml
import nbimporter
from datetime import datetime, date
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType,  DateType, TimestampType
from pyspark.sql.functions import to_date, to_timestamp
from lab_table_manager import TableManager
import yfinance as yf
import time
import random
from lab_pg_database_manager import PGDatabaseManager

In [9]:
def create_spark_session(app_name, config_path="cfg_connections.yaml", env="docker_env"):
    """Create (or reuse) a SparkSession configured for an Iceberg catalog on Nessie.

    Connection settings are read from a YAML file, the session is configured
    with the Iceberg and Nessie SQL extensions plus a catalog named ``nessie``,
    and the ``nessie.raw`` namespace is created if it does not already exist.

    Args:
        app_name: Application name passed to ``SparkConf.setAppName``.
        config_path: Path to the YAML connection file. Defaults to
            ``cfg_connections.yaml``, resolved against the current working
            directory.
        env: Top-level key in the YAML file whose mapping holds the
            ``catalog_uri``, ``warehouse`` and ``storage_uri`` entries.

    Returns:
        The SparkSession returned by ``SparkSession.builder.getOrCreate()``.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        KeyError: If ``env`` or one of the three required entries is missing.
    """
    # Parse the file first and release the handle before doing anything else.
    with open(config_path, "r", encoding="utf-8") as file:
        config = yaml.safe_load(file)

    env_settings = config[env]
    catalog_uri = env_settings['catalog_uri']
    warehouse = env_settings['warehouse']      # Minio address to write to
    storage_uri = env_settings['storage_uri']  # Minio IP address from `docker inspect`

    # Configure Spark with the necessary packages and Iceberg/Nessie settings
    conf = (
        pyspark.SparkConf()
            .setAppName(app_name)
            # JDBC driver, Iceberg runtime, Nessie extensions and AWS SDK (S3 client)
            .set('spark.jars.packages',
                 'org.postgresql:postgresql:42.7.3,'
                 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,'
                 'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,'
                 'software.amazon.awssdk:bundle:2.24.8,'
                 'software.amazon.awssdk:url-connection-client:2.24.8')
            # Enable Iceberg and Nessie SQL extensions
            .set('spark.sql.extensions',
                 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,'
                 'org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
            # Register the catalog named "nessie", tracking the "main" reference
            .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
            .set('spark.sql.catalog.nessie.uri', catalog_uri)
            .set('spark.sql.catalog.nessie.ref', 'main')
            .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
            .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
            # Use Minio as the S3 endpoint for Iceberg storage
            .set('spark.sql.catalog.nessie.s3.endpoint', storage_uri)
            .set('spark.sql.catalog.nessie.warehouse', warehouse)
            .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
    )

    # Start the Spark session (getOrCreate returns an already-active session if one exists)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()
    # Make sure the "raw" namespace exists in the Nessie catalog
    spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.raw;")

    return spark
