In [None]:
import os
import yaml
import nbimporter
from datetime import datetime, date
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType,  DateType, TimestampType
from lab_table_manager import TableManager
import yfinance as yf
from lab_finalytics_database import FinalyticsDB

In [None]:
# Read the docker-environment connection settings from the YAML config file.
with open("cfg_connections.yaml", "r") as cfg_file:
    config = yaml.safe_load(cfg_file)

docker_env = config['docker_env']
catalog_uri = docker_env['catalog_uri']   # URI handed to the Nessie catalog
warehouse = docker_env['warehouse']       # MinIO address the warehouse is written to
storage_uri = docker_env['storage_uri']   # MinIO IP address (from `docker inspect`)

# Maven coordinates Spark has to download: PostgreSQL JDBC driver, Iceberg
# runtime, Nessie extensions and the AWS SDK used for S3-compatible storage.
jar_packages = [
    'org.postgresql:postgresql:42.7.3',
    'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0',
    'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1',
    'software.amazon.awssdk:bundle:2.24.8',
    'software.amazon.awssdk:url-connection-client:2.24.8',
]

# SQL extensions that enable the Iceberg and Nessie statements.
sql_extensions = [
    'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions',
    'org.projectnessie.spark.extensions.NessieSparkSessionExtensions',
]

# Every Spark setting in one place; applied in this order below.
spark_settings = {
    'spark.jars.packages': ','.join(jar_packages),
    'spark.sql.extensions': ','.join(sql_extensions),
    # Nessie catalog definition
    'spark.sql.catalog.nessie': 'org.apache.iceberg.spark.SparkCatalog',
    'spark.sql.catalog.nessie.uri': catalog_uri,
    'spark.sql.catalog.nessie.ref': 'main',
    'spark.sql.catalog.nessie.authentication.type': 'NONE',
    'spark.sql.catalog.nessie.catalog-impl': 'org.apache.iceberg.nessie.NessieCatalog',
    # MinIO acts as the S3 endpoint for Iceberg storage
    'spark.sql.catalog.nessie.s3.endpoint': storage_uri,
    'spark.sql.catalog.nessie.warehouse': warehouse,
    'spark.sql.catalog.nessie.io-impl': 'org.apache.iceberg.aws.s3.S3FileIO',
}

conf = pyspark.SparkConf().setAppName('finalytics_app')
for setting_key, setting_value in spark_settings.items():
    conf.set(setting_key, setting_value)

# Start (or reuse) the Spark session with the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Make sure the `nessie.raw` namespace exists before any table is written to it.
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.raw;").show()



In [None]:
def fetch_yfinance_record(symbol_date_pairs):
    """Fetch the price history of one symbol and return it as a list of row tuples.

    The function takes a single ``(symbol, start_date)`` pair so it can be
    passed directly to ``RDD.flatMap``. It reads the module-level
    ``import_time`` variable, which therefore has to be defined before this
    function is called.

    Args:
        symbol_date_pairs: One ``(symbol, start_date)`` pair. ``start_date`` is
            forwarded unchanged to ``yfinance.Ticker.history(start=...)``;
            today's date is used as ``end``.

    Returns:
        list[tuple]: One tuple per history row. Each tuple holds the history
        columns in DataFrame order (the former index first, with ``Date``
        converted to ``datetime.date``), followed by ``symbol`` and
        ``import_time``. An empty list is returned when the history is empty
        or when any error occurs; errors are printed, not raised, so a single
        bad symbol does not abort the whole Spark job.
    """
    # Fallback label: keeps the error message below usable even when the pair
    # itself is malformed and the unpacking inside the try block fails.
    symbol = symbol_date_pairs
    try:
        symbol, start_date = symbol_date_pairs

        # Fetch stock data using yfinance, up to today's date.
        quote = yf.Ticker(symbol)
        hist = quote.history(start=start_date, end=date.today())

        # Nothing to convert (e.g. no rows in the requested range).
        if hist.empty:
            return []

        # Turn the index into a regular column without mutating in place.
        # NOTE(review): assumes the history index is a datetime index named
        # 'Date' -- confirm for non-daily intervals.
        hist = hist.reset_index()

        # `.dt.date` yields datetime.date objects. The previous
        # `.dt.strftime(...).date()` raised AttributeError (a Series of strings
        # has no `.date()`), which made every fetch return an empty list.
        hist['Date'] = hist['Date'].dt.date

        # Add symbol and import_time to each row.
        return [
            tuple(row) + (symbol, import_time)
            for row in hist.itertuples(index=False)
        ]

    except Exception as e:
        print(f"Error fetching data for {symbol}: {e}")
        return []  # Return an empty list on error

In [None]:
def parallel_fetch_yfinance_record(symbol_date_pairs, record_schema):
    """Fetch history for many symbols on the Spark workers and return a DataFrame.

    Args:
        symbol_date_pairs: Collection of ``(symbol, start_date)`` pairs; each
            pair is handed to ``fetch_yfinance_record`` on a worker.
        record_schema: Schema passed to ``spark.createDataFrame`` for the
            resulting rows.

    Returns:
        The DataFrame built from all fetched rows. Uses the module-level
        ``spark`` session.
    """
    # Spread the pairs over the cluster; flatMap merges the per-symbol row
    # lists into a single RDD of row tuples.
    fetched_rows = (
        spark.sparkContext
        .parallelize(symbol_date_pairs)
        .flatMap(fetch_yfinance_record)
    )

    # Apply the caller-supplied schema to the fetched rows.
    return spark.createDataFrame(fetched_rows, schema=record_schema)

In [None]:
def load_raw_eod_yfinance(symbol_date_pairs, sink_table, schema_config_file):
    """Fetch yfinance history for the given pairs and load it into ``sink_table``.

    Args:
        symbol_date_pairs: ``(symbol, start_date)`` pairs to fetch.
        sink_table: Name of the table to create and write.
        schema_config_file: Path of the schema configuration file handed to
            ``TableManager``.

    Side effects:
        Runs the CREATE TABLE script obtained from ``TableManager``, overwrites
        ``sink_table`` with the fetched data and prints a confirmation line.
    """
    schema_registry = TableManager(schema_config_file)
    # Presumably a Spark StructType for the sink table; it is used as the
    # DataFrame schema below -- TODO confirm against TableManager.
    sink_schema = schema_registry.get_struct_type(sink_table)
    ddl_statement = schema_registry.get_create_table_query(sink_table)

    fetched_df = parallel_fetch_yfinance_record(symbol_date_pairs, sink_schema)

    spark.sql(ddl_statement)
    # NOTE(review): mode("overwrite") + saveAsTable may replace the table
    # definition created by the DDL just above -- confirm this is intended
    # rather than an append / overwritePartitions write.
    fetched_df.write.mode("overwrite").saveAsTable(sink_table)
    print(f"{sink_table} has been loaded")

         
          

In [None]:
          
# Example of the input shape expected by load_raw_eod_yfinance, useful for an
# ad-hoc run without the database:
# symbol_start_date_pairs = [
#     ('AAPL', '2024-12-10'),
#     ('MSFT', '2024-12-10'),
#     ('GOOGL', '2024-12-10'),
# ]

# Pull the (symbol, start_date) pairs to load from the Finalytics database and
# keep its JDBC connection details for the later write to PostgreSQL.
conn_config_file = 'cfg_connections.yaml'
finalytics = FinalyticsDB(conn_config_file)
query = "select symbol, start_date from fin.vw_etl_stock_eod_start_date limit 3"
symbol_start_date_pairs = finalytics.get_symbol_start_date_pairs(query)
finalytics_url = finalytics.jdbc_url
finalytics_driver = finalytics.driver

regd_schema_config_file = 'cfg_registered_table_schemas.yaml'
sink_table = 'nessie.raw.stock_eod_yfinance'

# Load timestamp stamped on every fetched row. fetch_yfinance_record reads this
# module-level variable, so it must be set before the load is started.
# The format previously ended in a dangling "." (left over from a removed
# millisecond suffix), which produced a malformed timestamp string.
import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
load_raw_eod_yfinance(symbol_start_date_pairs, sink_table, regd_schema_config_file)

# Read the freshly loaded table back for inspection and export.
df = spark.read.table(sink_table)



In [None]:

# Sanity check: how many rows the raw end-of-day table holds.
row_count_df = spark.sql('select count(*) from nessie.raw.stock_eod_yfinance limit 4')
row_count_df.show()

In [None]:
# Preview the four most recently imported rows. ORDER BY has to come before
# LIMIT in Spark SQL; the previous "limit 4 order by ..." form fails to parse.
spark.sql('select * from nessie.raw.stock_eod_yfinance order by import_time desc limit 4').show()

In [None]:
# Re-read the sink table so `df` reflects what is currently stored.
df = spark.table(sink_table)
# Uncomment to preview the rows: df.show()

In [None]:
# Target PostgreSQL table; change this to write somewhere else.
table_name = "stage.stock_eod_quote_yahoo_new"

# Export the DataFrame read from the sink table to PostgreSQL over JDBC,
# replacing whatever the target table currently contains.
jdbc_properties = {"driver": finalytics_driver}
df.write.jdbc(
    url=finalytics_url,
    table=table_name,
    mode="overwrite",
    properties=jdbc_properties,
)