In [1]:
import os
import yaml
import nbimporter
from datetime import datetime, date
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType,  DateType, TimestampType
from pyspark.sql.functions import to_date, to_timestamp
from lab_table_manager import TableManager
import yfinance as yf
from lab_finalytics_database import FinalyticsDB

In [2]:
# Load connection settings from the docker_env section of the YAML config file.
with open("cfg_connections.yaml","r") as file:
    config=yaml.safe_load(file)
    catalog_uri = config['docker_env']['catalog_uri'] 
    warehouse = config['docker_env']['warehouse']     # Minio address to write to
    storage_uri = config['docker_env']['storage_uri'] # Minio IP address from `docker inspect`

# Configure Spark with necessary packages and Iceberg/Nessie settings
conf = (
    pyspark.SparkConf()
        .setAppName('finalytics_app')
        # Include necessary packages (PostgreSQL JDBC driver, Iceberg runtime,
        # Nessie extensions, AWS SDK)
        .set('spark.jars.packages',
             'org.postgresql:postgresql:42.7.3,'
             'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,'
             'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,'
             'software.amazon.awssdk:bundle:2.24.8,'
             'software.amazon.awssdk:url-connection-client:2.24.8')
        # Enable Iceberg and Nessie extensions
        .set('spark.sql.extensions', 
             'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,'
             'org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
        # Configure the catalog named "nessie" (URI from config, "main" ref, no authentication)
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', catalog_uri)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        # Set Minio as the S3 endpoint for Iceberg storage
        .set('spark.sql.catalog.nessie.s3.endpoint', storage_uri)
        .set('spark.sql.catalog.nessie.warehouse', warehouse)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')       
)   

# Start Spark session (reuses an existing session if one is already running)
spark = SparkSession.builder.config(conf=conf).getOrCreate()  
# Create the "raw" namespace in the nessie catalog if it does not exist yet
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.raw;").show()


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.postgresql#postgresql added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-ea40d514-a1b2-43e0-8cf5-fdefc5e77dca;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.0 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.77.1 in central
	found software.amazon.awssdk#bundle;2.24.8 in central
	found software.amazon.awssdk#url-connection-client;2.24.8 in central
	found software.amazon.awssdk#utils;2.24

++
||
++
++



In [3]:
def fetch_yfinance_record(symbol_date_pairs):
    """Fetch the price history of one symbol and return it as a list of row tuples.

    The function takes a single ``(symbol, start_date)`` pair so that it can be
    handed directly to ``RDD.flatMap``. It is best-effort: any failure is
    reported with ``print`` and results in an empty list rather than an
    exception.

    Args:
        symbol_date_pairs: A two-item ``(symbol, start_date)`` sequence.
            ``start_date`` is passed to ``Ticker.history`` as ``start``; today's
            date is passed as ``end``.

    Returns:
        list[tuple]: One tuple per history row, made of the columns returned by
        ``Ticker.history`` after the index is reset (with ``Date`` converted to
        ``datetime.date``), followed by ``symbol`` and ``import_time``. An empty
        list is returned when there is no history or when an error occurs.

    Note:
        ``import_time`` is read from the notebook's global namespace and must be
        assigned before this function runs; if it is missing, the resulting
        ``NameError`` is caught below and the symbol silently yields no rows.
    """
    # Bind a placeholder first so the error message in the handler can always
    # be formatted, even when unpacking the argument itself is what failed.
    symbol = "<unknown>"
    try:
        symbol, start_date = symbol_date_pairs

        # Fetch stock data using yfinance
        quote = yf.Ticker(symbol)
        hist = quote.history(start=start_date, end=date.today())

        # Nothing to convert when no rows came back.
        if hist.empty:
            return []

        # Reset index to include Date as a column, then keep only the date part
        hist = hist.reset_index()
        hist["Date"] = hist["Date"].dt.date

        # Add symbol and import_time to each row
        return [
            tuple(row) + (symbol, import_time) for row in hist.itertuples(index=False)
        ]

    except Exception as e:
        print(f"Error fetching data for {symbol}: {e}")
        return []  # Return an empty list on error

In [4]:
def parallel_fetch_yfinance_record(symbol_date_pairs, record_schema):
    """Fetch records for many symbols through Spark and return them as one DataFrame.

    Args:
        symbol_date_pairs: Collection of ``(symbol, start_date)`` pairs; each
            pair is passed to ``fetch_yfinance_record``.
        record_schema: Schema given to ``spark.createDataFrame`` for the
            resulting rows.

    Returns:
        The DataFrame built from the flattened per-symbol record lists.
    """
    # Distribute the pairs, fetch each one, and flatten the per-symbol lists
    # into a single RDD of row tuples.
    fetched_rows = (
        spark.sparkContext
        .parallelize(symbol_date_pairs)
        .flatMap(fetch_yfinance_record)
    )

    return spark.createDataFrame(fetched_rows, schema=record_schema)

In [5]:
def load_raw_eod_yfinance(symbol_date_pairs, sink_table, schema_config_file):
    """Fetch yfinance records for the given pairs and overwrite ``sink_table`` with them.

    Args:
        symbol_date_pairs: ``(symbol, start_date)`` pairs to fetch.
        sink_table: Fully qualified name of the table to write; also the key
            used to look up the schema and create-table query.
        schema_config_file: Path of the schema config file handed to
            ``TableManager``.
    """
    schema_registry = TableManager(schema_config_file)
    sink_struct_type = schema_registry.get_struct_type(sink_table)

    # Run the create-table query registered for the sink table before loading.
    spark.sql(schema_registry.get_create_table_query(sink_table))

    fetched_df = parallel_fetch_yfinance_record(symbol_date_pairs, sink_struct_type)

    # Write in "overwrite" mode: this load replaces what the table held before.
    overwrite_writer = fetched_df.write.mode("overwrite")
    overwrite_writer.saveAsTable(sink_table)

    print(f"{sink_table} has been loaded")
    

In [32]:
  
# Example of a hard-coded input list (disabled; the pairs come from the database below):
# symbol_start_date_pairs = [
#     ('AAPL', '2024-12-10'),
#     ('MSFT', '2024-12-10'),
#     ('GOOGL', '2024-12-10'),
# ]

# Database helper built from the connection config file.
conn_config_file='cfg_connections.yaml'
finalytics=FinalyticsDB(conn_config_file)

# Read the (symbol, start_date) pairs to load; the query caps the batch at 30 rows.
query="select symbol, start_date from fin.vw_etl_stock_eod_start_date limit 30"
symbol_start_date_pairs=finalytics.get_symbol_start_date_pairs(query)
# JDBC URL and driver are kept as globals for the cell that writes to PostgreSQL.
finalytics_url=finalytics.jdbc_url
finalytics_driver=finalytics.driver

# Schema config file and target table for the raw load.
regd_schema_config_file='cfg_registered_table_schemas.yaml'
sink_table='nessie.raw.stock_eod_yfinance'

# import_time must be assigned before the load call: fetch_yfinance_record
# reads it as a global when it stamps each row.
import_time = datetime.now().isoformat()
load_raw_eod_yfinance(symbol_start_date_pairs, sink_table, regd_schema_config_file)





nessie.raw.stock_eod_yfinance has been loaded


                                                                                

In [33]:
# Sanity check: show the row count of the raw table after the load
spark.sql('select count(*) from nessie.raw.stock_eod_yfinance').show()

+--------+
|count(1)|
+--------+
|     232|
+--------+



In [34]:
# Read the raw table (sink_table) back through Spark
df=spark.read.table(sink_table)          
pg_table = "stage.stock_eod_quote_yahoo_new"  # PostgreSQL table that receives the copy
# Write the DataFrame to PostgreSQL over JDBC; "overwrite" mode replaces the target table's contents
df.write.jdbc(url=finalytics_url, table=pg_table, mode="overwrite", properties={"driver": finalytics_driver})

# Call the stored procedure fin.usp_load_stock_eod
# NOTE(review): presumably it loads from the staging table written above — confirm.
query = "call fin.usp_load_stock_eod();"
finalytics.execute_sql_script(query)



In [19]:
# Inspect the import_time values in the raw table, newest first
spark.sql('select import_time from nessie.raw.stock_eod_yfinance order by import_time desc').show()

+--------------------+
|         import_time|
+--------------------+
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
|2024-12-25T01:44:...|
+--------------------+



In [26]:
# Scratch check of the yfinance API outside Spark.
# NOTE(review): yfinance is already imported in the first cell, so this re-import is redundant.
import yfinance as yf

# Fetch one month of daily historical data for AAPL
ticker = yf.Ticker("AAPL")
history = ticker.history(period="1mo", interval="1d")

In [27]:
# Display the AAPL history fetched in the previous cell
print(history)

                                 Open        High         Low       Close  \
Date                                                                        
2024-11-25 00:00:00-05:00  231.460007  233.250000  229.740005  232.869995   
2024-11-26 00:00:00-05:00  233.330002  235.570007  233.330002  235.059998   
2024-11-27 00:00:00-05:00  234.470001  235.690002  233.809998  234.929993   
2024-11-29 00:00:00-05:00  234.809998  237.809998  233.970001  237.330002   
2024-12-02 00:00:00-05:00  237.270004  240.789993  237.160004  239.589996   
2024-12-03 00:00:00-05:00  239.809998  242.759995  238.899994  242.649994   
2024-12-04 00:00:00-05:00  242.869995  244.110001  241.250000  243.009995   
2024-12-05 00:00:00-05:00  243.990005  244.539993  242.130005  243.039993   
2024-12-06 00:00:00-05:00  242.910004  244.630005  242.080002  242.839996   
2024-12-09 00:00:00-05:00  241.830002  247.240005  241.750000  246.750000   
2024-12-10 00:00:00-05:00  246.889999  248.210007  245.339996  247.770004   

In [29]:
# Scratch check: fetch history for ticker 'C' with the same start/end arguments
# that fetch_yfinance_record uses (start_date string, today's date as end).
quote = yf.Ticker('C')
start_date = '2024-12-01'
current_date = date.today()
# print(start_date+timedelta(days=1))

hist = quote.history(start=start_date, end=current_date)

In [30]:
# Display the history fetched for ticker 'C' in the previous cell
print(hist)

                                Open       High        Low      Close  \
Date                                                                    
2024-12-02 00:00:00-05:00  71.260002  71.650002  70.879997  71.389999   
2024-12-03 00:00:00-05:00  72.190002  72.800003  71.269997  71.419998   
2024-12-04 00:00:00-05:00  71.500000  71.720001  70.500000  71.500000   
2024-12-05 00:00:00-05:00  71.830002  72.849998  71.639999  72.230003   
2024-12-06 00:00:00-05:00  72.309998  72.599998  71.709999  72.150002   
2024-12-09 00:00:00-05:00  72.300003  72.800003  71.839996  71.860001   
2024-12-10 00:00:00-05:00  72.000000  73.379997  71.580002  72.500000   
2024-12-11 00:00:00-05:00  73.000000  73.260002  71.269997  71.959999   
2024-12-12 00:00:00-05:00  71.860001  72.330002  71.410004  71.430000   
2024-12-13 00:00:00-05:00  71.709999  71.910004  70.760002  71.010002   
2024-12-16 00:00:00-05:00  71.269997  71.769997  70.830002  71.489998   
2024-12-17 00:00:00-05:00  70.900002  71.349998  70