In [1]:
import os
import yaml
import nbimporter
from datetime import datetime, date
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from raw_yfinance_ingestion import RawYFIngestion  
from lab_table_manager import TableManager
import yfinance as yf

In [2]:
# Read the connection settings; only the 'docker_env' section is used here.
with open("config_connections.yaml","r") as file:
    config=yaml.safe_load(file)

docker_env = config['docker_env']
catalog_uri = docker_env['catalog_uri']
warehouse = docker_env['warehouse']      # MinIO address to write to
storage_uri = docker_env['storage_uri']  # MinIO IP address from `docker inspect`

# Maven coordinates Spark must pull in (joined into one comma-separated value).
spark_packages = ','.join([
    'org.postgresql:postgresql:42.7.3',
    'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0',
    'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1',
    'software.amazon.awssdk:bundle:2.24.8',
    'software.amazon.awssdk:url-connection-client:2.24.8',
])

# Iceberg and Nessie SQL extensions (joined into one comma-separated value).
spark_extensions = ','.join([
    'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions',
    'org.projectnessie.spark.extensions.NessieSparkSessionExtensions',
])

# Settings are applied in the order listed.
spark_settings = [
    ('spark.jars.packages', spark_packages),
    ('spark.sql.extensions', spark_extensions),
    # Nessie catalog
    ('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog'),
    ('spark.sql.catalog.nessie.uri', catalog_uri),
    ('spark.sql.catalog.nessie.ref', 'main'),
    ('spark.sql.catalog.nessie.authentication.type', 'NONE'),
    ('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog'),
    # MinIO as the S3 endpoint for Iceberg storage
    ('spark.sql.catalog.nessie.s3.endpoint', storage_uri),
    ('spark.sql.catalog.nessie.warehouse', warehouse),
    ('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO'),
]

conf = pyspark.SparkConf().setAppName('finalytics_app')
for setting_key, setting_value in spark_settings:
    conf.set(setting_key, setting_value)

# Start (or reuse) the Spark session with the configuration above
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Create the "raw" namespace in the nessie catalog if it does not exist yet
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.raw;")


FileNotFoundError: [Errno 2] No such file or directory: 'config_connections.yaml'

In [4]:
def fetch_yfinance_record(multi_param_pairs):
    """Fetch the price history for one (symbol, start_date) pair via yfinance.

    Args:
        multi_param_pairs: A two-item sequence ``(symbol, start_date)``. The
            symbol is passed to ``yf.Ticker`` and the start date to
            ``Ticker.history``; today's date is used as the ``end`` argument.

    Returns:
        A list of tuples, one per history row: the row's values (with the
        ``Date`` column rendered as a ``'%Y-%m-%d %H:%M:%S'`` string) followed
        by ``symbol`` and ``import_time``. An empty list is returned when
        anything fails; the error is printed rather than raised.

    Note:
        ``import_time`` is read from the notebook's global scope, so it must
        be assigned before this function is called.
    """
    # Bind the name before the try block: if the input cannot be unpacked,
    # the error message below must not fail on an unbound local.
    symbol = None
    try:
        symbol, start_date = multi_param_pairs
        # Fetch stock data using yfinance
        quote = yf.Ticker(symbol)
        current_date = date.today()
        hist = quote.history(start=start_date, end=current_date)

        # Turn the Date index into a regular column and format it as text
        hist = hist.reset_index()
        hist['Date'] = hist['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

        # Append symbol and import_time to every row
        return [
            tuple(row) + (symbol, import_time)
            for row in hist.itertuples(index=False)
        ]

    except Exception as e:
        # Best effort: report the failure and contribute no rows. Fall back to
        # the raw input when the symbol itself could not be extracted.
        failed_target = symbol if symbol is not None else multi_param_pairs
        print(f"Error fetching data for {failed_target}: {e}")
        return []  # Return an empty list on error

In [5]:
# Per-record entry point used by the parallel fetch
def process_yfinance_record(single_param_pair):
    """Pass one parameter pair through to ``fetch_yfinance_record``.

    Args:
        single_param_pair: The value forwarded unchanged to
            ``fetch_yfinance_record``.

    Returns:
        Whatever ``fetch_yfinance_record`` returns for that pair.
    """
    fetched_records = fetch_yfinance_record(single_param_pair)
    return fetched_records

In [9]:
# Parallel fetch function
def parallel_fetch(multi_param_pairs, column_list):
    """Fetch records for every parameter pair through Spark and return a DataFrame.

    Args:
        multi_param_pairs: Collection of parameter pairs; each element is
            handed to ``process_yfinance_record``.
        column_list: Passed to ``spark.createDataFrame`` as the schema argument.

    Returns:
        The DataFrame built from the flattened records.
    """
    # Distribute the pairs, then flatten each pair's list of records into one RDD
    fetched_rows_rdd = (
        spark.sparkContext
        .parallelize(multi_param_pairs)
        .flatMap(process_yfinance_record)
    )

    return spark.createDataFrame(fetched_rows_rdd, column_list)
 

In [11]:
# List of stock symbols and start dates
yf_param_pairs = [
    ('AAPL', '2024-12-10'),
    ('MSFT', '2024-12-10'),
    ('GOOGL', '2024-12-10'),
]

# Import timestamp with millisecond precision, e.g. "2024-12-24 15:23:01.042".
# A single now() call is formatted with %f (zero-padded microseconds) and the
# last three digits are trimmed. Slicing str(microsecond) instead would drop
# leading zeros and produce a wrong or too-short fraction.
import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
zone='raw'
sink_table='nessie.raw.stock_eod_yfinance'
config_file_path='cfg_registered_table_schemas.yaml'

# Build the table manager from the schema config file and ask it for the sink
# table's column list and create-table query.
table_manager=TableManager(config_file_path)
registered_column_list = table_manager.get_column_list(sink_table)
create_table_query = table_manager.get_create_table_query(sink_table)

# Fetch data in parallel; the registered column list is passed as the schema
df_raw_eod_yfinance = parallel_fetch(yf_param_pairs, registered_column_list)
# df_raw_eod_yfinance.show()

# Run the create-table statement returned by the table manager.
# NOTE(review): presumably this is an "IF NOT EXISTS" statement so the cell can
# be re-run safely; confirm against TableManager.
# print(create_table_query)
spark.sql(create_table_query)


# NOTE(review): the append below is commented out, so this cell does not write
# df_raw_eod_yfinance to the sink table; the query only shows what is already
# stored there.
# df_raw_eod_yfinance.writeTo(sink_table).append()
spark.sql(f"select * from {sink_table}").show()





                                                                                

+-------------------+------+------+------+------+---------+---------+------------+------+--------------------+
|               date|  open|  high|   low| close|   volume|dividends|stock_splits|symbol|         import_time|
+-------------------+------+------+------+------+---------+---------+------------+------+--------------------+
|2024-12-10 00:00:00|246.89|248.21|245.34|247.77| 36914800|      0.0|         0.0|  AAPL|2024-12-24 15:23:...|
|2024-12-11 00:00:00|247.96| 250.8|246.26|246.49| 45205800|      0.0|         0.0|  AAPL|2024-12-24 15:23:...|
|2024-12-12 00:00:00|246.89|248.74|245.68|247.96| 32777500|      0.0|         0.0|  AAPL|2024-12-24 15:23:...|
|2024-12-13 00:00:00|247.82|249.29|246.24|248.13| 33155300|      0.0|         0.0|  AAPL|2024-12-24 15:23:...|
|2024-12-16 00:00:00|247.99|251.38|247.65|251.04| 51694800|      0.0|         0.0|  AAPL|2024-12-24 15:23:...|
|2024-12-17 00:00:00|250.08|253.83|249.78|253.48| 51356400|      0.0|         0.0|  AAPL|2024-12-24 15:23:...|
|