In [1]:
import os
import yaml
import nbimporter
from datetime import datetime, date
import time
import random
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType,  DateType, TimestampType
from pyspark.sql.functions import to_date, to_timestamp
from lab_database_manager import PgDBManager
from lab_spark import create_spark_session
from lab_schema_manager import SchemaManager
from lab_raw_yahoo import get_raw_yahooquery, get_raw_yfinance
from collections import defaultdict
import warnings
import logging

ImportError: cannot import name 'get_raw_yahooquery' from 'lab_raw_yahoo' (lab_raw_yahoo.ipynb)

# Create Spark Session

In [2]:
# Build the Spark session from the shared connection configuration.
connection_config_file = "cfg_connections.yaml"
spark_app_name = "raw_yfinance"
spark = create_spark_session(connection_config_file, spark_app_name)

# Keep the notebook output readable: only errors from Spark and from py4j.
spark.sparkContext.setLogLevel("ERROR")
logging.getLogger("py4j").setLevel(logging.ERROR)

# Make sure the namespace that holds the raw tables exists before any table
# is created in it. The keyword must be exactly NAMESPACE; any extra token
# fused onto it makes the statement unparseable.
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.raw;")


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.postgresql#postgresql added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
software.amazon.awssdk#bundle added as a dependency
software.amazon.awssdk#url-connection-client added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-140d698d-f949-428d-951c-d3b417af3241;1.0
	confs: [default]
	found org.postgresql#postgresql;42.7.3 in central
	found org.checkerframework#checker-qual;3.42.0 in central
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.5.0 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.77.1 in central
	found software.amazon.awssdk#bundle;2.24.8 in central
	found software.amazon.awssdk#url-connection-client;2.24.8 in central
	found software.amazon.awssdk#utils;2.24

DataFrame[]

# Function: Load Iceberg Table

In [3]:
def insert_into_iceberg_table(schema_config_file, spark_source_df, iceberg_sink_table):
    """Ensure the Iceberg sink table exists and append a DataFrame to it.

    Uses the notebook-level ``spark`` session.

    Args:
        schema_config_file: Path handed to ``SchemaManager``.
        spark_source_df: Spark DataFrame whose rows are appended.
        iceberg_sink_table: Name of the target Iceberg table; also the key
            looked up in the "tables" section of the schema config.

    Returns:
        None. A summary line is printed on success. Any exception is caught
        and reported with ``print`` instead of being re-raised, so a failed
        load does not stop the notebook.
    """
    try:
        schema_manager = SchemaManager(schema_config_file)

        # The DDL comes from the schema config; it is presumably written as
        # CREATE TABLE IF NOT EXISTS so repeated calls are safe -- TODO confirm.
        create_table_script = schema_manager.get_create_table_query("tables", iceberg_sink_table)
        spark.sql(create_table_script)

        spark_source_df.writeTo(iceberg_sink_table).append()

        # Counts are taken after the append: rows in this batch vs. rows now in the table.
        incremental_count = spark_source_df.count()
        total_count = spark.table(iceberg_sink_table).count()

        print(f"{iceberg_sink_table} was loaded with {incremental_count} records, totally {total_count} records.")

    except Exception as e:
        print(f"Error loading Iceberg raw table: {e}")
        

# Function: Insert Data into PG Table

In [4]:
def insert_iceberg_data_into_pg(conn_config_file, iceberg_source_table, pg_database, pg_sink_table, is_pg_truncate_enabled, is_pg_merge_enabled):
    """Copy an Iceberg table into a PostgreSQL table over JDBC.

    Uses the notebook-level ``spark`` session.

    Args:
        conn_config_file: Connection config path handed to ``PgDBManager``.
        iceberg_source_table: Iceberg table to read.
        pg_database: Database name handed to ``PgDBManager``.
        pg_sink_table: PostgreSQL table that receives the rows (append mode).
        is_pg_truncate_enabled: When equal to ``True``, the sink table is
            truncated before the write.
        is_pg_merge_enabled: When equal to ``True``, the stored procedure
            ``fin.usp_load_stock_eod`` is called after the write.

    Returns:
        None. Any exception is caught and reported with ``print``.
    """
    try:
        iceberg_rows = spark.read.table(iceberg_source_table)

        db_manager = PgDBManager(conn_config_file, pg_database)
        # Resolve the JDBC settings up front, before anything is truncated.
        jdbc_url, jdbc_driver = db_manager.jdbc_url, db_manager.driver

        # Explicit comparison kept on purpose: only values equal to True enable the step.
        if is_pg_truncate_enabled == True:
            db_manager.execute_sql_script(f"TRUNCATE TABLE {pg_sink_table}")

        # Append the Iceberg rows to the PostgreSQL sink.
        iceberg_rows.write.jdbc(
            url=jdbc_url,
            table=pg_sink_table,
            mode="append",
            properties={"driver": jdbc_driver},
        )

        if is_pg_merge_enabled == True:
            db_manager.execute_sql_script("call fin.usp_load_stock_eod();")

    except Exception as load_error:
        print(f"Error loading pg finalytics: {load_error}")

# Truncate Iceberg Table

In [5]:
# Get iceberg table config info
# Iceberg table configuration used by the rest of the notebook.
schema_config_file = "cfg_schemas.yaml"
iceberg_raw_stock_eod_table = "nessie.raw.stock_eod_yahooquery"

# Start from an empty raw table: truncate it when present, otherwise just report.
if not spark.catalog.tableExists(iceberg_raw_stock_eod_table):
    print(f"Iceberg table {iceberg_raw_stock_eod_table} does not exist.")
else:
    spark.sql(f"TRUNCATE TABLE {iceberg_raw_stock_eod_table}")
    print(f"Iceberg table {iceberg_raw_stock_eod_table} truncated successfully.")



SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


Iceberg table nessie.raw.stock_eod_yahooquery truncated successfully.


# Get Symbol Group Info

In [2]:
# Connection details for the finalytics PostgreSQL database.
conn_config_file = "cfg_connections.yaml"
pg_db = "finalytics"
pg_db_mgr = PgDBManager(conn_config_file, pg_db)

# One timestamp shared by every fetch in this run.
import_time = datetime.now()

# Fetch (group_id, group_start_date, symbol) rows from finalytics.
query = "SELECT group_id, group_start_date, symbol from fin.vw_etl_stock_eod_start_date_grouped  WHERE group_start_date <'2025-1-9' Limit 5;"
query_result = pg_db_mgr.get_sql_script_result_list(query)

# Map each (group_id, group_start_date) key to the list of symbols in that group.
grouped_symbols = defaultdict(list)
for row in query_result:
    group_id, group_start_date, symbol = row
    grouped_symbols[(group_id, group_start_date)].append(symbol)


NameError: name 'defaultdict' is not defined

# Loop Groups and Insert Data into Iceberg Table

In [2]:
# Ad-hoc check of the fetch helper on a fixed handful of symbols. The arguments
# mirror the group loop: symbols, a start date, and an import timestamp.
symbol_list = ["GTBP", "ACLS", "ISDR", "TDY", "MMT"]
get_raw_yahooquery(symbol_list, "2024-11-15", datetime.now())

['GTBP', 'ACLS', 'ISDR', 'TDY', 'MMT']
<class 'str'>


  has_live_indice = index_utc[-1] >= last_trade - pd.Timedelta(2, "S")


                   open   high    low  close  volume  adjclose  dividends
symbol date                                                              
GTBP   2024-11-15  3.05  3.200  2.910  2.920   12300     2.920        0.0
       2024-11-18  2.90  3.046  2.728  2.950   56700     2.950        0.0
       2024-11-19  2.95  3.136  2.760  3.136   16700     3.136        0.0
       2024-11-20  3.19  3.200  2.930  3.190   15600     3.190        0.0
       2024-11-21  3.19  3.190  2.970  3.010    5800     3.010        0.0
...                 ...    ...    ...    ...     ...       ...        ...
MMT    2025-01-02  4.68  4.720  4.650  4.660   47200     4.660        0.0
       2025-01-03  4.66  4.700  4.660  4.680   47500     4.680        0.0
       2025-01-06  4.67  4.700  4.670  4.670   84400     4.670        0.0
       2025-01-07  4.67  4.670  4.630  4.630   79600     4.630        0.0
       2025-01-08  4.62  4.690  4.620  4.640   78600     4.640        0.0

[180 rows x 7 columns]
    symbol    

  has_live_indice = index_utc[-1] >= last_trade - pd.Timedelta(2, "S")
  has_live_indice = index_utc[-1] >= last_trade - pd.Timedelta(2, "S")
  has_live_indice = index_utc[-1] >= last_trade - pd.Timedelta(2, "S")
  has_live_indice = index_utc[-1] >= last_trade - pd.Timedelta(2, "S")
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["dividends"].fillna(0, inplace=True)


Unnamed: 0,date,symbol,open,high,low,close,volume,import_time
0,2024-11-15,GTBP,3.05,3.200,2.910,2.920,12300,2025-01-10 00:44:22.652197
1,2024-11-18,GTBP,2.90,3.046,2.728,2.950,56700,2025-01-10 00:44:22.652197
2,2024-11-19,GTBP,2.95,3.136,2.760,3.136,16700,2025-01-10 00:44:22.652197
3,2024-11-20,GTBP,3.19,3.200,2.930,3.190,15600,2025-01-10 00:44:22.652197
4,2024-11-21,GTBP,3.19,3.190,2.970,3.010,5800,2025-01-10 00:44:22.652197
...,...,...,...,...,...,...,...,...
175,2025-01-02,MMT,4.68,4.720,4.650,4.660,47200,2025-01-10 00:44:22.652197
176,2025-01-03,MMT,4.66,4.700,4.660,4.680,47500,2025-01-10 00:44:22.652197
177,2025-01-06,MMT,4.67,4.700,4.670,4.670,84400,2025-01-10 00:44:22.652197
178,2025-01-07,MMT,4.67,4.670,4.630,4.630,79600,2025-01-10 00:44:22.652197


In [3]:
# Fetch raw history one symbol group at a time and print each result.
# NOTE(review): the Iceberg insert is still commented out below, so this loop
# only prints the fetched data and writes nothing to the raw table.
# (Optional) silence yahooquery FutureWarnings:
# warnings.filterwarnings("ignore", category=FutureWarning, module="yahooquery")

for group in grouped_symbols:
    group_id, group_start_date = group
    group_symbols = grouped_symbols[group]
    print(f"Group Date: {group_start_date}, Group Number: {group_id}, Symbols: {group_symbols}")

    # Alternative source: get_raw_yfinance(group_symbols, group_start_date, import_time)
    hist_data = get_raw_yahooquery(group_symbols, group_start_date, import_time)
    print(hist_data)

    # Disabled for now:
    # hist_df = spark.createDataFrame(hist_data)
    # insert_into_iceberg_table(schema_config_file, hist_df, iceberg_raw_stock_eod_table)

    # Pause between groups -- presumably to avoid hammering the data provider; confirm.
    time.sleep(5)


NameError: name 'grouped_symbols' is not defined

In [None]:
# Show the history held in hist_data, i.e. the result for the last group the loop processed.
print(hist_data)

In [None]:
# Copy the raw Iceberg rows into the PostgreSQL staging table: truncate the
# staging table first, then run the merge procedure once the rows are written.
pg_table = "stage.stock_eod_quote_yahoo"
is_pg_truncate_enabled = True
is_pg_merge_enabled = True

insert_iceberg_data_into_pg(
    conn_config_file=conn_config_file,
    iceberg_source_table=iceberg_raw_stock_eod_table,
    pg_database=pg_db,
    pg_sink_table=pg_table,
    is_pg_truncate_enabled=is_pg_truncate_enabled,
    is_pg_merge_enabled=is_pg_merge_enabled,
)

In [None]:
# Preview the rows currently stored in the raw Iceberg table.
spark.sql('select * from nessie.raw.stock_eod_yahooquery').show()