In [39]:
import os
import yaml
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

with open("config.yaml","r") as file_object:
    config_dict=yaml.load(file_object,Loader=yaml.SafeLoader)

## DEFINE SENSITIVE VARIABLES
CATALOG_URI = config_dict["iceberg_env"]["CATALOG_URI"]    # Nessie Server URI
WAREHOUSE = config_dict["iceberg_env"]["WAREHOUSE"]               # Minio Address to Write to
STORAGE_URI = config_dict["iceberg_env"]["STORAGE_URI"]      # Minio IP address from docker inspect

# Configure Spark with necessary packages and Iceberg/Nessie settings
conf = (
    pyspark.SparkConf()
        .setAppName('sales_data_app')
        # Include necessary packages
        .set('spark.jars.packages', 'org.postgresql:postgresql:42.7.3,org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,software.amazon.awssdk:bundle:2.24.8,software.amazon.awssdk:url-connection-client:2.24.8')
        # Enable Iceberg and Nessie extensions
        .set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
        # Configure Nessie catalog
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', CATALOG_URI)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        # Set Minio as the S3 endpoint for Iceberg storage
        .set('spark.sql.catalog.nessie.s3.endpoint', STORAGE_URI)
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
)


In [25]:
# import yaml
# # Create an empty dictionary
# config_dict=dict()
# # Add configuration data to the dictionary
# config_dict["server"]={'port': 8080, 'host': '0.0.0.0'}
# config_dict["logging"]={'level': 'info', 'file': '/var/log/web-server.log'}
# config_dict["database"]={'url': 'postgres://user:password@host:port/database','pool': 100}
# # Save the dictionary to a YAML file
# with open("./config.yaml","w") as file_object:
#     print (file_object)
#     yaml.dump(config_dict,file_object)
#     print(1)

<_io.TextIOWrapper name='./config.yaml' mode='w' encoding='UTF-8'>
1


In [26]:
import yaml
with open("./config.yaml","r") as file_object:
    data=yaml.load(file_object,Loader=yaml.SafeLoader)
    print(data)

{'database': {'pool': 100, 'url': 'postgres://user:password@host:port/database'}, 'logging': {'file': '/var/log/web-server.log', 'level': 'info'}, 'server': {'host': '0.0.0.0', 'port': 8080}, 'iceberg_env': {'CATALOG_URI': 'http://nessie:19120/api/v1', 'WAREHOUSE': 's3://warehouse/', 'STORAGE_URI': 'http://172.22.0.3:9000'}}


In [28]:
import yaml
# Create an empty dictionary
config_dict=dict()
# Add configuration data to the dictionary
config_dict["server"]={'port': 8080, 'host': '0.0.0.0'}
config_dict["logging"]={'level': 'info', 'file': '/var/log/web-server.log'}
config_dict["database"]={'url': 'postgres://user:password@host:port/database','pool': 100}
# Create another dictionary
details_dict= {"Website Name":"HoneyBadger","Author":"Aditya", "Topic":"Configuration Files", "Content Type":"Blog"}
x_dict= {"Website Name":"HoneyBadger","Author":"Aditya", "Topic":"Configuration Files", "Content Type":"Blog"}
list_of_dicts=[config_dict,details_dict, x_dict]
# Save data to a YAML file
with open("web-server-details-1.yaml","w") as file_object:
    yaml.dump_all(list_of_dicts,file_object)

In [34]:
import yaml
with open("config.yaml","r") as file_object:
    config_dict=yaml.load(file_object,Loader=yaml.SafeLoader)
print(config_dict)
print(config_dict["iceberg_env"]["STORAGE_URI"])

        

{'database': {'pool': 100, 'url': 'postgres://user:password@host:port/database'}, 'logging': {'file': '/var/log/web-server.log', 'level': 'info'}, 'server': {'host': '0.0.0.0', 'port': 8080}, 'iceberg_env': {'CATALOG_URI': 'http://nessie:19120/api/v1', 'WAREHOUSE': 's3://warehouse/', 'STORAGE_URI': 'http://172.22.0.3:9000'}}
http://nessie:19120/api/v1


In [40]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType
import yfinance as yf
from pyspark.sql import Row
from multiprocessing import Pool

# Initialize PySpark session
spark = SparkSession.builder \
    .appName("StockDataLoader") \
    .getOrCreate()

# Define a schema for the stock data
schema = StructType([
    StructField("Ticker", StringType(), True),
    StructField("Date", TimestampType(), True),
    StructField("Open", FloatType(), True),
    StructField("High", FloatType(), True),
    StructField("Low", FloatType(), True),
    StructField("Close", FloatType(), True),
    StructField("Adj Close", FloatType(), True),
    StructField("Volume", FloatType(), True)
])

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/21 19:32:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [70]:
# Function to fetch stock data for a single ticker
from datetime import date, datetime, timedelta


def fetch_stock_data(symbol):
    try:
        quote = yf.Ticker(symbol)
        start_date = '2024-12-01'
        current_date = date.today()
        hist = quote.history(start=start_date, end=current_date)
        import_time=datetime.now()
        # if hist.empty:
        #     sql_script = f"UPDATE fin.stock_symbol SET is_valid= false WHERE symbol='{symbol}';"
        #     # print(sql_script)
        #     self.execute_sql_script(sql_script)

        # Reset index to include the Date column in the DataFrame
        hist.reset_index(inplace=True)
        # # get column list with extra fields
        column_list = [x.lower().replace(" ", "_") for x in hist.columns]
        extra_field_list = ['symbol', 'import_time']
        column_list.extend(extra_field_list)

        # get records with appended extra fields
        hist_records_map = hist.itertuples(index=False)
        record_list = [tuple(row) + (symbol,) + (import_time,) for row in hist_records_map]

        return column_list, record_list
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

In [72]:
column_list, record_list=fetch_stock_data('C')
print(column_list)

['date', 'open', 'high', 'low', 'close', 'volume', 'dividends', 'stock_splits', 'symbol', 'import_time']


In [42]:
def parallel_fetch(tickers):
    with Pool(processes=4) as pool:  # Adjust the number of processes based on your machine's capacity
        results = pool.map(fetch_stock_data, tickers)
    return [row for sublist in results for row in sublist]

In [43]:
# List of tickers to fetch
tickers = ["AAPL", "MSFT", "GOOGL", "AMZN"]  # Add more tickers as needed

# Fetch data in parallel
stock_data_rows = parallel_fetch(tickers)

# Convert the data to a Spark DataFrame
stock_df = spark.createDataFrame(stock_data_rows, schema=schema)

# Show some rows
stock_df.show()

[*********************100%***********************]  1 of 1 completed



Error fetching data for AAPL: 'Adj Close'Error fetching data for MSFT: 'Adj Close'



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Error fetching data for AMZN: 'Adj Close'
Error fetching data for GOOGL: 'Adj Close'


                                                                                

+------+----+----+----+---+-----+---------+------+
|Ticker|Date|Open|High|Low|Close|Adj Close|Volume|
+------+----+----+----+---+-----+---------+------+
+------+----+----+----+---+-----+---------+------+



In [60]:
from datetime import date, datetime
def get_yfinance_record(symbol):
    try:
        quote = yf.Ticker(symbol)
        start_date = '2024-12-01'
        current_date = date.today()
        print(current_date)
        # print(start_date+timedelta(days=1))

        hist = quote.history(start=start_date, end=current_date)
        print(hist)
        # if hist.empty:
        #     sql_script = f"UPDATE fin.stock_symbol SET is_valid= false WHERE symbol='{symbol}';"
        #     # print(sql_script)
        #     self.execute_sql_script(sql_script)

        # Reset index to include the Date column in the DataFrame
        hist.reset_index(inplace=True)

        # # get column list with extra fields
        # column_list = [x.lower().replace(" ", "_") for x in hist.columns]
        # extra_field_list = ['symbol', 'import_time']
        # column_list.extend(extra_field_list)
        # print(column_list)

        # # get records with extra fields
        # hist_records_map = hist.itertuples(index=False)

        # record_list = [tuple(row) + (symbol,) + (self.import_time,) for row in hist_records_map]

        # print(record_list)

        # return column_list, record_list
    except Exception as e:
        print("x")
        # print(f"An error occurred: {e}")
        # if "delisted" in e:
        #     print("abc")

In [61]:
get_yfinance_record('C')

2024-12-21
                                Open       High        Low      Close  \
Date                                                                    
2024-12-02 00:00:00-05:00  71.260002  71.650002  70.879997  71.389999   
2024-12-03 00:00:00-05:00  72.190002  72.800003  71.269997  71.419998   
2024-12-04 00:00:00-05:00  71.500000  71.720001  70.500000  71.500000   
2024-12-05 00:00:00-05:00  71.830002  72.849998  71.639999  72.230003   
2024-12-06 00:00:00-05:00  72.309998  72.599998  71.709999  72.150002   
2024-12-09 00:00:00-05:00  72.300003  72.800003  71.839996  71.860001   
2024-12-10 00:00:00-05:00  72.000000  73.379997  71.580002  72.500000   
2024-12-11 00:00:00-05:00  73.000000  73.260002  71.269997  71.959999   
2024-12-12 00:00:00-05:00  71.860001  72.330002  71.410004  71.430000   
2024-12-13 00:00:00-05:00  71.709999  71.910004  70.760002  71.010002   
2024-12-16 00:00:00-05:00  71.269997  71.769997  70.830002  71.489998   
2024-12-17 00:00:00-05:00  70.900002  71