In [19]:
import os
import yaml
import nbimporter
from datetime import datetime, date
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from raw_yfinance_ingestion import RawYFIngestion  
from lab_registered_tables import RegisteredTables

In [20]:
with open("config_connections.yaml","r") as file:
    config=yaml.safe_load(file)
    catalog_uri = config['docker_env']['catalog_uri'] 
    warehouse = config['docker_env']['warehouse']     # Minio Address to Write to
    storage_uri = config['docker_env']['storage_uri'] # Minio IP address from docker inspec

# Configure Spark with necessary packages and Iceberg/Nessie settings
conf = (
    pyspark.SparkConf()
        .setAppName('finalytics_app')
        # Include necessary packages
        .set('spark.jars.packages',
             'org.postgresql:postgresql:42.7.3,'
             'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,'
             'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.77.1,'
             'software.amazon.awssdk:bundle:2.24.8,'
             'software.amazon.awssdk:url-connection-client:2.24.8')
        # Enable Iceberg and Nessie extensions
        .set('spark.sql.extensions', 
             'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,'
             'org.projectnessie.spark.extensions.NessieSparkSessionExtensions')
        # Configure Nessie catalog
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', catalog_uri)
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        # Set Minio as the S3 endpoint for Iceberg storage
        .set('spark.sql.catalog.nessie.s3.endpoint', storage_uri)
        .set('spark.sql.catalog.nessie.warehouse', warehouse)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')       
)   

# Start Spark session
spark = SparkSession.builder.config(conf=conf).getOrCreate()  


In [21]:
import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.") + str(datetime.now().microsecond)[:3]
zone='raw'
sink_table='raw.stock_eod_yfinance'
config_file_path='registered_table_schemas.yaml'
rt=RegisteredTables(zone, sink_table, config_file_path)
registered_column_list = rt.get_column_list()
       

In [22]:
def fetch_yfinance_record(multi_param_pairs):
    try:
        symbol, start_date = multi_param_pairs
        # Fetch stock data using yfinance
        quote = yf.Ticker(symbol)
        current_date = date.today()
        hist = quote.history(start=start_date, end=current_date)

        # Reset index to include Date as a column and format it
        hist.reset_index(inplace=True)
        hist['Date'] = hist['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

        # Add symbol and import_time to each row
        record_list = [
            tuple(row) + (symbol, import_time) for row in hist.itertuples(index=False)
        ]

        return record_list

    except Exception as e:
        print(f"Error fetching data for {symbol}: {e}")
        return []  # Return an empty list on error

In [23]:
# Function to process the records (pass through parameters)
def process_yfinance_record(single_param_pair):
    # print(f"Processing {single_param_pair}")
    return fetch_yfinance_record(single_param_pair)

In [24]:
# Parallel fetch function
def parallel_fetch(multi_param_pairs):
    # Create RDD from the input parameter pairs
    record_rdd = spark.sparkContext.parallelize(multi_param_pairs)

    # Use flatMap to return a flattened list of records
    results_rdd = record_rdd.flatMap(process_yfinance_record)

    # Collect the results from the RDD and convert to a list of tuples
    # results = results_rdd.collect()        
    df = spark.createDataFrame(results_rdd, registered_column_list)   
    df.show()
    return df
 

In [25]:
# List of stock symbols and start dates

import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.") + str(datetime.now().microsecond)[:3]
zone='raw'
sink_table='raw.stock_eod_yfinance'
config_file_path='registered_table_schemas.yaml'
rt=RegisteredTables(zone, sink_table, config_file_path)
registered_column_list = rt.get_column_list()

yf_param_pairs = [
    ('AAPL', '2024-12-10'),
    ('MSFT', '2024-12-10'),
    ('GOOGL', '2024-12-10'),
]

# Instantiate the class

# Fetch data in parallel
stock_data_rows = parallel_fetch(yf_param_pairs)

Error fetching data for AAPL: name 'yf' is not defined                          
Error fetching data for MSFT: name 'yf' is not defined
Error fetching data for GOOGL: name 'yf' is not defined


ValueError: RDD is empty

In [None]:


# Main class for ingestion
class RawYFIngestion:
    # Basic attributes of the class
    def __init__(self, equity_type, zone, sink_table, config_file_path):
        
        self.equity_type = equity_type
        self.zone=zone
        self.sink_table = sink_table
        self.config_file_path = config_file_path
        self.import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.") + str(datetime.now().microsecond)[:3]
        rt=RegisteredTables(self.zone, self.sink_table, self.config_file_path)
        self.registered_column_list = rt.get_column_list()      
        
    # Function to fetch data from Yahoo Finance
    def fetch_yfinance_record(self, multi_param_pairs):
        try:
            symbol, start_date = multi_param_pairs
            # Fetch stock data using yfinance
            quote = yf.Ticker(symbol)
            current_date = date.today()
            hist = quote.history(start=start_date, end=current_date)
            
            # Reset index to include Date as a column and format it
            hist.reset_index(inplace=True)
            hist['Date'] = hist['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

            # Add symbol and import_time to each row
            record_list = [
                tuple(row) + (symbol, self.import_time) for row in hist.itertuples(index=False)
            ]
            
            return record_list
        
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
            return []  # Return an empty list on error
    
    # Function to process the records (pass through parameters)
    def process_yfinance_record(self, single_param_pair):
        # print(f"Processing {single_param_pair}")
        return self.fetch_yfinance_record(single_param_pair)

    # Parallel fetch function
    def parallel_fetch(self, multi_param_pairs):        
      
        # Create RDD from the input parameter pairs
        record_rdd = spark.sparkContext.parallelize(multi_param_pairs)
        
        # Use flatMap to return a flattened list of records
        results_rdd = record_rdd.flatMap(self.process_yfinance_record)
        
        # Collect the results from the RDD and convert to a list of tuples
        # results = results_rdd.collect()        
        df = spark.createDataFrame(results_rdd, self.registered_column_list)   
        df.show()
        return df
 

    


In [None]:

# List of stock symbols and start dates
yf_param_pairs = [
    ('AAPL', '2024-12-10'),
    ('MSFT', '2024-12-10'),
    ('GOOGL', '2024-12-10'),
]

# Instantiate the class
stock_stage = RawYFIngestion('stock', 'raw', 'raw.stock_eod_yfinance', 'registered_table_schemas.yaml')

# Fetch data in parallel
stock_data_rows = stock_stage.parallel_fetch(yf_param_pairs)

In [None]:

# Main class for ingestion
class RawYFIngestion:
    # Basic attributes of the class
    def __init__(self, spark, equity_type, zone, sink_table, config_file_path):
        self.spark=spark
        self.equity_type = equity_type
        self.zone=zone
        self.sink_table = sink_table
        self.config_file_path = config_file_path
        self.import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.") + str(datetime.now().microsecond)[:3]
        rt=RegisteredTables(self.zone, self.sink_table, self.config_file_path)
        self.registered_column_list = rt.get_column_list()
        self.registered_struct_type = rt.get_struct_type()        
        
    # Function to fetch data from Yahoo Finance
    def fetch_yfinance_record(self, multi_param_pairs):
        try:
            symbol, start_date = multi_param_pairs
            # Fetch stock data using yfinance
            quote = yf.Ticker(symbol)
            current_date = date.today()
            hist = quote.history(start=start_date, end=current_date)
            
            # Reset index to include Date as a column and format it
            hist.reset_index(inplace=True)
            hist['Date'] = hist['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

            # Add symbol and import_time to each row
            record_list = [
                tuple(row) + (symbol, self.import_time) for row in hist.itertuples(index=False)
            ]
            
            return record_list
        
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
            return []  # Return an empty list on error
    
    # Function to process the records (pass through parameters)
    def process_yfinance_record(self, single_param_pair):
        # print(f"Processing {single_param_pair}")
        return self.fetch_yfinance_record(single_param_pair)

    # Parallel fetch function
    def parallel_fetch(self, multi_param_pairs):        
      
        # Create RDD from the input parameter pairs
        record_rdd = self.spark.sparkContext.parallelize(multi_param_pairs)
        
        # Use flatMap to return a flattened list of records
        results_rdd = record_rdd.flatMap(self.process_yfinance_record)
        
        # Collect the results from the RDD and convert to a list of tuples
        # results = results_rdd.collect()        
        df = self.spark.createDataFrame(results_rdd, self.registered_column_list)   
        df.show()
        return df
 

    

# # List of stock symbols and start dates
# yf_param_pairs = [
#     ('AAPL', '2024-12-10'),
#     ('MSFT', '2024-12-10'),
#     ('GOOGL', '2024-12-10'),
# ]

# # Instantiate the class
# stock_stage = RawYFIngestion('stock', 'raw', 'raw.stock_eod_yfinance', 'registered_table_schemas.yaml')

# # Fetch data in parallel
# stock_data_rows = stock_stage.parallel_fetch(yf_param_pairs)


In [None]:
import yaml
# Create an empty dictionary
config_dict=dict()
# Add configuration data to the dictionary
config_dict["server"]={'port': 8080, 'host': '0.0.0.0'}
config_dict["logging"]={'level': 'info', 'file': '/var/log/web-server.log'}
config_dict["database"]={'url': 'postgres://user:password@host:port/database','pool': 100}
# Create another dictionary
details_dict= {"Website Name":"HoneyBadger","Author":"Aditya", "Topic":"Configuration Files", "Content Type":"Blog"}
x_dict= {"Website Name":"HoneyBadger","Author":"Aditya", "Topic":"Configuration Files", "Content Type":"Blog"}
list_of_dicts=[config_dict,details_dict, x_dict]
# Save data to a YAML file
with open("web-server-details-1.yaml","w") as file_object:
    yaml.dump_all(list_of_dicts,file_object)

In [None]:
import yaml
with open("config.yaml","r") as file_object:
    config_dict=yaml.load(file_object,Loader=yaml.SafeLoader)
print(config_dict)
print(config_dict["iceberg_env"]["STORAGE_URI"])

        

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType
import yfinance as yf
from pyspark.sql import Row
from multiprocessing import Pool

# Initialize PySpark session
spark = SparkSession.builder \
    .appName("StockDataLoader") \
    .getOrCreate()

# Define a schema for the stock data
schema = StructType([
    StructField("Ticker", StringType(), True),
    StructField("Date", TimestampType(), True),
    StructField("Open", FloatType(), True),
    StructField("High", FloatType(), True),
    StructField("Low", FloatType(), True),
    StructField("Close", FloatType(), True),
    StructField("Adj Close", FloatType(), True),
    StructField("Volume", FloatType(), True)
])

In [None]:
# Function to fetch stock data for a single ticker
from datetime import date, datetime, timedelta


def fetch_stock_data(symbol):
    try:
        quote = yf.Ticker(symbol)
        start_date = '2024-12-01'
        current_date = date.today()
        hist = quote.history(start=start_date, end=current_date)
        import_time=datetime.now()
        # if hist.empty:
        #     sql_script = f"UPDATE fin.stock_symbol SET is_valid= false WHERE symbol='{symbol}';"
        #     # print(sql_script)
        #     self.execute_sql_script(sql_script)

        # Reset index to include the Date column in the DataFrame
        hist.reset_index(inplace=True)
        # # get column list with extra fields
        column_list = [x.lower().replace(" ", "_") for x in hist.columns]
        extra_field_list = ['symbol', 'import_time']
        column_list.extend(extra_field_list)

        # get records with appended extra fields
        hist_records_map = hist.itertuples(index=False)
        record_list = [tuple(row) + (symbol,) + (import_time,) for row in hist_records_map]

        return column_list, record_list
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

In [None]:
column_list, record_list=fetch_stock_data('C')
print(column_list)

In [None]:
def parallel_fetch(tickers):
    with Pool(processes=4) as pool:  # Adjust the number of processes based on your machine's capacity
        results = pool.map(fetch_stock_data, tickers)
    return [row for sublist in results for row in sublist]

In [None]:
# List of tickers to fetch
tickers = ["AAPL", "MSFT", "GOOGL", "AMZN"]  # Add more tickers as needed

# Fetch data in parallel
stock_data_rows = parallel_fetch(tickers)

# Convert the data to a Spark DataFrame
stock_df = spark.createDataFrame(stock_data_rows, schema=schema)

# Show some rows
stock_df.show()

In [None]:
from datetime import date, datetime
def get_yfinance_record(symbol):
    try:
        quote = yf.Ticker(symbol)
        start_date = '2024-12-01'
        current_date = date.today()
        print(current_date)
        # print(start_date+timedelta(days=1))

        hist = quote.history(start=start_date, end=current_date)
        print(hist)
        # if hist.empty:
        #     sql_script = f"UPDATE fin.stock_symbol SET is_valid= false WHERE symbol='{symbol}';"
        #     # print(sql_script)
        #     self.execute_sql_script(sql_script)

        # Reset index to include the Date column in the DataFrame
        hist.reset_index(inplace=True)

        # # get column list with extra fields
        # column_list = [x.lower().replace(" ", "_") for x in hist.columns]
        # extra_field_list = ['symbol', 'import_time']
        # column_list.extend(extra_field_list)
        # print(column_list)

        # # get records with extra fields
        # hist_records_map = hist.itertuples(index=False)

        # record_list = [tuple(row) + (symbol,) + (self.import_time,) for row in hist_records_map]

        # print(record_list)

        # return column_list, record_list
    except Exception as e:
        print("x")
        # print(f"An error occurred: {e}")
        # if "delisted" in e:
        #     print("abc")

In [None]:
get_yfinance_record('C')

In [None]:
list1 = [1, 2, 3, 4, 5]
list2 = [3, 4, 5]

# Convert lists to sets
set1 = set(list1)
set2 = set(list2)

# Find the difference between the sets
difference1 = set1 - set2  # Elements in list1 but not in list2
difference2 = set2 - set1  # Elements in list2 but not in list1


print(len(difference1))
print("Difference (list1 - list2):", difference1)
print(len(difference2))
print("Difference (list2 - list1):", difference2)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input Data: Tickers with their respective start dates
tickers_data = [
    {"ticker": "AAPL", "start_date": "2020-01-01"},
    {"ticker": "MSFT", "start_date": "2019-01-01"},
    {"ticker": "GOOGL", "start_date": "2021-01-01"},
]

# Create RDD from input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Function to fetch Yahoo Finance data
def fetch_data(params):
    ticker = params["ticker"]
    start_date = params["start_date"]
    try:
        data = yf.download(ticker, start=start_date)
        data.reset_index(inplace=True)
        data["Ticker"] = ticker  # Add a column for ticker
        return data.to_dict("records")  # Convert to a list of dictionaries
    except Exception as e:
        return [{"Ticker": ticker, "Error": str(e)}]

# Fetch data in parallel
results = tickers_rdd.flatMap(fetch_data).collect()

# Convert results to a DataFrame
flattened_results = [item for sublist in results for item in sublist]
df = spark.createDataFrame(flattened_results)

# Show results
df.show(truncate=False)

In [None]:
import yfinance as yf
from datetime import date, datetime, timedelta
# import psycopg2.extras
# from joblib import Parallel, delayed
# import psycopg2
import multiprocessing
# from pgcopy import CopyManager
from io import StringIO
import yaml




# with open("config.yaml","r") as file_object:
#     documents=yaml.safe_load_all(file_object)
#     for doc in documents:
#         doc_name = doc['document_name']
#         if doc_name=='yfinance_stock':
#             registered_col_list=doc['column_list']

#         if doc_name=='iceberg_env':
#             CATALOG_URI = doc['catalog_uri'] # Nessie Server URI
#             WAREHOUSE = doc['warehouse']     # Minio Address to Write to
#             STORAGE_URI = doc['storage_uri'] # Minio IP address from docker inspec



class MyCustomException(Exception):
    pass

class YFinanceStageIngestion:
    def __init__(self, equity_type, destination):
        self.equity_type = equity_type
        self.destination = destination
        self.import_time=datetime.now()
        
        # Get yfinance stock data registered column list
        with open("config.yaml","r") as file_object:
            documents=yaml.safe_load_all(file_object)
            for doc in documents:
                doc_name = doc['document_name']
                if doc_name==f"yfinance_{equity_type}":
                    self.registered_col_list=doc['registered_column_list']

    
        # self.import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.") + str(datetime.now().microsecond)[:3]
    
    def fetch_yfinance_data(self, args):
        try:
            symbol = args["symbol"]
            start_date = args["start_date"]
            
            quote = yf.Ticker(symbol)
            # start_date = '2024-12-01'
            current_date = date.today()
            hist = quote.history(start=start_date, end=current_date)
            # import_time=datetime.now()
    
            # Reset index to include the Date column in the DataFrame
            hist.reset_index(inplace=True)
            
            # Standardize the hist column name by lowering the case of the original column name and replacing space with underscore
            # This standardized column names are reginstered in configuration file
            standardized_column_list = [x.lower().replace(" ", "_") for x in hist.columns]  
            
            # Add symbol and import_time in column list
            extra_field_list = ['symbol', 'import_time']
            standardized_column_list.extend(extra_field_list)
            
            # Check whether the standardized column names match the registered ones   
            set_standardized = set(standardized_column_list)
            set_registered = set(self.registered_column_list)            
            if set_standardized!=set_registered:
                raise MyCustomException(f"Error: standardized_column_list {str(standardized_column_list)} does not match registered_column_list {str(registered_column_list)}!")

            # Add symbol and import_time in each record
            hist_records_map = hist.itertuples(index=False)            
            record_list = [tuple(row) + (symbol,) + (self.import_time,) for row in hist_records_map]  
            
            return standardized_column_list, record_list
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
            return []


    def parallel_fetch(self, param_pairs):
        yfinance_data_rdd = spark.sparkContext.parallelize(param_pairs)
        results = yfinance_data_rdd.flatMap(self.fetch_yfinance_data).collect()
        flattened_results = [item for sublist in results for item in sublist]
        df = spark.createDataFrame(flattened_results)
        df.show(truncate=False)
            
            

stock_stage = YFinanceStageIngestion('stock', 'mytable')
        
# List of tickers to fetch
stock_param_pairs = [("AAPL", "2024-12-1"), ("MSFT", "2024-12-5"), ("GOOGL", "2024-12-9")]  # Add more tickers as needed

stock_stage.parallel_fetch(stock_param_pairs)

# Fetch data in parallel
stock_data_rows = parallel_fetch(stock_param_pairs)

# # Convert the data to a Spark DataFrame
# stock_df = spark.createDataFrame(stock_data_rows, schema=schema)

# # Show some rows
# stock_df.show()


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input Data: Tickers with their respective start dates
tickers_data = [
    {"ticker": "AAPL", "start_date": "2020-01-01"},
    {"ticker": "MSFT", "start_date": "2019-01-01"},
    {"ticker": "GOOGL", "start_date": "2021-01-01"},
]

# Create RDD from input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Function to fetch Yahoo Finance data
def fetch_data(params):
    ticker = params["ticker"]
    start_date = params["start_date"]
    try:
        quote = yf.Ticker(ticker)
        hist = quote.history(start=start_date, end=current_date)
        hist.reset_index(inplace=True)
        return hist.to_dict("records")  # Convert to a list of dictionaries
    except Exception as e:
        return [{"Ticker": ticker, "Error": str(e)}]

# Fetch data in parallel
results = tickers_rdd.flatMap(fetch_data).collect()

# Convert results to a DataFrame
flattened_results = [item for sublist in results for item in sublist]
df = spark.createDataFrame(flattened_results)

# Show results
df.show(truncate=False)


In [None]:

from pyspark.sql import SparkSession
import yfinance as yf
import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()




symbol = 'aasdfljksadfjlksd'
start_date = ""

quote = yf.Ticker('AAPL')
# start_date = '2024-12-01'
current_date = date.today()
hist = quote.history()
# import_time=datetime.now()

# Reset index to include the Date column in the DataFrame
hist.reset_index(inplace=True)

# Standardize the hist column name by lowering the case of the original column name and replacing space with underscore
# This standardized column names are reginstered in configuration file
standardized_column_list = [x.lower().replace(" ", "_") for x in hist.columns]  

# Add symbol and import_time in column list
extra_field_list = ['symbol', 'import_time']
standardized_column_list.extend(extra_field_list)
print(standardized_column_list)

# # Check whether the standardized column names match the registered ones   
# set_standardized = set(standardized_column_list)
# set_registered = set(self.registered_column_list)            
# if set_standardized!=set_registered:
#     raise MyCustomException(f"Error: standardized_column_list {str(standardized_column_list)} does not match registered_column_list {str(registered_column_list)}!")

# # Add symbol and import_time in each record
# hist_records_map = hist.itertuples(index=False)            
# record_list = [tuple(row) + (symbol,) + (self.import_time,) for row in hist_records_map]  

# return standardized_column_list, record_list

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType, IntegerType
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tickers (start_date is fixed as '2024-12-01' for all)
tickers_data = [
    ('AAPL', '2024-12-01'),
    ('MSFT', '2024-12-01'),
    ('GOOGL', '2024-12-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Fetch function to retrieve all stock data fields with ticker and fixed start_date
def fetch_stock_data_with_ticker_and_date(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        stock_data['Date'] = stock_data['Date'].dt.strftime('%Y-%m-%d %H:%M:%S%z')
        
        # Convert all rows to tuples and append ticker and start_date
        return [tuple(row) + (ticker, start_date) for row in stock_data.itertuples(index=False)]
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Function to process ticker data and call the fetch function with unpacked params
def process_ticker_data(ticker_data):
    print(ticker_data)
    return fetch_stock_data_with_ticker_and_date(*ticker_data)

# Use flatMap to process data in parallel and collect all rows as a single list of tuples
results_rdd = tickers_rdd.flatMap(process_ticker_data)

# Collect the data as a list of tuples
results = results_rdd.collect()

# Define the schema for the DataFrame
schema = StructType([
    StructField("Date", StringType(), True),        # 'Date' column as Timestamp
    StructField("Open", FloatType(), True),            # 'Open' column as Float
    StructField("High", FloatType(), True),            # 'High' column as Float
    StructField("Low", FloatType(), True),             # 'Low' column as Float
    StructField("Close", FloatType(), True),           # 'Close' column as Float
    StructField("Volume", IntegerType(), True),            # 'Volume' column as Int
    StructField("Dividends", FloatType(), True),       # 'Dividends' column as Float
    StructField("Stock Splits", FloatType(), True),    # 'Stock Splits' column as Float
    StructField("Ticker", StringType(), True),         # 'Ticker' column as String
    StructField("Start Date", StringType(), True)      # 'Start Date' column as String
])

# Create a DataFrame using the schema
df = spark.createDataFrame(results, schema)

# Show the DataFrame content
df.show(truncate=False)

In [None]:
from pyspark.sql import SparkSession
import yfinance as yf
import numpy as np

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tuples with tickers and start dates
tickers_data = [
    ('AAPL', '2024-12-01'),
    ('MSFT', '2024-12-01'),
    ('GOOGL', '2024-12-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Function to fetch daily data for a ticker using yf.Ticker
def fetch_data(params):
    ticker, start_date = params
    try:
        # Fetch the ticker object
        quote = yf.Ticker(ticker)
        
        # Get historical market data
        data = quote.history(start=start_date)
        
        # Reset index to include the 'Date' column
        data.reset_index(inplace=True)
        
        
        rows = data.apply(lambda row: tuple(row), axis=1).tolist()       

        return rows
    except Exception as e:
        # Return empty list on error
        return []

# Use flatMap to fetch data in parallel
results_rdd = tickers_rdd.flatMap(fetch_data)

# Create the DataFrame by inferring schema from the RDD
df = spark.createDataFrame(results_rdd)

# Show the results
df.show(truncate=False)

In [None]:

from pyspark.sql import SparkSession

import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType, IntegerType
import yfinance as yf


from datetime import date, datetime, timedelta

class MyCustomException(Exception):
    pass

class YFinanceStageIngestion:
    def __init__(self, equity_type, destination):
        self.equity_type = equity_type
        self.destination = destination
        self.import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.") + str(datetime.now().microsecond)[:3]      
        
   
    def fetch_yfinance_record(self, params):
        try:
            symbol, start_date = params      
            quote = yf.Ticker(symbol)
            current_date = date.today()
            hist = quote.history(start=start_date, end=current_date)
   
            # Reset index to include the Date column in the DataFrame
            hist.reset_index(inplace=True)
            hist['Date'] = hist['Date'].dt.strftime('%Y-%m-%d %H:%M:%S%z')

            # Add symbol and import_time in each record
            hist_records_map = hist.itertuples(index=False)            
            record_list = [tuple(row) + (symbol, self.import_time) for row in hist_records_map]
            record_list=[]
            return record_list
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
            return []    
    
    def process_yfinance_record(self, param):
        print(param)
        return self.fetch_yfinance_record(*param)

        
    def parallel_fetch(self, param_pairs):       
        spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()
        record_rdd = spark.sparkContext.parallelize(param_pairs)        
        results_rdd = record_rdd.flatMap(self.process_yfinance_record)  
        results = results_rdd.collect()
        print(results)
        

stock_param_pairs = [
    ('AAPL', '2024-12-10'),
    ('MSFT', '2024-12-10'),
    ('GOOGL', '2024-12-10'),
]
      
stock_stage = YFinanceStageIngestion('stock', 'mytable')
        

# Fetch data in parallel
stock_data_rows = stock_stage.parallel_fetch(stock_param_pairs)




# # Input data: List of tickers (start_date is fixed as '2024-12-01' for all)
# tickers_data = [
#     ('AAPL', '2024-12-01'),
#     ('MSFT', '2024-12-01'),
#     ('GOOGL', '2024-12-01'),
# ]

# # Create an RDD from the input data
# tickers_rdd = spark.sparkContext.parallelize(tickers_data)













In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType
import yfinance as yf
from datetime import datetime, date

# Define the custom exception
class MyCustomException(Exception):
    pass

# Main class for ingestion
class YFinanceStageIngestion:
    def __init__(self, equity_type, destination):
        self.equity_type = equity_type
        self.destination = destination
        self.import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.") + str(datetime.now().microsecond)[:3]      

    # Function to fetch data from Yahoo Finance
    def fetch_yfinance_record(self, params):
        try:
            symbol, start_date = params
            # Fetch stock data using yfinance
            quote = yf.Ticker(symbol)
            current_date = date.today()
            hist = quote.history(start=start_date, end=current_date)
            
            # Reset index to include Date as a column and format it
            hist.reset_index(inplace=True)
            hist['Date'] = hist['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

            # Add symbol and import_time to each row
            record_list = [
                tuple(row) + (symbol, self.import_time) for row in hist.itertuples(index=False)
            ]
            return record_list
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
            return []  # Return an empty list on error
    
    # Function to process the records (pass through parameters)
    def process_yfinance_record(self, param):
        print(f"Processing {param}")
        return self.fetch_yfinance_record(param)

    # Parallel fetch function
    def parallel_fetch(self, param_pairs):
        # Create Spark session
        spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()
        
        # Create RDD from the input parameter pairs
        record_rdd = spark.sparkContext.parallelize(param_pairs)
        
        # Use flatMap to return a flattened list of records
        results_rdd = record_rdd.flatMap(self.process_yfinance_record)
        
        # Collect the results from the RDD
        results = results_rdd.collect()
        return results

# List of stock symbols and start dates
stock_param_pairs = [
    ('AAPL', '2024-12-10'),
    ('MSFT', '2024-12-10'),
    ('GOOGL', '2024-12-10'),
]

# Instantiate the class
stock_stage = YFinanceStageIngestion('stock', 'mytable')

# Fetch data in parallel
stock_data_rows = stock_stage.parallel_fetch(stock_param_pairs)

spark = SparkSession.builder.appName("YahooFinanceDatax").getOrCreate()


# You can also load the result into a DataFrame if required
from pyspark.sql import Row
schema = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'Ticker', 'ImportTime']
rdd_rows = spark.sparkContext.parallelize(stock_data_rows)
df = spark.createDataFrame(rdd_rows, schema)
df.show()


In [None]:
from pyspark.sql import SparkSession
import yfinance as yf
import numpy as np

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tuples with tickers and start dates
tickers_data = [
    ('AAPL', '2024-12-01'),
    ('MSFT', '2024-12-01'),
    ('GOOGL', '2024-12-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Function to fetch daily data for a ticker using yf.Ticker
def fetch_data(params):
    ticker, start_date = params
    try:
        # Fetch the ticker object
        quote = yf.Ticker(ticker)
        
        # Get historical market data
        data = quote.history(start=start_date)
        
        # Reset index to include the 'Date' column
        data.reset_index(inplace=True)
        
        
        rows = data.apply(lambda row: tuple(row), axis=1).tolist()       

        return rows
    except Exception as e:
        # Return empty list on error
        return []

# Use flatMap to fetch data in parallel
results_rdd = tickers_rdd.flatMap(fetch_data)

# Create the DataFrame by inferring schema from the RDD
df = spark.createDataFrame(results_rdd)

# Show the results
df.show(truncate=False)


In [None]:
from pyspark.sql import SparkSession
import yfinance as yf
import numpy as np

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tuples with tickers and start dates
tickers_data = [
    ('AAPL', '2020-01-01'),
    ('MSFT', '2019-01-01'),
    ('GOOGL', '2021-01-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Function to fetch daily data for a ticker using yf.Ticker
def fetch_data(params):
    ticker, start_date = params
    try:
        # Fetch the ticker object
        quote = yf.Ticker(ticker)
        
        # Get historical market data
        data = quote.history(start=start_date)
        
        # Reset index to include the 'Date' column
        data.reset_index(inplace=True)
        
        # Convert the DataFrame to a list of tuples
        rows = data.apply(lambda row: tuple(row), axis=1).tolist()
        
        return rows
    except Exception as e:
        # Return empty list on error
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Use flatMap to fetch data in parallel
results_rdd = tickers_rdd.flatMap(fetch_data)

# Extract column names dynamically from the data
# After resetting the index, the columns should include 'Date', 'Open', 'Close', etc.
data = next(iter(results_rdd))  # Get the first element to infer columns
columns = [col for col in data[0]._fields]  # Extract column names dynamically from the first row

# Create the DataFrame by inferring schema from the RDD
df = spark.createDataFrame(results_rdd, schema=columns)

# Show the results
df.show(truncate=False)


In [None]:
symbol='C'
quote = yf.Ticker(symbol)
start_date ='2024-12-11'
current_date = date.today()
# print(start_date+timedelta(days=1))

hist = quote.history(start=start_date, end=current_date)
import_time=datetime.now()

# Reset index to include the Date column in the DataFrame
hist.reset_index(inplace=True)

# get column list with extra fields
column_list = [x.lower().replace(" ", "_") for x in hist.columns]
extra_field_list = ['symbol', 'import_time']
column_list.extend(extra_field_list)
# print(column_list)

# get records with extra fields
hist_records_map = hist.itertuples(index=False)

record_list = [tuple(row) + (symbol,) + (import_time,) for row in hist_records_map]

print(record_list)


In [None]:
from pyspark.sql import SparkSession
import yfinance as yf
import pandas as pd
import numpy as np

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tuples with ticker and start date
tickers_data = [
    ('AAPL', '2024-01-01'),
    ('MSFT', '2024-01-01'),
    ('GOOGL', '2024-01-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# General fetch function
# def fetch_stock_data(ticker, start_date):
#     try:
#         # Fetch the stock data using yfinance
#         stock_data = yf.Ticker(ticker).history(start=start_date)
#         stock_data.reset_index(inplace=True)  # Reset index to get 'Date' as a column
        
#         # Convert the DataFrame into a list of tuples
#         rows = [
#             (
#                 row['Date'].strftime('%Y-%m-%d') if isinstance(row['Date'], pd.Timestamp) else row['Date'],
#                 float(row['Open']) if not np.isnan(row['Open']) else None,
#                 float(row['High']) if not np.isnan(row['High']) else None,
#                 float(row['Low']) if not np.isnan(row['Low']) else None,
#                 float(row['Close']) if not np.isnan(row['Close']) else None,
#                 float(row['Volume']) if not np.isnan(row['Volume']) else None,
#             )
#             for _, row in stock_data.iterrows()
#         ]
#         return rows
#     except Exception as e:
#         print(f"Error fetching data for {ticker}: {e}")
#         return []

def fetch_stock_data(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        
        # Convert all fields from the DataFrame into a list of tuples
        data_tuples = [tuple(row) for row in stock_data.itertuples(index=False)]
        
        return data_tuples
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Wrapper function for map
def wrapper(params, fetch_fn):
    ticker, start_date = params
    return fetch_fn(ticker, start_date)

# Use map with fetch function as parameter
results_rdd = tickers_rdd.map(lambda params: wrapper(params, fetch_stock_data))

# Flatten the results RDD
flattened_rdd = results_rdd.flatMap(lambda x: x)

# Define schema for the final DataFrame
schema = ["date", "open", "high", "low", "close", "volume"]

# Create a DataFrame from the flattened RDD
stock_df = spark.createDataFrame(flattened_rdd, schema=schema)

# Show the resulting DataFrame
stock_df.show(truncate=False)


In [None]:
import yfinance as yf
import pandas as pd
import numpy as np

def fetch_stock_data_as_tuples(ticker, start_date):
    try:
        # Fetch the stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column

        # Convert the DataFrame into a list of tuples
        data_tuples = [
            (
                row['Date'].strftime('%Y-%m-%d') if isinstance(row['Date'], pd.Timestamp) else row['Date'],
                float(row['Open']) if not np.isnan(row['Open']) else None,
                float(row['High']) if not np.isnan(row['High']) else None,
                float(row['Low']) if not np.isnan(row['Low']) else None,
                float(row['Close']) if not np.isnan(row['Close']) else None,
                float(row['Volume']) if not np.isnan(row['Volume']) else None,
            )
            for _, row in stock_data.iterrows()
        ]
        return data_tuples
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Example usage
ticker = "AAPL"
start_date = "2020-01-01"
stock_tuples = fetch_stock_data_as_tuples(ticker, start_date)

# Print the first few tuples
print(stock_tuples[:5])


In [None]:
import yfinance as yf
import pandas as pd

def fetch_all_fields_as_tuples(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        
        # Convert all fields from the DataFrame into a list of tuples
        data_tuples = [tuple(row) for row in stock_data.itertuples(index=False)]
        
        return data_tuples
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Example usage
ticker = "AAPL"
start_date = "2020-01-01"
stock_tuples = fetch_all_fields_as_tuples(ticker, start_date)

# Print the first few tuples
print(stock_tuples[:5])

In [None]:
from pyspark.sql import SparkSession
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tuples with tickers and start dates
tickers_data = [
    ('AAPL', '2020-01-01'),
    ('MSFT', '2019-01-01'),
    ('GOOGL', '2021-01-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Fetch function to retrieve all stock data fields
def fetch_stock_data_all_fields(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        
        # Convert all rows to tuples
        return [tuple(row) for row in stock_data.itertuples(index=False)]
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Use map instead of flatMap
results_rdd = tickers_rdd.map(lambda params: fetch_stock_data_all_fields(params[0], params[1]))

# Collect and print results
results = results_rdd.collect()
print(results)
# for result in results:
#     print(result[:3])  # Print the first 3 rows for each ticker

In [None]:
from pyspark.sql import SparkSession
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tickers (start_date is fixed as '2024-12-01' for all)
tickers_data = [
    ('AAPL', '2024-12-01'),
    ('MSFT', '2024-12-01'),
    ('GOOGL', '2024-12-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Fetch function to retrieve all stock data fields with ticker and fixed start_date
def fetch_stock_data_with_ticker_and_date(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        
        # Convert all rows to tuples and append ticker and start_date
        return [tuple(row) + (ticker, start_date) for row in stock_data.itertuples(index=False)]
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Use map to process data in parallel
results_rdd = tickers_rdd.map(lambda params: fetch_stock_data_with_ticker_and_date(params[0], params[1]))

# Collect and print results
results = results_rdd.collect()
for ticker_data in results:
    print(ticker_data[:3])  # Print the first 3 rows for each ticker


In [None]:
from pyspark.sql import SparkSession
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tickers (start_date is fixed as '2024-12-01' for all)
tickers_data = [
    ('AAPL', '2024-12-01'),
    ('MSFT', '2024-12-01'),
    ('GOOGL', '2024-12-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Fetch function to retrieve all stock data fields with ticker and fixed start_date
def fetch_stock_data_with_ticker_and_date(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        
        # Convert all rows to tuples and append ticker and start_date
        return [tuple(row) + (ticker, start_date) for row in stock_data.itertuples(index=False)]
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Use flatMap to process data in parallel and collect all rows as tuples
results_rdd = tickers_rdd.flatMap(lambda params: fetch_stock_data_with_ticker_and_date(params[0], params[1]))

# Collect the data as a list of tuples
results = results_rdd.collect()

# Define the schema based on the yfinance columns and additional fields
columns = [
    "Date", "Open", "High", "Low", "Close", "Volume", "Dividends", "Stock Splits",
    "Ticker", "Start_Date"
]

# Create a Spark DataFrame
df = spark.createDataFrame(results, schema=columns)

# Show the DataFrame
df.show(5)


In [None]:
from pyspark.sql import SparkSession
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tickers (start_date is fixed as '2024-12-01' for all)
tickers_data = [
    ('AAPL', '2024-12-01'),
    ('MSFT', '2024-12-01'),
    ('GOOGL', '2024-12-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Fetch function to retrieve all stock data fields with ticker and fixed start_date
def fetch_stock_data_with_ticker_and_date(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        
        # Convert 'Date' to string format and append ticker and start_date
        result = []
        for row in stock_data.itertuples(index=False):
            row_date = row[0].strftime('%Y-%m-%d')  # Convert Date to string
            result.append(tuple([row_date] + list(row[1:]) + [ticker, start_date]))
        
        return result
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Use flatMap to process data in parallel and collect all rows as tuples
results_rdd = tickers_rdd.flatMap(lambda params: fetch_stock_data_with_ticker_and_date(params[0], params[1]))

# Collect the data as a list of tuples
results = results_rdd.collect()

# Define the schema based on the yfinance columns and additional fields
columns = [
    "Date", "Open", "High", "Low", "Close", "Volume", "Dividends", "Stock Splits",
    "Ticker", "Start_Date"
]

# Create a Spark DataFrame
df = spark.createDataFrame(results, schema=columns)

# Show the DataFrame
df.show(5)


In [None]:
from pyspark.sql import SparkSession
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tickers (start_date is fixed as '2024-12-01' for all)
tickers_data = [
    ('AAPL', '2024-12-01'),
    ('MSFT', '2024-12-01'),
    ('GOOGL', '2024-12-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Fetch function to retrieve all stock data fields with ticker and fixed start_date
def fetch_stock_data_with_ticker_and_date(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        
        # Convert all rows to tuples and append ticker and start_date
        return [tuple(row) + (ticker, start_date) for row in stock_data.itertuples(index=False)]
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Use flatMap to process data in parallel and collect all rows as a single list of tuples
results_rdd = tickers_rdd.flatMap(lambda params: fetch_stock_data_with_ticker_and_date(params[0], params[1]))

# Collect the data as a list of tuples
results = results_rdd.collect()
print(results)

# Print the first 3 tuples to check the result
for row in results[:3]:
    print(row)


In [None]:
from pyspark.sql import SparkSession
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tickers (start_date is fixed as '2024-12-01' for all)
tickers_data = [
    ('AAPL', '2024-12-10'),
    ('MSFT', '2024-12-10'),
    ('GOOGL', '2024-12-10'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Fetch function to retrieve all stock data fields with ticker and fixed start_date
def fetch_stock_data_with_ticker_and_date(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column

        stock_data['Date'] = stock_data['Date'].dt.strftime('%Y-%m-%d %H:%M:%S%z')
        
        
        # Convert all rows to tuples and append ticker and start_date
        return [tuple(row) + (ticker, start_date) for row in stock_data.itertuples(index=False)]
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Function to process ticker data and call the fetch function with unpacked params
def process_ticker_data(ticker_data):
    return fetch_stock_data_with_ticker_and_date(*ticker_data)

# Use flatMap to process data in parallel and collect all rows as a single list of tuples
results_rdd = tickers_rdd.flatMap(process_ticker_data)

# Collect the data as a list of tuples
results = results_rdd.collect()
print(results)
# # Print the first 3 tuples to check the result
# for row in results[:10]:
#     print(row)


In [None]:
import yfinance as yf
import pandas as pd

def fetch_stock_data_with_ticker_and_date(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        
        if stock_data['Date'].dt.tz is not None:
            stock_data['Date'] = stock_data['Date'].dt.tz_localize(None)
        # Convert all rows to tuples, include ticker and start_date, dynamically using all fields
        return [
            tuple(row) + (ticker, start_date)  # Add ticker and start_date to the tuple
            for row in stock_data.itertuples(index=False)
        ]
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

    
x=fetch_stock_data_with_ticker_and_date('C', '2024-12-11')
print(x)
                                    

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType, IntegerType
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Define the schema for the DataFrame
schema = StructType([
    StructField("Date", StringType(), True),
    StructField("Open", FloatType(), True),
    StructField("High", FloatType(), True),
    StructField("Low", FloatType(), True),
    StructField("Close", FloatType(), True),
    StructField("Volume", IntegerType(), True),
    StructField("Dividends", FloatType(), True),
    StructField("Stock Splits", FloatType(), True),
    StructField("Ticker", StringType(), True),
    StructField("Start Date", StringType(), True)
])

# Input data: List of tickers and start date
tickers_data = [
    ('AAPL', '2024-12-01'),
    ('MSFT', '2024-12-01'),
    ('GOOGL', '2024-12-01')
]

# Fetch function to retrieve stock data
def fetch_stock_data_with_ticker_and_date(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        
        
        stock_data['Date'] = stock_data['Date'].dt.strftime('%Y-%m-%d %H:%M:%S%z')
        
        # Convert all rows to tuples, include ticker and start_date
        return [
            tuple(row) + (ticker, start_date)  # Add ticker and start_date to the tuple
            for row in stock_data.itertuples(index=False)
        ]
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Create an RDD from the tickers data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Fetch the stock data for all tickers using map and collect the results
results_rdd = tickers_rdd.flatMap(lambda params: fetch_stock_data_with_ticker_and_date(params[0], params[1]))

# Collect the results as a list of tuples
results = results_rdd.collect()

# Create a Spark DataFrame from the results
df = spark.createDataFrame(results, schema)

# Show the DataFrame
df.show(truncate=False)


In [None]:
# from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType, IntegerType


print(results)
# Define the schema for the DataFrame
new_schema = [
    StructField("Date", TimestampType(), True),        # 'Date' column as Timestamp
    StructField("Open", FloatType(), True),            # 'Open' column as Float
    StructField("High", FloatType(), True),            # 'High' column as Float
    StructField("Low", FloatType(), True),             # 'Low' column as Float
    StructField("Close", FloatType(), True),           # 'Close' column as Float
    StructField("Volume", IntegerType(), True),              # 'Volume' column as Float
    StructField("Dividends", FloatType(), True),       # 'Dividends' column as Float
    StructField("Stock Splits", FloatType(), True),    # 'Stock Splits' column as Float
    StructField("Ticker", StringType(), True),         # 'Ticker' column as String
    StructField("Start Date", StringType(), True)      # 'Start Date' column as String
]

# Create a StructType object
schema = StructType(new_schema)

df = spark.createDataFrame(results, schema)

# Show the DataFrame
df.show(truncate=False)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType
from datetime import datetime
import pytz

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Example result list containing tuples (with timezone info)
results = [
    ('2024-12-01 00:00:00-05:00', 145.22, 146.40, 144.50, 145.50, 1200000.0, 0.0, 0.0, 'AAPL', '2024-12-01'),
    ('2024-12-02 00:00:00-05:00', 146.50, 147.60, 145.70, 146.80, 1100000.0, 0.0, 0.0, 'AAPL', '2024-12-01'),
    ('2024-12-01 00:00:00-05:00', 101.00, 102.20, 100.50, 101.80, 1000000.0, 0.0, 0.0, 'MSFT', '2024-12-01')
]

# Convert 'Date' field (with timezone) to timestamp without timezone
def convert_to_timestamp(date_str):
    # Remove the timezone part and convert the string to datetime object
    return datetime.strptime(date_str.split(' ')[0], "%Y-%m-%d")

# Create a new list with the converted date
converted_results = [
    (convert_to_timestamp(row[0]), *row[1:]) for row in results
]

# Define the schema for the DataFrame
new_schema = [
    StructField("Date", TimestampType(), True),
    StructField("Open", FloatType(), True),
    StructField("High", FloatType(), True),
    StructField("Low", FloatType(), True),
    StructField("Close", FloatType(), True),
    StructField("Volume", FloatType(), True),
    StructField("Dividends", FloatType(), True),
    StructField("Stock Splits", FloatType(), True),
    StructField("Ticker", StringType(), True),
    StructField("Start Date", StringType(), True)
]

# Create a DataFrame from the converted results and schema
df = spark.createDataFrame(converted_results, schema=new_schema)

# Show the DataFrame
df.show(truncate=False)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType, IntegerType
import yfinance as yf

# Initialize Spark Session
spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()

# Input data: List of tickers (start_date is fixed as '2024-12-01' for all)
tickers_data = [
    ('AAPL', '2024-12-01'),
    ('MSFT', '2024-12-01'),
    ('GOOGL', '2024-12-01'),
]

# Create an RDD from the input data
tickers_rdd = spark.sparkContext.parallelize(tickers_data)

# Fetch function to retrieve all stock data fields with ticker and fixed start_date
def fetch_stock_data_with_ticker_and_date(ticker, start_date):
    try:
        # Fetch stock data using yfinance
        stock_data = yf.Ticker(ticker).history(start=start_date)
        stock_data.reset_index(inplace=True)  # Reset index to include 'Date' as a column
        stock_data['Date'] = stock_data['Date'].dt.strftime('%Y-%m-%d %H:%M:%S%z')
        
        # Convert all rows to tuples and append ticker and start_date
        return [tuple(row) + (ticker, start_date) for row in stock_data.itertuples(index=False)]
    except Exception as e:
        print(f"Error fetching data for {ticker}: {e}")
        return []

# Function to process ticker data and call the fetch function with unpacked params
def process_ticker_data(ticker_data):
    return fetch_stock_data_with_ticker_and_date(*ticker_data)

# Use flatMap to process data in parallel and collect all rows as a single list of tuples
results_rdd = tickers_rdd.flatMap(process_ticker_data)

# Collect the data as a list of tuples
results = results_rdd.collect()

# Define the schema for the DataFrame
schema = StructType([
    StructField("Date", StringType(), True),        # 'Date' column as Timestamp
    StructField("Open", FloatType(), True),            # 'Open' column as Float
    StructField("High", FloatType(), True),            # 'High' column as Float
    StructField("Low", FloatType(), True),             # 'Low' column as Float
    StructField("Close", FloatType(), True),           # 'Close' column as Float
    StructField("Volume", IntegerType(), True),            # 'Volume' column as Int
    StructField("Dividends", FloatType(), True),       # 'Dividends' column as Float
    StructField("Stock Splits", FloatType(), True),    # 'Stock Splits' column as Float
    StructField("Ticker", StringType(), True),         # 'Ticker' column as String
    StructField("Start Date", StringType(), True)      # 'Start Date' column as String
])

# Create a DataFrame using the schema
df = spark.createDataFrame(results, schema)

# Show the DataFrame content
df.show(truncate=False)

In [None]:
import yfinance as yf
import pandas as pd

# Get data
data = yf.download("AAPL", start="2023-12-18", end="2023-12-22")

# Convert the index to datetime and remove timezone
data.index = pd.to_datetime(data.index).tz_localize(None)

print(data.head())

In [None]:
import yfinance as yf

from pandas import Timedelta



# Download data

ticker = "AAPL" 

df = yf.download(ticker, start="2023-01-01", end="2023-12-22")



# Convert date to UTC

df.index = df.index.tz_localize('UTC') 



# Access a specific date in UTC

print(df.index[0])  

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType
import yfinance as yf
from datetime import date, datetime, timedelta
from io import StringIO
import yaml

# import psycopg2.extras
# import psycopg2
# from pgcopy import CopyManager

# Define the custom exception
class MyCustomException(Exception):
    pass

# Main class for ingestion
class RawYFIngestion:
    def __init__(self, equity_type, destination):
        self.equity_type = equity_type
        self.destination = destination
        self.import_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.") + str(datetime.now().microsecond)[:3]
        
       # Get yfinance registered column list
        with open("config.yaml","r") as file_object:
            documents=yaml.safe_load_all(file_object)
            for doc in documents:
                doc_name = doc['document_name']
                if doc_name==f"yfinance_{equity_type}":
                    self.registered_col_list=doc['registered_column_list']

    # Function to fetch data from Yahoo Finance
    def fetch_yfinance_record(self, multi_param_pairs):
        try:
            symbol, start_date = multi_param_pairs
            # Fetch stock data using yfinance
            quote = yf.Ticker(symbol)
            current_date = date.today()
            hist = quote.history(start=start_date, end=current_date)
            
            # Reset index to include Date as a column and format it
            hist.reset_index(inplace=True)
            hist['Date'] = hist['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')

            # Add symbol and import_time to each row
            record_list = [
                tuple(row) + (symbol, self.import_time) for row in hist.itertuples(index=False)
            ]
            return record_list
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
            return []  # Return an empty list on error
    
    # Function to process the records (pass through parameters)
    def process_yfinance_record(self, single_param_pair):
        print(f"Processing {single_param_pair}")
        return self.fetch_yfinance_record(single_param_pair)

    # Parallel fetch function
    def parallel_fetch(self, multi_param_pairs):
        # Create Spark session
        spark = SparkSession.builder.appName("YahooFinanceData").getOrCreate()
        
        # Create RDD from the input parameter pairs
        record_rdd = spark.sparkContext.parallelize(multi_param_pairs)
        
        # Use flatMap to return a flattened list of records
        results_rdd = record_rdd.flatMap(self.process_yfinance_record)
        
        # # Collect the results from the RDD
        # results = results_rdd.collect()
        
        schema = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'Ticker', 'ImportTime']
        # rdd_rows = spark.sparkContext.parallelize(stock_data_rows)
        df = spark.createDataFrame(results_rdd, schema)
        df.show()
        
        
        return df

# List of stock symbols and start dates
yf_param_pairs = [
    ('AAPL', '2024-12-10'),
    ('MSFT', '2024-12-10'),
    ('GOOGL', '2024-12-10'),
]

# Instantiate the class
stock_stage = RawYFIngestion('stock', 'mytable')

# Fetch data in parallel
stock_data_rows = stock_stage.parallel_fetch(yf_param_pairs)




















In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
import os

## DEFINE SENSITIVE VARIABLES
CATALOG_URI = "http://nessie:19120/api/v1"  # Nessie Server URI
WAREHOUSE = "s3://warehouse/"               # Minio Address to Write to
STORAGE_URI = "http://172.22.0.3:9000"      # Minio IP address from docker inspect




import yaml


def load_config(file_path):
    """Load the YAML configuration file."""
    with open(file_path, "r") as file:
        config = yaml.safe_load(file)
    return config

load_config('raw_table_schema.yaml')

In [None]:
import yaml
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

# Load the YAML configuration file
def load_config(file_path):
    with open(file_path, 'r') as f:
        return yaml.safe_load(f)

# Map YAML types to PySpark types
type_mapping = {
    "StringType": StringType,
    "FloatType": FloatType,
    "IntegerType": IntegerType,
}

# Convert YAML schema to PySpark StructType
def create_struct_type(schema_config):
    fields = [
        StructField(field["name"], type_mapping[field["type"]](), field["nullable"])
        for field in schema_config
    ]
    return StructType(fields)

# Initialize Spark session
spark = SparkSession.builder \
    .appName("IcebergTableCreator") \
    .config("spark.sql.catalog.my_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.my_catalog.type", "hadoop") \
    .config("spark.sql.catalog.my_catalog.warehouse", "path/to/warehouse") \
    .getOrCreate()

# Load the YAML configuration for raw tables
config = load_config('raw_table_schemas.yaml')

# Extract the schema and partitioning for the table
table_config = config['raw_tables']['raw.stock_eod_data']
schema = create_struct_type(table_config['schema'])
partition_by = table_config.get('partition_by', [])

# Build the SQL query for creating the Iceberg table
table_name = "my_catalog.raw.stock_eod_data"
schema_columns = ", ".join([f"{field.name} {field.dataType.simpleString()}" for field in schema.fields])
partition_columns = ", ".join([p["field"] for p in partition_by]) if partition_by else ""

# Create the Iceberg table
create_table_query = f"""
CREATE TABLE {table_name} ({schema_columns})
USING iceberg
"""
if partition_columns:
    create_table_query += f" PARTITIONED BY ({partition_columns})"

print(create_table_query)
# # Run the query
# spark.sql(create_table_query)

# print(f"Table {table_name} created successfully.")


In [None]:
import yaml

config_file_path='table_schemas.yaml'
zone='raw'
table='raw.stock_eod_data'
with open(config_file_path, 'r') as f:
    config=yaml.safe_load(f)
print(config[zone][table])



# self.yaml_table_schema=config['schema']?

#     def __init__(self, config_file_path, zone, table):
#         self.config_file_path=config_file_path
#         self.zone=zone
#         self.table=table
#         with open(self.config_file_path, 'r') as f:
#             config=yaml.safe_load(f)
#         self.yaml_table_schema=config['schema']

In [None]:
import yaml
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType

# Load the YAML configuration file
def load_config(file_path):
    with open(file_path, 'r') as f:
        return yaml.safe_load(f)

# Map YAML types to PySpark types
type_mapping = {
    "StringType": StringType,
    "FloatType": FloatType,
    "IntegerType": IntegerType,
}

# Convert YAML schema to PySpark StructType
def create_struct_type(schema_config):
    print(schema_config)
    fields = [
        StructField(field["name"], type_mapping[field["type"]](), field["nullable"])
        for field in schema_config
    ]
    return StructType(fields)

# Initialize Spark session
spark = SparkSession.builder \
    .appName("CreateDataFrameFromYAML") \
    .getOrCreate()

# Load the schema from YAML file
config = load_config('table_schemas.yaml')

# Extract the schema
schema = create_struct_type(config['raw']['raw.stock_eod_data']['schema'])

In [None]:
    def parallel_fetch(self, multi_param_pairs):        
      
        # Create RDD from the input parameter pairs
        
        # record_rdd = self.spark.sparkContext.parallelize(multi_param_pairs)
        
        all_records = []  # Collect all results in a list
        for pair in multi_param_pairs:
            records = self.fetch_yfinance_record(pair)  # Fetch data on the driver
            all_records.extend(records)
        # Use flatMap to return a flattened list of records
        # results_rdd = record_rdd.flatMap(self.process_yfinance_record)
        results_rdd = self.spark.sparkContext.parallelize(all_records)
        
        # Collect the results from the RDD and convert to a list of tuples
        # results = results_rdd.collect()        
        df = self.spark.createDataFrame(results_rdd, self.registered_column_list)   
        df.show()
        return df
