In [0]:
!pip install yfinance

Collecting yfinance
  Downloading yfinance-0.2.65-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py311-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.2.tar.gz (949 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/949.2 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m949.2/949.2 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting beautifulsoup4

In [0]:
# Restart the notebook's Python process. This sits directly after the pip
# install cell, presumably so the freshly installed package can be imported.
dbutils.library.restartPython()

In [0]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import re
import time
from pyspark.sql.functions import regexp_replace

In [0]:
# Table that supplies the ticker symbols to download.
TICKER_TABLE = "finance_catalog.db_landing.src_raw_index_keys"
# Table that downloaded price rows are appended to (and later de-duplicated in).
TABLE_NAME = "finance_catalog.db_landing.src_raw_stock_prices"
# Days of history requested for a ticker that has no rows in TABLE_NAME yet.
DEFAULT_LOOKBACK_DAYS = 365
# Number of tickers downloaded before each append to TABLE_NAME.
BATCH_SIZE = 100

In [0]:
# Load the ticker symbols from TICKER_TABLE as a plain Python list, replacing
# every "." with "-" (presumably the form Yahoo Finance expects, e.g.
# "BRK.B" -> "BRK-B" -- confirm against the source data).
tickers = (
    spark.table(TICKER_TABLE)
    .select(regexp_replace("ticker", "\\.", "-").alias("ticker"))
    .toPandas()
    .loc[:, "ticker"]
    .tolist()
)

In [0]:
def chunk_list(lst, size):
    """Yield consecutive slices of ``lst`` holding at most ``size`` items each.

    Args:
        lst: A sliceable sequence that supports ``len()``.
        size: Maximum number of items per slice; also the step between
            slice start offsets.

    Yields:
        Slices of ``lst`` in order; the final slice may be shorter than
        ``size``. Nothing is yielded for an empty sequence.
    """
    offsets = range(0, len(lst), size)
    yield from (lst[offset:offset + size] for offset in offsets)

In [0]:
# Incrementally download daily prices for every ticker and append them to
# TABLE_NAME one batch of BATCH_SIZE tickers at a time.

# Latest stored date per ticker, fetched with a single grouped query instead of
# one query per ticker; this also keeps ticker values out of the SQL text.
max_dates = {
    row["ticker"]: row["max_date"]
    for row in spark.sql(
        f"SELECT ticker, MAX(date) AS max_date FROM {TABLE_NAME} GROUP BY ticker"
    ).collect()
}

for batch in chunk_list(tickers, BATCH_SIZE):
    ticker_data = []

    for ticker in batch:
        now = datetime.now()
        # Resume from the last stored date, or fall back to the default
        # lookback window for a ticker with no stored rows. The last stored
        # day is requested again, so overlapping rows can be appended; the
        # de-duplication cell at the end of the notebook removes them.
        start_date = max_dates.get(ticker) or now - timedelta(days=DEFAULT_LOOKBACK_DAYS)

        df = yf.download(ticker, start=start_date, end=now + timedelta(days=1), auto_adjust=True)
        time.sleep(1)  # pause between requests, including after an empty result

        # A ticker with no data must not abort the whole run.
        if df is None or df.empty:
            continue

        df = df.reset_index()
        # Drop the second column level only when the columns are two-level.
        if isinstance(df.columns, pd.MultiIndex):
            df.columns = df.columns.droplevel(1)
        # Normalise column names to lower snake_case (e.g. "Date" -> "date").
        df.columns = [re.sub(r"\W+", "_", col).strip("_").lower() for col in df.columns]
        # Row key: ticker followed by the date as YYYYMMDD, kept as first column.
        df.insert(0, 'id', ticker + df['date'].dt.strftime('%Y%m%d'))
        df['ticker'] = ticker
        ticker_data.append(df)

    # Nothing was downloaded for this batch, so there is nothing to write.
    if not ticker_data:
        continue

    batch_df = pd.concat(ticker_data, ignore_index=True)
    batch_df['volume'] = batch_df['volume'].astype(float)

    (
        spark
            .createDataFrame(batch_df)
            .orderBy(["ticker", "date"], ascending=[True, False])
            .write
            .format("delta")
            .mode("append")
            .saveAsTable(TABLE_NAME)
    )

In [0]:
# Rewrite TABLE_NAME in place, keeping one row per (ticker, date) and sorting
# by ticker ascending, then date descending.
deduped_prices = spark.table(TABLE_NAME).dropDuplicates(subset=['ticker', 'date'])
sorted_prices = deduped_prices.orderBy(['ticker', 'date'], ascending=[True, False])
sorted_prices.write.format("delta").mode("overwrite").saveAsTable(TABLE_NAME)