In [0]:
!pip install BeautifulSoup4 lxml html5lib

In [0]:
# Restart the notebook's Python process. Presumably this is here so that the
# packages installed by the preceding pip cell are importable in the cells
# below -- confirm against the Databricks `dbutils.library` documentation.
dbutils.library.restartPython() 

In [0]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import io
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

In [0]:
# Fully-qualified table names: SRC_TABLE_NAME is read for its "ticker" column,
# TABLE_NAME is the Delta table the scraped rows are appended to.
SRC_TABLE_NAME = "finance_catalog.db_landing.src_raw_index_keys"
TABLE_NAME = "finance_catalog.db_landing.src_raw_insiders_trx"

# Spark schema applied to the scraped pandas frame before it is written.
# Every field is nullable; the three numeric fields are doubles, the rest are
# strings. Field order matters because it is the order of the scraped columns.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("filing", StringType),
        ("symbol", StringType),
        ("security", StringType),
        ("reporting_name", StringType),
        ("relationship", StringType),
        ("trans_date", StringType),
        ("purchase_sale", StringType),
        ("shares", DoubleType),
        ("price", DoubleType),
        ("amount", DoubleType),
        ("d_i", StringType),
    )
])

In [0]:
# Build `index`: a plain Python list holding the "ticker" column of the source
# table, collected to the driver through pandas.
ticker_sdf = spark.table(SRC_TABLE_NAME).select("ticker")
ticker_pdf = ticker_sdf.toPandas()
index = ticker_pdf["ticker"].tolist()

In [0]:
# Scrape dataroma.com insider transactions for a slice of the ticker list and
# append the rows to the landing Delta table.
#
# Names this cell takes from earlier cells / the runtime: `index` (list of
# tickers), `schema`, `TABLE_NAME` and the `spark` session.

# Browser-like request headers sent with every call.
# NOTE(review): 'br' is advertised in Accept-Encoding; `requests` can only
# decode brotli responses when a brotli package is installed -- confirm that
# it is available on this cluster, otherwise response.text may be unreadable.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Referer': 'https://www.dataroma.com/',
    'Upgrade-Insecure-Requests': '1',
}

# Seconds to wait on a request before giving up, so a stalled connection
# cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Position of the transactions table among the tables pd.read_html finds on
# an insider page (value carried over from the original scraper).
TRX_TABLE_POSITION = 2

# Slice of `index` handled by this run: index[batch_ini:batch_end].
# With both left at 0 the slice is empty and the cell writes nothing; set them
# to the batch you want (batch_end = None means "through the last ticker").
batch_ini = 0
batch_end = 0


def _read_trx_table(html):
    """Return the transactions table parsed from ``html``, or ``None``.

    ``None`` is returned when the page contains no tables, fewer tables than
    ``TRX_TABLE_POSITION + 1``, or when the transactions table has no rows.
    """
    try:
        # Wrap the markup in StringIO: passing literal HTML strings to
        # pd.read_html is deprecated.
        tables = pd.read_html(io.StringIO(html), flavor='lxml')
    except ValueError:
        # pd.read_html raises ValueError when the document has no tables.
        return None
    if len(tables) <= TRX_TABLE_POSITION:
        return None
    trx_table = tables[TRX_TABLE_POSITION]
    return None if trx_table.empty else trx_table


def _page_numbers(pages_div):
    """Return the distinct numeric link labels inside the pagination div.

    Labels are stripped of surrounding whitespace and returned in document
    order so they can be placed in a URL as-is.
    """
    labels = []
    for link in pages_div.find_all('a'):
        label = link.get_text().strip()
        if label.isdigit() and label not in labels:
            labels.append(label)
    return labels


# One frame per ticker. This list must live outside the loop: resetting it
# per ticker would leave only the last ticker's rows for the final write.
data_frames = []

# A single session is reused for every request and closed when the batch ends.
with requests.Session() as session:
    session.headers.update(headers)

    for ticker in index[batch_ini:batch_end]:
        if not ticker:
            # A null/empty ticker would only produce a meaningless URL.
            continue

        base_url = f"https://www.dataroma.com/m/ins/ins.php?t=d&am=0&sym={ticker}&o=fd&d=d"
        results = []  # page tables collected for this ticker only

        try:
            response = session.get(base_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()

            # The first response is used as page 1, so it is neither dropped
            # (when the current page is not rendered as a link) nor fetched a
            # second time (when it is).
            # NOTE(review): assumes the URL without an L parameter serves
            # page 1 -- confirm against the site.
            first_page = _read_trx_table(response.text)
            if first_page is not None:
                results.append(first_page)

            soup = BeautifulSoup(response.text, 'html.parser')
            pages_div = soup.find("div", id="pages")

            if pages_div is not None:
                for page in _page_numbers(pages_div):
                    if page == "1":
                        continue  # already parsed from the first response

                    url = f"{base_url}&L={page}"
                    page_response = session.get(url, timeout=REQUEST_TIMEOUT)
                    page_response.raise_for_status()

                    page_table = _read_trx_table(page_response.text)
                    if page_table is not None:
                        results.append(page_table)

        except requests.RequestException as exc:
            # Skip the whole ticker rather than keep a partial set of pages or
            # abort the rest of the batch.
            print(f"{ticker}: request failed ({exc}); ticker skipped.")
            continue

        if results:
            ticker_df = pd.concat(results, ignore_index=True)
            data_frames.append(ticker_df)
            print(f"{ticker}: {len(ticker_df)} rows from {len(results)} page(s).")
        else:
            print(f"{ticker}: no transactions table found.")

if data_frames:
    df = pd.concat(data_frames, ignore_index=True)
    # Normalise headers to snake_case, e.g. "Trans. Date" -> "trans_date".
    df.columns = [re.sub(r"\W+", "_", str(col)).strip("_").lower() for col in df.columns]

    # NOTE(review): `schema` declares shares/price/amount as doubles; this
    # relies on pd.read_html having parsed those columns as numbers -- confirm.
    (
        spark
            .createDataFrame(df, schema=schema)
            .write
            .format("delta")
            .mode("append")
            .saveAsTable(TABLE_NAME)
    )
else:
    # Nothing scraped (for example an empty batch slice): skip the write
    # instead of failing on an undefined `df`.
    print(f"No insider transactions scraped; nothing appended to {TABLE_NAME}.")