In [0]:
!pip install BeautifulSoup4 lxml html5lib

Collecting BeautifulSoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting lxml
  Downloading lxml-6.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl.metadata (6.6 kB)
Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl.metadata (16 kB)
Collecting soupsieve>1.2 (from BeautifulSoup4)
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Collecting webencodings (from html5lib)
  Downloading webencodings-0.5.1-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
Downloading lxml-6.0.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl (5.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/5.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m121.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
Downloading soupsieve-2.7-py3-n

In [0]:
# Restart the notebook's Python process so the packages installed by the pip cell above
# are importable in the cells that follow. All Python variables defined before this
# point are lost, which is why the imports come after it.
dbutils.library.restartPython()

In [0]:
import requests
import pandas as pd
import re
import io
from bs4 import BeautifulSoup
import time
import random
from datetime import datetime, timedelta
from pyspark.sql.functions import regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

In [0]:
# Delta table that receives the scraped insider transactions.
TABLE_NAME = "finance_catalog.db_landing.src_raw_insiders_trx"

# Spark schema for the scraped table. Column order matters: it has to line up with the
# columns of the pandas frame handed to createDataFrame. Every column is nullable;
# the three numeric measures are doubles and everything else is kept as raw text.
schema = (
    StructType()
    .add("filing", StringType(), True)
    .add("symbol", StringType(), True)
    .add("security", StringType(), True)
    .add("reporting_name", StringType(), True)
    .add("relationship", StringType(), True)
    .add("trans_date", StringType(), True)
    .add("purchase_sale", StringType(), True)
    .add("shares", DoubleType(), True)
    .add("price", DoubleType(), True)
    .add("amount", DoubleType(), True)
    .add("d_i", StringType(), True)
)

In [0]:
# Raw value of the "run_date" notebook widget; downstream code parses it as YYYY-MM-DD.
run_date: str = dbutils.widgets.get("run_date")

In [0]:
# Work out which filing day this run should load: the calendar day before the
# "run_date" widget value (YYYY-MM-DD).
#
# The widget is read again here instead of re-parsing the `run_date` variable, because
# this cell overwrites `run_date` with a "%d %b %Y" string (e.g. "05 Jul 2025").
# Parsing that string with "%Y-%m-%d" on a second execution of the cell would raise
# ValueError; reading the widget keeps the cell safe to re-run.
previous_day = datetime.strptime(dbutils.widgets.get("run_date"), "%Y-%m-%d") - timedelta(days=1)
run_date = previous_day.strftime("%d %b %Y")

# Prefix match on `filing`: any row whose filing text starts with the formatted day
# counts as "already loaded". `run_date` comes out of strftime, so no free-form text
# reaches the SQL string.
date_exists = (
    spark.sql(f"SELECT 1 FROM {TABLE_NAME} WHERE filing LIKE '{run_date}%' LIMIT 1")
    .count() > 0
)

if date_exists:
    # Stop the notebook here so the scrape/append cells below do not load the day twice.
    print(f"Data for date {run_date} already exists. Skipping task.")
    dbutils.notebook.exit(f"Skipped - Data for {run_date} already exists")

In [0]:
# Browser-like header sets; one is picked at random per run for the scraping session.
#
# Accept-Encoding deliberately lists only gzip and deflate. Those are always decoded by
# requests/urllib3, whereas Brotli ("br") is only decoded when an optional brotli package
# is installed. Advertising "br" without it can leave the response body compressed and
# unparseable.
HEADERS_LIST = [
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    },
    {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4; rv:109.0) Gecko/20100101 Firefox/115.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "DNT": "1"
    },
    {
        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive"
    },
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/115.0.0.0 Safari/537.36",
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Referer": "https://www.google.com/"
    }
]

In [0]:
# Scrape every result page of the insider-transactions listing and append the rows to
# the Delta table TABLE_NAME.

REQUEST_TIMEOUT_SECONDS = 10
# Position of the transactions table among the tables pandas finds on a listing page.
TRX_TABLE_INDEX = 2


def _fetch_html(http_session, url):
    """Return the body of ``url`` as text.

    Args:
        http_session: the ``requests.Session`` used to issue the GET.
        url: absolute URL of the listing page.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an error
            page is never handed to the HTML table parser.
    """
    page_response = http_session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    page_response.raise_for_status()
    return page_response.text


def _parse_trx_table(html):
    """Return the transactions table of one listing page as a pandas DataFrame.

    Raises:
        ValueError: if pandas finds no tables in ``html``.
        IndexError: if the page holds fewer than ``TRX_TABLE_INDEX + 1`` tables.
    """
    return pd.read_html(io.StringIO(html))[TRX_TABLE_INDEX]


headers = random.choice(HEADERS_LIST)
base_url = "https://www.dataroma.com/m/ins/ins.php?t=d&am=0&sym=&o=fd&d=d"

# The context manager closes the session's pooled connections once scraping is done.
with requests.Session() as session:
    session.headers.update(headers)

    first_page_html = _fetch_html(session, base_url)
    soup = BeautifulSoup(first_page_html, 'html.parser')
    pages_div = soup.find("div", id="pages")

    # The first page is already in hand, so parse it directly instead of requesting it
    # again. This also covers a pager that renders the current page as plain text
    # rather than as a link.
    results = [_parse_trx_table(first_page_html)]

    if pages_div is not None:
        link_texts = [a.get_text().strip() for a in pages_div.find_all('a')]
        # dict.fromkeys drops repeated page labels while keeping their order; page "1"
        # is skipped because it was parsed above.
        extra_pages = [
            label for label in dict.fromkeys(link_texts)
            if label.isdigit() and label != "1"
        ]

        for page in extra_pages:
            # Random pause before each follow-up request to avoid hammering the site.
            time.sleep(random.uniform(1.5, 3.5))
            results.append(_parse_trx_table(_fetch_html(session, f"{base_url}&L={page}")))

# Ignore pages whose table carries no rows.
results = [page_table for page_table in results if not page_table.empty]

if results:
    df = pd.concat(results, ignore_index=True)
    # Normalise headers to snake_case, e.g. "Purchase/Sale" -> "purchase_sale".
    df.columns = [re.sub(r"\W+", "_", col).strip("_").lower() for col in df.columns]
    # Rows without a symbol are not transactions. This filter used to sit behind a
    # condition that was never true, so it was silently skipped.
    df = df.dropna(subset=['symbol'])
else:
    df = pd.DataFrame()

if df.empty:
    # Nothing scraped: skip the write instead of failing on an undefined/empty frame.
    print("No insider transactions found; nothing was written.")
else:
    (
        spark
            .createDataFrame(df, schema=schema)
            .write
            .format("delta")
            .mode("append")
            .saveAsTable(TABLE_NAME)
    )