In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Constants
HEADERS = {"User-Agent": "OwnershipDataScraper/1.0 (contact: example@example.com)"}
BASE_URL = "https://data.sec.gov/submissions/CIK{cik}.json"
DOWNLOAD_FOLDER = "downloaded_htm_files"

# Ensure the download folder exists
if not os.path.exists(DOWNLOAD_FOLDER):
    os.makedirs(DOWNLOAD_FOLDER)

# Step 1: Fetch Filing Folders
def fetch_filing_folders(cik, start_year, end_year, form_types):
    """
    Fetches filing folder URLs for a given CIK, year range, and form types.
    """
    url = BASE_URL.format(cik=cik.zfill(10))
    response = requests.get(url, headers=HEADERS)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch filings for CIK {cik}. Status code: {response.status_code}")

    data = response.json()
    filings = data.get("filings", {}).get("recent", {})
    form_types_list = filings.get("form", [])
    accession_numbers = filings.get("accessionNumber", [])
    filing_dates = filings.get("filingDate", [])

    folders = []
    for form, acc_no, filing_date in zip(form_types_list, accession_numbers, filing_dates):
        if form in form_types:
            year = int(filing_date.split("-")[0])
            if start_year <= year <= end_year:
                formatted_acc_no = acc_no.replace("-", "")
                folder_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{formatted_acc_no}/"
                folders.append(folder_url)
    return folders

# Step 2: Find and Process All .htm Files
def process_htm_files(folder_url):
    """
    Extracts ownership data from all .htm files in a given folder URL.
    """
    try:
        response = requests.get(folder_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to access folder: {folder_url} (Status Code: {response.status_code})")
            return []

        # Parse the folder listing
        soup = BeautifulSoup(response.text, "html.parser")
        links = soup.find_all("a", href=True)
        htm_files = [link['href'] for link in links if link['href'].endswith(".htm")]

        ownership_data = []
        for htm_file in htm_files:
            htm_url = folder_url + htm_file
            print(f"Processing file: {htm_url}")
            data = extract_data_from_htm(htm_url)
            if data:
                ownership_data.extend(data)

        return ownership_data

    except Exception as e:
        print(f"Error processing folder {folder_url}: {e}")
        return []

# Step 3: Extract Data from .htm File
def extract_data_from_htm(htm_url):
    """
    Parses an .htm file to extract ownership data.
    """
    try:
        response = requests.get(htm_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to download {htm_url} (Status Code: {response.status_code})")
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        ownership_data = []

        # Extract specific data fields (adjust selectors based on actual file structure)
        tables = soup.find_all("table")
        for table in tables:
            rows = table.find_all("tr")
            for row in rows:
                cols = row.find_all("td")
                cols = [col.text.strip() for col in cols]
                if len(cols) > 1:  # Adjust condition based on expected structure
                    ownership_data.append({
                        "Owner Name": cols[0],
                        "Shares Owned": cols[1],
                        "Percent Owned": cols[2] if len(cols) > 2 else "N/A",
                        "Source URL": htm_url
                    })

        return ownership_data

    except Exception as e:
        print(f"Error extracting data from {htm_url}: {e}")
        return []

# Step 4: Save Data to CSV
def save_to_csv(data, filename="ownership_data.csv"):
    """
    Save extracted data to a CSV file.
    """
    if data:
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No data to save.")

# Main Workflow
if __name__ == "__main__":
    # User-defined parameters
    company_name = "Moderna, Inc."
    cik = "1682852"
    start_year = 2020
    end_year = 2021
    form_types = ["SC 13G", "SC 13D", "SC 13G/A", "SC 13D/A"]  # Add more if needed

    print(f"Fetching filing folders for {company_name} (CIK: {cik}) from {start_year} to {end_year}...")
    filing_folders = fetch_filing_folders(cik, start_year, end_year, form_types)

    if filing_folders:
        print(f"Found {len(filing_folders)} filing folders. Processing...")
        all_ownership_data = []

        for folder in filing_folders:
            print(f"Processing folder: {folder}")
            ownership_data = process_htm_files(folder)
            if ownership_data:
                all_ownership_data.extend(ownership_data)

        # Save the data
        save_to_csv(all_ownership_data)
    else:
        print(f"No filing folders found for {company_name} in the specified period.")


Fetching filing folders for Moderna, Inc. (CIK: 1682852) from 2020 to 2021...
Found 8 filing folders. Processing...
Processing folder: https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698/
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698//index.htm
Failed to download https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698//index.htm (Status Code: 404)
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698//search/search.htm
Failed to download https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698//search/search.htm (Status Code: 404)
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698//investor/brokers.htm
Failed to download https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698//investor/brokers.htm (Status Code: 404)
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698//edgar/quickedgar.htm
Failed to download h

In [11]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Constants
HEADERS = {"User-Agent": "OwnershipDataScraper/1.0 (contact: example@example.com)"}
BASE_URL = "https://data.sec.gov/submissions/CIK{cik}.json"
DOWNLOAD_FOLDER = "downloaded_txt_files"

# Ensure the download folder exists
if not os.path.exists(DOWNLOAD_FOLDER):
    os.makedirs(DOWNLOAD_FOLDER)

# Utility: Clean up file name
def clean_file_name(file_name):
    """
    Cleans up the file name by removing any `/Archives/.../` prefix,
    leaving only the file name itself.
    """
    if "/Archives/" in file_name:
        return file_name.split("/")[-1]  # Keep only the last part (actual file name)
    return file_name

# Step 1: Fetch Filing Folders
def fetch_filing_folders(cik, start_year, end_year, form_types):
    """
    Fetches filing folder URLs for a given CIK, year range, and form types.
    """
    url = BASE_URL.format(cik=cik.zfill(10))
    response = requests.get(url, headers=HEADERS)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch filings for CIK {cik}. Status code: {response.status_code}")

    data = response.json()
    filings = data.get("filings", {}).get("recent", {})
    form_types_list = filings.get("form", [])
    accession_numbers = filings.get("accessionNumber", [])
    filing_dates = filings.get("filingDate", [])

    folders = []
    for form, acc_no, filing_date in zip(form_types_list, accession_numbers, filing_dates):
        if form in form_types:
            year = int(filing_date.split("-")[0])
            if start_year <= year <= end_year:
                formatted_acc_no = acc_no.replace("-", "")
                folder_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{formatted_acc_no}/"
                folders.append(folder_url)
    return folders

# Step 2: Find and Process All .txt Files
def process_txt_files(folder_url):
    """
    Identifies and processes all .txt files in the given folder URL to extract ownership data.
    """
    try:
        response = requests.get(folder_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to access folder: {folder_url} (Status Code: {response.status_code})")
            return []

        # Parse the folder listing
        soup = BeautifulSoup(response.text, "html.parser")
        links = soup.find_all("a", href=True)

        # Filter for .txt files
        txt_files = [link['href'] for link in links if link['href'].endswith(".txt")]

        ownership_data = []
        for txt_file in txt_files:
            # Clean the file name and construct the full URL
            clean_name = clean_file_name(txt_file)
            txt_url = f"{folder_url.rstrip('/')}/{clean_name}"
            print(f"Processing file: {txt_url}")

            # Validate file content before processing
            if is_target_form(txt_url):
                data = extract_data_from_txt(txt_url)
                if data:
                    ownership_data.extend(data)
            else:
                print(f"Skipped file: {txt_url} (Not a target form)")

        return ownership_data

    except Exception as e:
        print(f"Error processing folder {folder_url}: {e}")
        return []

# New Helper Function: Validate if File Contains Target Form
def is_target_form(file_url):
    """
    Checks if the file contains the target form type (e.g., SC 13G, SC 13D).
    """
    try:
        response = requests.get(file_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to download {file_url} for validation (Status Code: {response.status_code})")
            return False

        content = response.text
        # Check for target form types
        target_forms = ["SC 13G", "SC 13D", "SC 13G/A", "SC 13D/A"]
        return any(form in content for form in target_forms)

    except Exception as e:
        print(f"Error validating file {file_url}: {e}")
        return False

# Step 3: Extract Data from .txt File
def extract_data_from_txt(file_url):
    """
    Parses a .txt file to extract ownership data.
    """
    try:
        response = requests.get(file_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to download {file_url} (Status Code: {response.status_code})")
            return []

        content = response.text
        ownership_data = []

        # Split content into lines
        lines = content.splitlines()
        
        # Variables to track extracted data
        current_owner = None
        current_shares = None
        current_percent = None

        # Iterate through lines to find relevant data
        for line in lines:
            line = line.strip()

            # Check for patterns in the file (adjust these based on actual file content)
            if "Name of Owner" in line:
                current_owner = line.split(":")[-1].strip()
            elif "Number of Shares" in line or "Shares Owned" in line:
                current_shares = line.split(":")[-1].strip()
            elif "Percent of Class" in line or "Ownership Percentage" in line:
                current_percent = line.split(":")[-1].strip()

            # If all fields are found, add them to the data list
            if current_owner and current_shares and current_percent:
                ownership_data.append({
                    "Owner Name": current_owner,
                    "Shares Owned": current_shares,
                    "Percent Owned": current_percent,
                    "Source URL": file_url
                })

                # Reset variables for the next record
                current_owner = None
                current_shares = None
                current_percent = None

        # Return the extracted ownership data
        return ownership_data

    except Exception as e:
        print(f"Error extracting data from {file_url}: {e}")
        return []

# Step 4: Save Data to CSV
def save_to_csv(data, filename="ownership_data.csv"):
    """
    Save extracted data to a CSV file.
    """
    if data:
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No data to save.")

# Main Workflow
if __name__ == "__main__":
    # User-defined parameters
    company_name = "Moderna, Inc."
    cik = "1682852"
    start_year = 2020
    end_year = 2021
    form_types = ["SC 13G", "SC 13D", "SC 13G/A", "SC 13D/A"]  # Add more if needed

    print(f"Fetching filing folders for {company_name} (CIK: {cik}) from {start_year} to {end_year}...")
    filing_folders = fetch_filing_folders(cik, start_year, end_year, form_types)

    if filing_folders:
        print(f"Found {len(filing_folders)} filing folders. Processing...")
        all_ownership_data = []

        for folder in filing_folders:
            print(f"Processing folder: {folder}")
            ownership_data = process_txt_files(folder)
            if ownership_data:
                all_ownership_data.extend(ownership_data)

        # Save the data
        save_to_csv(all_ownership_data)
    else:
        print(f"No filing folders found for {company_name} in the specified period.")


Fetching filing folders for Moderna, Inc. (CIK: 1682852) from 2020 to 2021...
Found 8 filing folders. Processing...
Processing folder: https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698/
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698/0000834237-21-008698.txt
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698/us60770k1079_071021.txt
Processing folder: https://www.sec.gov/Archives/edgar/data/1682852/000108887521000063/
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000108887521000063/0001088875-21-000063.txt
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000108887521000063/Moderna26022021.txt
Processing folder: https://www.sec.gov/Archives/edgar/data/1682852/000119312521045149/
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000119312521045149/0001193125-21-045149.txt
Processing folder: https://www.sec.gov/Archives/edgar/data/1682852/00011931252104000

In [12]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Constants
HEADERS = {"User-Agent": "OwnershipDataScraper/1.0 (contact: example@example.com)"}
BASE_URL = "https://data.sec.gov/submissions/CIK{cik}.json"
LINKS_FILE = "file_links.txt"
OUTPUT_FILE = "ownership_data.csv"

# Utility: Save links to a file
def save_links(links, filename=LINKS_FILE):
    """
    Save a list of links to a text file.
    """
    with open(filename, "w") as f:
        f.write("\n".join(links))
    print(f"Links saved to {filename}")

# Utility: Load links from a file
def load_links(filename=LINKS_FILE):
    """
    Load a list of links from a text file.
    """
    if os.path.exists(filename):
        with open(filename, "r") as f:
            links = f.read().splitlines()
        print(f"Loaded {len(links)} links from {filename}")
        return links
    return []

# Utility: Clean up file name
def clean_file_name(file_name):
    """
    Cleans up the file name by removing any `/Archives/.../` prefix,
    leaving only the file name itself.
    """
    if "/Archives/" in file_name:
        return file_name.split("/")[-1]  # Keep only the last part (actual file name)
    return file_name

# Step 1: Fetch Filing Folders
def fetch_filing_folders(cik, start_year, end_year, form_types):
    """
    Fetches filing folder URLs for a given CIK, year range, and form types.
    """
    url = BASE_URL.format(cik=cik.zfill(10))
    response = requests.get(url, headers=HEADERS)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch filings for CIK {cik}. Status code: {response.status_code}")

    data = response.json()
    filings = data.get("filings", {}).get("recent", {})
    form_types_list = filings.get("form", [])
    accession_numbers = filings.get("accessionNumber", [])
    filing_dates = filings.get("filingDate", [])

    folders = []
    for form, acc_no, filing_date in zip(form_types_list, accession_numbers, filing_dates):
        if form in form_types:
            year = int(filing_date.split("-")[0])
            if start_year <= year <= end_year:
                formatted_acc_no = acc_no.replace("-", "")
                folder_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{formatted_acc_no}/"
                folders.append(folder_url)
    return folders

# Step 2: Extract All Links from Folders
def extract_links_from_folders(filing_folders):
    """
    Extract all .txt file links from the provided filing folders.
    """
    all_links = []
    for folder_url in filing_folders:
        try:
            response = requests.get(folder_url, headers=HEADERS)
            if response.status_code != 200:
                print(f"Failed to access folder: {folder_url} (Status Code: {response.status_code})")
                continue

            # Parse the folder listing
            soup = BeautifulSoup(response.text, "html.parser")
            links = soup.find_all("a", href=True)

            # Filter for .txt files and construct full URLs
            for link in links:
                if link['href'].endswith(".txt"):
                    clean_name = clean_file_name(link['href'])
                    txt_url = f"{folder_url.rstrip('/')}/{clean_name}"
                    all_links.append(txt_url)

        except Exception as e:
            print(f"Error processing folder {folder_url}: {e}")
            continue

    print(f"Extracted {len(all_links)} links.")
    return all_links

# Step 3: Extract Data from a Single .txt File
def extract_data_from_txt(file_url):
    """
    Parses a .txt file to extract ownership data.
    """
    try:
        response = requests.get(file_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to download {file_url} (Status Code: {response.status_code})")
            return []

        content = response.text
        ownership_data = []

        # Extract fields using regular expressions (adjust as needed)
        name_match = re.search(r"Name of Reporting Person.*?:\s*(.+)", content, re.IGNORECASE)
        amount_match = re.search(r"Amount beneficially owned.*?:\s*([\d,]+)", content, re.IGNORECASE)
        percent_match = re.search(r"Percent of class.*?:\s*([\d.]+)%?", content, re.IGNORECASE)

        reporting_person = name_match.group(1).strip() if name_match else "Not Found"
        beneficially_owned = amount_match.group(1).replace(",", "") if amount_match else "Not Found"
        percent_of_class = percent_match.group(1) if percent_match else "Not Found"

        ownership_data.append({
            "Name of Reporting Person": reporting_person,
            "Amount Beneficially Owned": beneficially_owned,
            "Percent of Class": percent_of_class,
            "Source URL": file_url
        })

        return ownership_data

    except Exception as e:
        print(f"Error extracting data from {file_url}: {e}")
        return []

# Step 4: Save Data to CSV
def save_to_csv(data, filename=OUTPUT_FILE):
    """
    Save extracted data to a CSV file.
    """
    if data:
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No data to save.")

# Main Workflow
if __name__ == "__main__":
    # User-defined parameters
    company_name = "Moderna, Inc."
    cik = "1682852"
    start_year = 2020
    end_year = 2021
    form_types = ["SC 13G", "SC 13D", "SC 13G/A", "SC 13D/A"]  # Add more if needed

    # Load existing links if available
    file_links = load_links()

    if not file_links:
        # Fetch filing folders and extract links if no saved links exist
        print(f"Fetching filing folders for {company_name} (CIK: {cik}) from {start_year} to {end_year}...")
        filing_folders = fetch_filing_folders(cik, start_year, end_year, form_types)

        if filing_folders:
            print(f"Found {len(filing_folders)} filing folders. Extracting links...")
            file_links = extract_links_from_folders(filing_folders)
            save_links(file_links)
        else:
            print(f"No filing folders found for {company_name} in the specified period.")
            exit()

    # Extract data from the saved links
    print("Processing links...")
    all_ownership_data = []
    for file_url in file_links:
        ownership_data = extract_data_from_txt(file_url)
        if ownership_data:
            all_ownership_data.extend(ownership_data)

    # Save the extracted data to a CSV
    save_to_csv(all_ownership_data)


Fetching filing folders for Moderna, Inc. (CIK: 1682852) from 2020 to 2021...
Found 8 filing folders. Extracting links...
Extracted 12 links.
Links saved to file_links.txt
Processing links...
Error extracting data from https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698/0000834237-21-008698.txt: name 're' is not defined
Error extracting data from https://www.sec.gov/Archives/edgar/data/1682852/000083423721008698/us60770k1079_071021.txt: name 're' is not defined
Error extracting data from https://www.sec.gov/Archives/edgar/data/1682852/000108887521000063/0001088875-21-000063.txt: name 're' is not defined
Error extracting data from https://www.sec.gov/Archives/edgar/data/1682852/000108887521000063/Moderna26022021.txt: name 're' is not defined
Error extracting data from https://www.sec.gov/Archives/edgar/data/1682852/000119312521045149/0001193125-21-045149.txt: name 're' is not defined
Error extracting data from https://www.sec.gov/Archives/edgar/data/1682852/00011931252104

In [13]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Constants
HEADERS = {"User-Agent": "OwnershipDataScraper/1.0 (contact: example@example.com)"}
LINKS_FILE = "file_links.txt"
OUTPUT_FILE = "ownership_data.csv"

# Utility: Load links from a file
def load_links(filename=LINKS_FILE):
    """
    Load a list of links from a text file.
    """
    if os.path.exists(filename):
        with open(filename, "r") as f:
            links = f.read().splitlines()
        print(f"Loaded {len(links)} links from {filename}")
        return links
    return []

# Utility: Save data to a CSV file
def save_to_csv(data, filename=OUTPUT_FILE):
    """
    Save extracted data to a CSV file.
    """
    if data:
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No data to save.")

# Function to extract data from a single .txt file
def extract_data_from_txt(file_url):
    """
    Parses a .txt file to extract ownership data.
    """
    try:
        response = requests.get(file_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to download {file_url} (Status Code: {response.status_code})")
            return []

        content = response.text
        ownership_data = []

        # Extract fields using regular expressions
        name_match = re.search(r"Name of Reporting Person.*?:\s*(.+)", content, re.IGNORECASE)
        amount_match = re.search(r"Amount beneficially owned.*?:\s*([\d,]+)", content, re.IGNORECASE)
        percent_match = re.search(r"Percent of class.*?:\s*([\d.]+)%?", content, re.IGNORECASE)

        reporting_person = name_match.group(1).strip() if name_match else "Not Found"
        beneficially_owned = amount_match.group(1).replace(",", "") if amount_match else "Not Found"
        percent_of_class = percent_match.group(1) if percent_match else "Not Found"

        ownership_data.append({
            "Name of Reporting Person": reporting_person,
            "Amount Beneficially Owned": beneficially_owned,
            "Percent of Class": percent_of_class,
            "Source URL": file_url
        })

        return ownership_data

    except Exception as e:
        print(f"Error extracting data from {file_url}: {e}")
        return []

# Main Workflow
if __name__ == "__main__":
    # Load links from the saved file
    file_links = load_links()

    if not file_links:
        print(f"No links found in {LINKS_FILE}. Please extract the links first.")
        exit()

    # Process the saved links
    print("Processing links...")
    all_ownership_data = []
    for file_url in file_links:
        ownership_data = extract_data_from_txt(file_url)
        if ownership_data:
            all_ownership_data.extend(ownership_data)

    # Save the extracted data to a CSV
    save_to_csv(all_ownership_data)


Loaded 12 links from file_links.txt
Processing links...
Data saved to ownership_data.csv


In [16]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Constants
HEADERS = {"User-Agent": "OwnershipDataScraper/1.0 (contact: example@example.com)"}
LINKS_FILE = "file_links.txt"
OUTPUT_FILE = "ownership_data.csv"

# Utility: Load links from a file
def load_links(filename=LINKS_FILE):
    """
    Load a list of links from a text file.
    """
    if os.path.exists(filename):
        with open(filename, "r") as f:
            links = f.read().splitlines()
        print(f"Loaded {len(links)} links from {filename}")
        return links
    return []

# Utility: Save data to a CSV file
def save_to_csv(data, filename=OUTPUT_FILE):
    """
    Save extracted data to a CSV file.
    """
    if data:
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No data to save.")

# Function to extract data from a single .txt file
def extract_data_from_txt(file_url):
    """
    Parses a .txt file to extract ownership data, handling multi-line structures and variations.
    """
    try:
        response = requests.get(file_url, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to download {file_url} (Status Code: {response.status_code})")
            return []

        content = response.text

        # Normalize the content (collapse multi-line sections and whitespace)
        normalized_content = re.sub(r"\s+", " ", content)

        ownership_data = []

        # Extract 'Name of Reporting Person'
        name_match = re.search(r"(Name of Reporting Person|Name of Filer|Reporting Person).*?:\s*(.+?)(?=\s+(Item|Amount|Percent|$))", normalized_content, re.IGNORECASE)
        reporting_person = name_match.group(2).strip() if name_match else "Not Found"

        # Extract 'Amount Beneficially Owned'
        amount_match = re.search(r"(Amount beneficially owned|Number of Shares).*?:\s*([\d,]+)", normalized_content, re.IGNORECASE)
        beneficially_owned = amount_match.group(2).replace(",", "") if amount_match else "Not Found"

        # Extract 'Percent of Class'
        percent_match = re.search(r"(Percent of class|Percentage of Class).*?:\s*([\d.]+)%?", normalized_content, re.IGNORECASE)
        percent_of_class = percent_match.group(2) if percent_match else "Not Found"

        # Extract 'Voting Power'
        sole_voting_match = re.search(r"(sole power to vote|sole voting power).*?:\s*([\d,]+)", normalized_content, re.IGNORECASE)
        shared_voting_match = re.search(r"(shared power to vote|shared voting power).*?:\s*([\d,]+)", normalized_content, re.IGNORECASE)
        sole_dispose_match = re.search(r"(sole power to dispose|sole dispositive power).*?:\s*([\d,]+)", normalized_content, re.IGNORECASE)
        shared_dispose_match = re.search(r"(shared power to dispose|shared dispositive power).*?:\s*([\d,]+)", normalized_content, re.IGNORECASE)

        sole_voting_power = sole_voting_match.group(2).replace(",", "") if sole_voting_match else "Not Found"
        shared_voting_power = shared_voting_match.group(2).replace(",", "") if shared_voting_match else "Not Found"
        sole_dispose_power = sole_dispose_match.group(2).replace(",", "") if sole_dispose_match else "Not Found"
        shared_dispose_power = shared_dispose_match.group(2).replace(",", "") if shared_dispose_match else "Not Found"

        # Append the extracted data
        ownership_data.append({
            "Name of Reporting Person": reporting_person,
            "Amount Beneficially Owned": beneficially_owned,
            "Percent of Class": percent_of_class,
            "Sole Voting Power": sole_voting_power,
            "Shared Voting Power": shared_voting_power,
            "Sole Dispositive Power": sole_dispose_power,
            "Shared Dispositive Power": shared_dispose_power,
            "Source URL": file_url
        })

        return ownership_data

    except Exception as e:
        print(f"Error extracting data from {file_url}: {e}")
        return []

# Main Workflow
if __name__ == "__main__":
    # Load links from the saved file
    file_links = load_links()

    if not file_links:
        print(f"No links found in {LINKS_FILE}. Please extract the links first.")
        exit()

    # Process the saved links
    print("Processing links...")
    all_ownership_data = []
    for file_url in file_links:
        ownership_data = extract_data_from_txt(file_url)
        if ownership_data:
            all_ownership_data.extend(ownership_data)

    # Save the extracted data to a CSV
    save_to_csv(all_ownership_data)


Loaded 12 links from file_links.txt
Processing links...
Data saved to ownership_data.csv
