In [2]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Constants
HEADERS = {"User-Agent": "OwnershipDataScraper/1.0 (contact: example@example.com)"}
BASE_URL = "https://data.sec.gov/submissions/CIK{cik}.json"
DOWNLOAD_FOLDER = "downloaded_files"

# Ensure the download folder exists
if not os.path.exists(DOWNLOAD_FOLDER):
    os.makedirs(DOWNLOAD_FOLDER)

# Step 1: Fetch Filing URLs
def fetch_filing_urls(cik, start_year, end_year, form_type="4"):
    """
    Fetches filing URLs for a given CIK, year range, and form type.
    """
    url = BASE_URL.format(cik=cik.zfill(10))
    response = requests.get(url, headers=HEADERS)
    
    if response.status_code != 200:
        raise Exception(f"Failed to fetch filings for CIK {cik}. Status code: {response.status_code}")

    data = response.json()
    filings = data.get("filings", {}).get("recent", {})
    form_types = filings.get("form", [])
    accession_numbers = filings.get("accessionNumber", [])
    filing_dates = filings.get("filingDate", [])

    links = []
    for form, acc_no, filing_date in zip(form_types, accession_numbers, filing_dates):
        if form == form_type:
            year = int(filing_date.split("-")[0])
            if start_year <= year <= end_year:
                formatted_acc_no = acc_no.replace("-", "")
                link = f"https://www.sec.gov/Archives/edgar/data/{cik}/{formatted_acc_no}/form4.xml"
                links.append(link)
    return links

# Step 2: Download and Save Files
def download_file(url, folder):
    """
    Downloads a file from the given URL and saves it to the specified folder.
    """
    try:
        file_name = url.split("/")[-1]
        file_path = os.path.join(folder, file_name)
        response = requests.get(url, headers=HEADERS)

        if response.status_code == 200:
            with open(file_path, "wb") as file:
                file.write(response.content)
            print(f"File downloaded and saved: {file_path}")
            return file_path
        else:
            print(f"Failed to download {url}. Status Code: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error downloading file {url}: {e}")
        return None

# Step 3: Parse Ownership Data from Individual XML File
def parse_ownership_data_from_xml(file_path):
    """
    Parse ownership data from a downloaded XML file.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        soup = BeautifulSoup(content, "xml")
        ownership_data = []

        # Reporting Owner Information
        reporting_owner = soup.find("reportingOwner")
        if reporting_owner:
            owner_name = reporting_owner.find("rptOwnerName").text if reporting_owner.find("rptOwnerName") else "Unknown"
            shares_owned_post = reporting_owner.find("sharesOwnedFollowingTransaction")

            # Derivative Holdings (if applicable)
            transactions = soup.find_all("derivativeTransaction")
            for transaction in transactions:
                security_title = transaction.find("value").text if transaction.find("value") else "Unknown"
                transaction_date = transaction.find("transactionDate").find("value").text if transaction.find("transactionDate") else "Unknown"
                
                # Handle missing or invalid numeric values
                try:
                    transaction_shares = int(transaction.find("transactionShares").find("value").text) if transaction.find("transactionShares") else 0
                except ValueError:
                    transaction_shares = 0

                try:
                    post_transaction_shares = int(shares_owned_post.text) if shares_owned_post else 0
                except ValueError:
                    post_transaction_shares = 0

                ownership_type = transaction.find("directOrIndirectOwnership").find("value").text if transaction.find("directOrIndirectOwnership") else "Unknown"

                ownership_data.append({
                    "Owner Name": owner_name,
                    "Security Title": security_title,
                    "Transaction Date": transaction_date,
                    "Transaction Shares": transaction_shares,
                    "Post-Transaction Shares": post_transaction_shares,
                    "Ownership Type": ownership_type
                })

        return ownership_data

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return None

# Step 4: Save Data to CSV
def save_to_csv(data, filename="ownership_data.csv"):
    """
    Save extracted data to a CSV file.
    """
    if data:
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No data to save.")

# Main Workflow
if __name__ == "__main__":
    # User-defined parameters
    company_name = "Moderna, Inc."  # Specify the company name
    cik = "1682852"  # CIK for Moderna
    start_year = 2020  # Start year for filings
    end_year = 2021  # End year for filings

    print(f"Fetching filings for {company_name} (CIK: {cik}) from {start_year} to {end_year}...")
    filing_urls = fetch_filing_urls(cik, start_year, end_year, form_type="4")

    if filing_urls:
        print(f"Found {len(filing_urls)} filings. Processing...")
        all_ownership_data = []

        for url in filing_urls:
            print(f"Processing file: {url}")
            file_path = download_file(url, DOWNLOAD_FOLDER)
            if file_path:
                ownership_data = parse_ownership_data_from_xml(file_path)
                if ownership_data:
                    all_ownership_data.extend(ownership_data)

        # Save the data
        save_to_csv(all_ownership_data)
    else:
        print(f"No filings found for {company_name} in the specified period.")


Fetching filings for Moderna, Inc. (CIK: 1682852) from 2020 to 2021...
Found 366 filings. Processing...
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000112760221032072/form4.xml
File downloaded and saved: downloaded_files/form4.xml
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000112760221032070/form4.xml
File downloaded and saved: downloaded_files/form4.xml
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000112760221031990/form4.xml
File downloaded and saved: downloaded_files/form4.xml
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000112760221031988/form4.xml
File downloaded and saved: downloaded_files/form4.xml
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000112760221031926/form4.xml
File downloaded and saved: downloaded_files/form4.xml
Processing file: https://www.sec.gov/Archives/edgar/data/1682852/000112760221031923/form4.xml
File downloaded and saved: downloaded_files/form4.xml
Processi