In [1]:
import os
from dotenv import load_dotenv
from sec_api import QueryApi, RenderApi
# Load variables from the .env file into the process environment
load_dotenv()

# API keys come from the environment; do not print or hard-code them
QUERY_API_KEY = os.environ.get("QUERY_API_KEY")
RENDER_API_KEY = os.environ.get("RENDER_API_KEY")

# One client per SEC-API service: search (query) and download (render)
queryApi = QueryApi(api_key=QUERY_API_KEY)
renderApi = RenderApi(api_key=RENDER_API_KEY)

# Sample search: Apple (AAPL) 10-Q filings
query = {
  "query": {
      "query_string": {
          "query": "formType:\"10-Q\" AND ticker:AAPL",  # 10-Q filings for AAPL only
      }
  },
  "from": "0",   # offset of the first matching filing to return
  "size": "10",  # number of filings requested
}

response = queryApi.get_filings(query)

In [None]:
# Base query template; the query string and the "from" offset are overwritten
# before every request in the loop below.
base_query = {
  "query": { 
      "query_string": { 
          "query": "PLACEHOLDER",  # Placeholder to be replaced by the specific query
          "time_zone": "America/New_York"
      } 
  },
  "from": "0",
  "size": "200",
  # sort returned filings by the filedAt key/value, newest first
  "sort": [{"filedAt": {"order": "desc"}}]
}

# Open the file we use to store the filing URLs.
# `with` guarantees the file is flushed and closed even if the loop is
# interrupted (e.g. KeyboardInterrupt, which `except Exception` does not catch).
# The file is opened in append mode, so re-running this cell appends the same
# URLs again.
with open("filing_urls_AAPL_10Q.txt", "a") as log_file:
    # Fetch filings for 2024 only (the upper bound of range() is exclusive)
    for year in range(2024, 2025, 1):
        print(f"Starting download for year {year}")

        for month in range(1, 13, 1):  # Iterate over each month
            # Construct search query for 10-Q filings within the given year-month for AAPL.
            # Day 31 is used as the upper bound for every month, including shorter ones.
            # NOTE(review): confirm the API keeps accepting dates such as 02-31.
            universe_query = (
                "formType:(\"10-Q\") AND " +
                "filedAt:[{year}-{month:02d}-01 TO {year}-{month:02d}-31] AND " +
                "ticker:AAPL"
            ).format(year=year, month=month)

            # Debugging: Print the query being generated
            print(f"Generated query: {universe_query}")

            # Update query template with specific query string
            base_query["query"]["query_string"]["query"] = universe_query

            # Pagination: Fetch results in batches of 200 (offsets 0 and 200)
            for from_batch in range(0, 400, 200):  # Update pagination start index
                base_query["from"] = from_batch

                # Debugging: Print the page we're fetching
                print(f"Fetching batch starting at index {from_batch}...")

                try:
                    response = queryApi.get_filings(base_query)

                    # Debugging: Print the raw response
                    print(f"Response received: {response}")

                    # Stop paging this month once a page comes back empty
                    if len(response["filings"]) == 0:
                        print("No more filings found.")
                        break

                    # Extract and store filing URLs
                    urls_list = [x["linkToFilingDetails"] for x in response["filings"]]

                    # Debugging: Print the number of URLs found in this batch
                    print(f"Found {len(urls_list)} URLs in this batch")

                    # Transform list of URLs into one string by joining all list elements
                    # and add a new-line character between each element.
                    urls_string = "\n".join(urls_list) + "\n"

                    # Write URLs to log file
                    log_file.write(urls_string)

                except Exception as e:
                    # Give up on the remaining pages of this month only; the
                    # outer loops continue with the next month.
                    print(f"Error during API request: {e}")
                    break

print("All AAPL URLs downloaded")


Starting download for year 2024
Generated query: formType:("10-Q") AND filedAt:[2024-01-01 TO 2024-01-31] AND ticker:AAPL
Fetching batch starting at index 0...
Response received: {'total': {'value': 0, 'relation': 'eq'}, 'query': {'from': 0, 'size': 200}, 'filings': []}
No more filings found.
Generated query: formType:("10-Q") AND filedAt:[2024-02-01 TO 2024-02-31] AND ticker:AAPL
Fetching batch starting at index 0...
Response received: {'total': {'value': 1, 'relation': 'eq'}, 'query': {'from': 0, 'size': 200}, 'filings': [{'ticker': 'AAPL', 'formType': '10-Q', 'accessionNo': '0000320193-24-000006', 'cik': '320193', 'companyNameLong': 'Apple Inc. (Filer)', 'companyName': 'Apple Inc.', 'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000006/aapl-20231230.htm', 'description': 'Form 10-Q - Quarterly report [Sections 13 or 15(d)]', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/320193/000032019324000006/0000320193-24-000006.txt', 'filedAt': '2024-0

In [3]:
import multiprocessing

# Download 10-Q filings from stored URLs
def download_filing(url):
  """
  Downloads a 10-Q filing from SEC-API and saves it to the 'filings' folder.

  Args:
    url: Filing URL. Its last two path segments are joined with "-" to form
      the output file name.

  The './filings' folder must already exist; download_all_filings() creates it
  before calling this function.
  """
  filing = renderApi.get_filing(url)
  url_parts = url.split("/")
  file_name = url_parts[-2] + "-" + url_parts[-1] # Generate filename
  download_to = "./filings/" + file_name

  # Write as UTF-8 explicitly: the text-extraction step reads these files back
  # with encoding="utf-8", and the platform default encoding may differ.
  with open(download_to, "w", encoding="utf-8") as f:
    f.write(filing) # Save filing content to file


# load URLs from log file
def load_urls(path="filing_urls_AAPL_10Q.txt"):
  """
  Loads filing URLs from the log file.

  Args:
    path: Text file containing one URL per line. Defaults to the log file
      written by the URL-collection cell.

  Returns:
    A list of URLs in file order. Surrounding whitespace is stripped and blank
    lines are skipped, so the newline that terminates the file does not
    produce an empty entry.
  """
  # `with` closes the file even if reading fails
  with open(path, "r") as log_file:
    stripped_lines = (line.strip() for line in log_file)
    return [url for url in stripped_lines if url]

def download_all_filings(max_filings=40):
  """
  Downloads filings in parallel using multiprocessing.

  Args:
    max_filings: Maximum number of URLs (taken from the start of the log file)
      to download. Defaults to 40 for testing; pass None to download all.

  NOTE(review): multiprocessing.Pool has to hand download_filing to worker
  processes; confirm this works on your platform when the function is defined
  in a notebook cell.
  """
  print("Start downloading all filings")

  # Create download folder if it doesn't exist
  download_folder = "./filings" 
  os.makedirs(download_folder, exist_ok=True)

  # Load filing URLs, ignoring blank entries, and keep the first `max_filings`.
  # Slicing from index 0 includes the first URL in the log file.
  urls = [url for url in load_urls() if url.strip()]
  if max_filings is not None:
    urls = urls[:max_filings]
  print("{length} filing URLs loaded".format(length=len(urls)))

  if urls:
    # Number of parallel processes for downloading: at most 20, and never
    # more than there are URLs to fetch
    number_of_processes = min(20, len(urls))

    with multiprocessing.Pool(number_of_processes) as pool:
      pool.map(download_filing, urls)
  
  print("All filings downloaded")

In [None]:
# Execute download process: fetch the filings listed in the URL log file
# and save them under ./filings
download_all_filings()

In [5]:
from bs4 import BeautifulSoup
# Directory containing the downloaded .htm filings
FILINGS_DIR = "filings"
# Directory the extracted plain-text files are written to
EXTRACTED_TEXT_DIR = "apple_filings_text_10Q"

# Ensure output directory exists; exist_ok avoids a separate check-then-create step
os.makedirs(EXTRACTED_TEXT_DIR, exist_ok=True)

def extract_text_from_html(file_path):
    """
    Returns the text content of an HTML (.htm) filing.

    The file is read as UTF-8 and parsed with BeautifulSoup's "html.parser".
    <script> and <style> elements are removed, then the remaining text pieces
    are stripped and joined with newlines.
    """
    with open(file_path, "r", encoding="utf-8") as html_file:
        document = BeautifulSoup(html_file, "html.parser")

    # Drop script and style elements so their contents are not part of the text
    for tag in document.find_all(["script", "style"]):
        tag.extract()

    return document.get_text(separator="\n", strip=True)

# Process all .htm files in the folder
for filename in os.listdir(FILINGS_DIR):
    if not filename.endswith(".htm"):
        continue

    file_path = os.path.join(FILINGS_DIR, filename)
    extracted_text = extract_text_from_html(file_path)

    # Save extracted text to a .txt file. Only the trailing extension is
    # swapped; str.replace would also rewrite any earlier ".htm" in the name.
    output_name = os.path.splitext(filename)[0] + ".txt"
    output_file_path = os.path.join(EXTRACTED_TEXT_DIR, output_name)
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(extracted_text)

    print(f"Extracted text from {filename} and saved to {output_file_path}")

# Name the directory the files were actually written to
print(f"\n All HTML filings processed. Extracted text saved in `{EXTRACTED_TEXT_DIR}/`.")


Extracted text from 000032019324000123-aapl-20240928.htm and saved to apple_filings_text_10Q/000032019324000123-aapl-20240928.txt
Extracted text from 000032019324000069-aapl-20240330.htm and saved to apple_filings_text_10Q/000032019324000069-aapl-20240330.txt
Extracted text from 000032019324000081-aapl-20240629.htm and saved to apple_filings_text_10Q/000032019324000081-aapl-20240629.txt

 All HTML filings processed. Extracted text saved in `filings_text/`.
