In [20]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup


In [28]:
class SECDataManager:
    """Locate, download and slice SEC EDGAR filings for a single company.

    The usual flow is ticker -> CIK -> recent filings -> accession number ->
    full-submission ``.txt`` URL -> raw text -> ``10-K`` document. Each step
    stores its result on the instance and runs the preceding step on demand
    when that result is still missing.
    """

    # Seconds to wait for an HTTP response; without it requests waits indefinitely.
    REQUEST_TIMEOUT = 30

    # Delimiters of each embedded document inside a full-submission text file.
    DOC_START_PATTERN = re.compile(r'<DOCUMENT>')
    DOC_END_PATTERN = re.compile(r'</DOCUMENT>')
    # A <TYPE> tag followed by every character up to the end of the line.
    TYPE_PATTERN = re.compile(r'<TYPE>[^\n]+')
    # Headings for Items 1A, 1B, 7, 7A and 8: either ">Item" + whitespace/&#160;/&nbsp;
    # + number + optional period, or upper-case "ITEM" + whitespace + number.
    ITEM_PATTERN = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

    def __init__(self,
                 filepath="ticker.txt",
                 headers=None,
                 debug=False,
                 ticker=None,
                 company_cik=None,
                 company_filings=None,
                 accession_number=None,
                 filing_section=None,
                 filing_options=None,
                 raw_filing=None):
        """Store the configuration and any already-known intermediate results.

        Args:
            filepath: Tab-separated file with one ``ticker<TAB>cik`` pair per line.
            headers: HTTP headers sent with every request. When omitted, a
                default ``User-Agent``/``From`` pair is used.
            debug: When true, intermediate results are printed.
            ticker: Ticker symbol of the company of interest.
            company_cik: CIK of the company, if already known.
            company_filings: DataFrame of filings, if already fetched.
            accession_number: Accession number of the filing to download.
            filing_section: Mapping of document type to document text.
            filing_options: Document descriptions to keep; defaults to ``['10-K']``.
            raw_filing: Raw filing text, if already downloaded.
        """
        self.filepath = filepath
        self.debug = debug
        self.ticker = ticker
        self.company_cik = company_cik
        self.company_filings = company_filings
        self.accession_number = accession_number
        self.filing_section = filing_section
        # A fresh list per instance: a list literal in the signature would be
        # shared (and mutated) across every instance that relies on the default.
        self.filing_options = ['10-K'] if filing_options is None else list(filing_options)
        self.raw_filing = raw_filing

        if headers:
            # Caller-supplied headers are honoured as given.
            self.headers = dict(headers)
        else:
            self.headers = {
                'User-Agent': 'evolvConsulting ward.rushton@evolvconsulting.com',
                'From': 'ward.rushton@evolvconsulting.com'
            }

    # Pre-process to get Retrieval URL
    def get_tickers_from_file(self):
        """Read the ticker file and return a DataFrame with ``Ticker`` and ``CIK`` columns.

        CIK values are left-padded with zeros to 10 characters.
        """
        df = pd.read_csv(self.filepath, sep='\t', names=["Ticker", "CIK"])
        # Normalize CIK values to 10 characters long per SEC guidelines
        df['CIK'] = df['CIK'].apply('{:0>10}'.format)
        if self.debug: print(df.head())
        return df

    def get_cik_from_ticker(self, ticker=None):
        """Look up the zero-padded CIK for a ticker in the ticker file.

        Args:
            ticker: Optional ticker symbol; when given it replaces ``self.ticker``.

        Returns:
            The CIK as a 10-character string. It is also stored on
            ``self.company_cik`` (and on ``self.cik`` for older callers).

        Raises:
            ValueError: If no ticker is available, or the ticker does not
                appear exactly once in the ticker file.
        """
        if ticker is not None:
            self.ticker = ticker
        if not self.ticker:
            raise ValueError("A ticker symbol is required to look up a CIK")

        # The comparison is done against the lower-cased symbol.
        company_ticker = self.ticker.lower()
        tickers_df = self.get_tickers_from_file()
        candidates = tickers_df.loc[tickers_df['Ticker'] == company_ticker, 'CIK']
        if len(candidates) != 1:
            raise ValueError(
                f"Expected exactly one CIK for ticker {self.ticker!r} in "
                f"{self.filepath!r}, found {len(candidates)}"
            )

        self.cik = candidates.iloc[0]
        self.company_cik = self.cik
        if self.debug: print("CIK:", self.cik)
        return self.cik

    def fetch_filings_list(self, filing_options=None):
        """Download the company's recent filings and keep the requested document types.

        Args:
            filing_options: Values of ``primaryDocDescription`` to keep;
                defaults to ``self.filing_options``.

        Returns:
            The filtered DataFrame, also stored on ``self.company_filings``.

        Raises:
            requests.HTTPError: If the submissions endpoint returns an error status.
        """
        if not filing_options:
            filing_options = self.filing_options

        if not self.company_cik:
            self.company_cik = self.get_cik_from_ticker()

        base_url = "https://data.sec.gov/submissions/CIK"
        full_url = base_url + self.company_cik + ".json"

        response = requests.get(full_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        filings_dict = response.json()['filings']['recent']
        filings_df = pd.DataFrame.from_dict(filings_dict)
        if self.debug: print("Filings DF:\n", filings_df.head())
        # Keep only the filings whose document description matches.
        # NOTE(review): the response also carries a `form` column that looks like
        # the form type proper - confirm which of the two is the intended filter.
        self.company_filings = filings_df[filings_df['primaryDocDescription'].isin(filing_options)]

        return self.company_filings

    def get_most_recent_accession_number(self, response_json=None):
        """Return the accession number in the first row of the filings table.

        Args:
            response_json: DataFrame with an ``accessionNumber`` column. When
                ``None`` the filings list is fetched first.

        Raises:
            IndexError: If the filings table has no rows.
        """
        # `is None` rather than truthiness: the truth value of a DataFrame is ambiguous.
        if response_json is None:
            response_json = self.fetch_filings_list()
            if self.debug: print(response_json)
        if len(response_json) == 0:
            raise IndexError("No filings match the requested filing options")
        self.accession_number = response_json['accessionNumber'].iloc[0]
        return self.accession_number

    def format_url_from_filings(self, accession_number=None):
        """Build the URL of the full-submission text file for a filing.

        Args:
            accession_number: Dashed accession number; when omitted, the first
                one in the filings table is used.

        Returns:
            ``<archive base>/<cik>/<accession without dashes>/<accession>.txt``.
        """
        if not accession_number:
            # get most recent accession number
            accession_number = self.get_most_recent_accession_number(response_json=self.company_filings)
        if not self.company_cik:
            # An explicit accession number skips the fetch above, so the CIK may
            # still be unresolved here.
            self.company_cik = self.get_cik_from_ticker()
        base_url = "https://www.sec.gov/Archives/edgar/data/"
        url = base_url + f"{self.company_cik}/{accession_number.replace('-','')}/{accession_number}.txt"
        if self.debug: print("URL: ", url)
        return url

    # Actually get the raw text filing from the formatted URL
    def retrieve_filing_from_url(self, access_url=None):
        """Download the raw filing text and store it on ``self.raw_filing``.

        Args:
            access_url: URL to download; when omitted it is derived from
                ``self.accession_number`` (or from the first listed filing).

        Raises:
            requests.HTTPError: If the server returns an error status.
        """
        if not access_url:
            access_url = self.format_url_from_filings(accession_number=self.accession_number)
        response = requests.get(access_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        if self.debug: print("Response: ", response.text[0:5000])
        self.raw_filing = response.text
        return self.raw_filing

    def save_filing_section(self):
        """Extract the ``10-K`` document from the raw filing text.

        Three parallel lists are built from the raw text: the ``.end()`` index
        of every ``<DOCUMENT>`` tag, the ``.start()`` index of every
        ``</DOCUMENT>`` tag, and the name that follows every ``<TYPE>`` tag.
        They are zipped together, and the text between the tags of each
        ``10-K`` document is stored under that key.

        Returns:
            The ``self.filing_section`` dict (empty when no ``10-K`` document is found).
        """
        if not self.raw_filing:
            self.retrieve_filing_from_url()

        raw = self.raw_filing
        doc_start_is = [m.end() for m in self.DOC_START_PATTERN.finditer(raw)]
        doc_end_is = [m.start() for m in self.DOC_END_PATTERN.finditer(raw)]
        # Drop the tag itself, plus any stray carriage return left on the line.
        doc_types = [m.group()[len('<TYPE>'):].strip() for m in self.TYPE_PATTERN.finditer(raw)]

        self.filing_section = {}
        # Go through each section type and save only the 10-K section in the dictionary
        for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
            if doc_type == '10-K':
                self.filing_section[doc_type] = raw[doc_start:doc_end]
        return self.filing_section

    def find_all_regex_matches(self):
        """Print and return every Item 1A/1B/7/7A/8 heading match in the 10-K document.

        Returns:
            A list of ``re.Match`` objects in document order.

        Raises:
            ValueError: If the filing contains no ``10-K`` document.
        """
        if not self.filing_section or '10-K' not in self.filing_section:
            self.save_filing_section()
        if '10-K' not in self.filing_section:
            raise ValueError("The filing does not contain a 10-K document")

        # Use finditer to match the regex
        matches = list(self.ITEM_PATTERN.finditer(self.filing_section['10-K']))
        for match in matches:
            print(match)
        return matches

    # Beautify the resulting HTML text into readable wording
    def parse_html_from_response_text(self, text_response, features="html.parser", match_group=0):
        """Strip the markup from a piece of filing HTML and return its text.

        Args:
            text_response: A regex match object (its ``match_group`` is parsed)
                or a plain HTML string.
            features: Parser name passed to BeautifulSoup.
            match_group: Group to read when ``text_response`` is a match object.

        Raises:
            ValueError: If ``text_response`` is ``None`` (e.g. a failed search).
        """
        if text_response is None:
            raise ValueError("No content to parse: text_response is None")
        if isinstance(text_response, str):
            markup = text_response
        else:
            markup = text_response.group(match_group)

        soup = BeautifulSoup(markup, features=features)
        return soup.get_text()
    


In [33]:
# Build a manager for the NVDA ticker and download the raw text of the filing it resolves to
data_manager = SECDataManager(debug=False, ticker='nvda')
filing_response = data_manager.retrieve_filing_from_url()

Apply REGEXes to find 10-K Section from the document

For our purposes, we are only interested in the sections that contain the 10-K information. All the sections, including the 10-K, are contained within the `<DOCUMENT>` and `</DOCUMENT>` tags. Each section within the document tags is clearly marked by a `<TYPE>` tag followed by the name of the section.


In [None]:
debug = False
# Initialize the SECDataManager object with the path to your ticker file.
# The company is chosen through the constructor: the lookup methods read the
# ticker and CIK from the instance rather than taking them as arguments.
data_manager = SECDataManager(filepath='ticker.txt', ticker='aapl', debug=debug)
tickers_df = data_manager.get_tickers_from_file()
company_cik = data_manager.get_cik_from_ticker()
filings_list = data_manager.fetch_filings_list(filing_options=['10-K'])
# The first row of the filtered table is treated as the most recent filing
latest_accession_number = filings_list['accessionNumber'].iloc[0]
filing_url = data_manager.format_url_from_filings(accession_number=latest_accession_number)

# Retrieve the filing content from the URL
# This will make an HTTP request, so ensure you have a valid URL and internet connection
filing_response = data_manager.retrieve_filing_from_url(filing_url)
# re.DOTALL lets `.` span line breaks, so the lazy match can run from the
# "Item 1A." heading up to (not including) the next "Item 2."
extracted_content = re.search(r"Item 1A\..*?(?=Item 2\.)", filing_response, flags=re.DOTALL)
if extracted_content is None:
    raise ValueError("Could not locate the Item 1A section in the filing text")

# Parse and clean HTML content from the filing's text response
cleaned_text = data_manager.parse_html_from_response_text(extracted_content)
print(cleaned_text[:2000])



In [5]:
# Regex to find <DOCUMENT> tags
doc_start_pattern = re.compile(r'<DOCUMENT>')
doc_end_pattern = re.compile(r'</DOCUMENT>')
# Regex to find <TYPE> tag prceeding any characters, terminating at new line
type_pattern = re.compile(r'<TYPE>[^\n]+')

Define Span Indices using REGEXes

Now that we have the regexes defined, we will use the `.finditer()` method to match the regexes in `raw_10k`. In the code below, we will create 3 lists:

1. A list that holds the `.end()` index of each match of `doc_start_pattern`

2. A list that holds the `.start()` index of each match of `doc_end_pattern`

3. A list that holds the name of section from each match of `type_pattern`

In [40]:
# The company is chosen through the constructor: the lookup methods read the
# ticker and CIK from the instance rather than taking them as arguments.
data_manager = SECDataManager(filepath='ticker.txt', ticker='banf', debug=True)
tickers_df = data_manager.get_tickers_from_file()
company_cik = data_manager.get_cik_from_ticker()
filings_list = data_manager.fetch_filings_list(filing_options=['10-K'])
# The first row of the filtered table is treated as the most recent filing
latest_accession_number = filings_list['accessionNumber'].iloc[0]
filing_url = data_manager.format_url_from_filings(accession_number=latest_accession_number)

# Retrieve the filing content from the URL
# This will make an HTTP request, so ensure you have a valid URL and internet connection
raw_10k = data_manager.retrieve_filing_from_url(filing_url)


  Ticker         CIK
0   aapl  0000320193
1   msft  0000789019
2  brk-b  0001067983
3    unh  0000731766
4    jnj  0000200406
  Ticker         CIK
0   aapl  0000320193
1   msft  0000789019
2  brk-b  0001067983
3    unh  0000731766
4    jnj  0000200406
CIK: 0000760498
Filings DF:
         accessionNumber  filingDate  reportDate        acceptanceDateTime act  \
0  0001593968-24-000451  2024-03-07  2024-03-07  2024-03-07T16:38:50.000Z       
1  0001593968-24-000411  2024-03-04  2024-03-04  2024-03-04T18:18:14.000Z       
2  0001316601-24-000001  2024-03-01              2024-03-01T12:12:31.000Z  33   
3  0001593968-24-000371  2024-02-29  2024-02-29  2024-02-29T12:47:40.000Z       
4  0001593968-24-000368  2024-02-27  2024-02-27  2024-02-27T16:55:18.000Z       

  form fileNumber filmNumber items  size  isXBRL  isInlineXBRL  \
0    4                              5739       0             0   
1    4                              8071       0             0   
2  144  000-14384   24708242      

Create a Dictionary for the 10-K

In the code below, we will create a dictionary which has the key `10-K` and as value the contents of the `10-K` section found above. To do this, we will create a loop to go through all the sections found above, and if the section type is `10-K`, save it to the dictionary. Use the indices in `doc_start_is` and `doc_end_is` to slice the `raw_10k` file.

In [22]:
# Build the three parallel lists the loop below relies on:
#   1. the `.end()` index of every <DOCUMENT> tag,
#   2. the `.start()` index of every </DOCUMENT> tag,
#   3. the section name that follows every <TYPE> tag.
doc_start_is = [found.end() for found in doc_start_pattern.finditer(raw_10k)]
doc_end_is = [found.start() for found in doc_end_pattern.finditer(raw_10k)]
# Drop the tag itself, plus any stray carriage return left on the line
doc_types = [tag[len('<TYPE>'):].strip() for tag in type_pattern.findall(raw_10k)]

document = {}

# Create a loop to go through each section type and save only the 10-K section in the dictionary
for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
    if doc_type == '10-K':
        document[doc_type] = raw_10k[doc_start:doc_end]

In [23]:
# Display the first 500 characters of the extracted 10-K document as a quick sanity check
document['10-K'][0:500]

'\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>aapl-20230930.htm\n<DESCRIPTION>10-K\n<TEXT>\n<XBRL>\n<?xml version="1.0" ?><!--XBRL Document Created with the Workiva Platform--><!--Copyright 2023 Workiva--><!--r:c68c2fd8-345e-4faa-8610-8b173d5da094,g:843081e3-ad04-4f75-a78c-e87ea3423788,d:1cb1ba018cb1455aa66bd3f9ab0c5b1a--><html xmlns:ix="http://www.xbrl.org/2013/inlineXBRL" xmlns:dei="http://xbrl.sec.gov/dei/2023" xmlns:ixt="http://www.xbrl.org/inlineXBRL/transformation/2020-02-12" xmlns:ixt-sec="http://www.sec'

In [24]:
# Pattern for the headings of Items 1A, 1B, 7, 7A and 8. Two spellings are accepted:
#   - ">Item" + (whitespace | &#160; | &nbsp;) + item number + optional period
#   - upper-case "ITEM" + whitespace + item number
regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

# Use finditer to match the regex; it yields match objects lazily and can be consumed only once
matches = regex.finditer(document['10-K'])

# Print each match object (its span and the matched text)
for match in matches:
    print(match)

<re.Match object; span=(141966, 141975), match='>Item 1A.'>
<re.Match object; span=(143369, 143378), match='>Item 1B.'>
<re.Match object; span=(153895, 153903), match='>Item 7.'>
<re.Match object; span=(155373, 155382), match='>Item 7A.'>
<re.Match object; span=(156819, 156827), match='>Item 8.'>
<re.Match object; span=(216277, 216286), match='>Item 1A.'>
<re.Match object; span=(310814, 310823), match='>Item 1B.'>
<re.Match object; span=(355524, 355532), match='>Item 7.'>
<re.Match object; span=(471091, 471100), match='>Item 7A.'>
<re.Match object; span=(482680, 482688), match='>Item 8.'>


In [25]:
# Run the search again: a finditer() iterator can only be consumed once
matches = regex.finditer(document['10-K'])

# Create the dataframe with one row per heading match. Naming the columns in the
# constructor keeps the frame well-formed even when nothing matched; assigning
# `.columns` afterwards raises a length-mismatch error on an empty frame.
test_df = pd.DataFrame(
    [(x.group(), x.start(), x.end()) for x in matches],
    columns=['item', 'start', 'end'],
)
# Lower-case the headings so both accepted spellings compare equal later on
test_df['item'] = test_df['item'].str.lower()

# Display the dataframe
test_df.head()

Unnamed: 0,item,start,end
0,>item 1a.,141966,141975
1,>item 1b.,143369,143378
2,>item 7.,153895,153903
3,>item 7a.,155373,155382
4,>item 8.,156819,156827


In [33]:
# Get rid of unnecessary characters in the item labels: the &#160; and &nbsp;
# entities, whitespace, periods and the leading '>'.
# A single raw-string pattern avoids the invalid '\.' escape of a plain string
# literal, and touching only the `item` column leaves `start`/`end` alone.
test_df['item'] = test_df['item'].str.replace(r'&#160;|&nbsp;|\s|\.|>', '', regex=True)

# display the dataframe
test_df.head(10)

Unnamed: 0,item,start,end
0,item1a,141966,141975
1,item1b,143369,143378
2,item7,153895,153903
3,item7a,155373,155382
4,item8,156819,156827
5,item1a,216277,216286
6,item1b,310814,310823
7,item7,355524,355532
8,item7a,471091,471100
9,item8,482680,482688


In [27]:
# Order the matches by position, then keep only the last occurrence of each item label
pos_dat = (
    test_df
    .sort_values(by='start')
    .drop_duplicates(subset='item', keep='last')
)

# Display the dataframe
pos_dat

Unnamed: 0,item,start,end
5,item1a,216277,216286
6,item1b,310814,310823
7,item7,355524,355532
8,item7a,471091,471100
9,item8,482680,482688


In [28]:
# Set item as the dataframe index.
# The guard keeps the cell re-runnable: once `item` has become the index it is
# no longer a column, and a second set_index('item') would raise a KeyError.
if 'item' in pos_dat.columns:
    pos_dat = pos_dat.set_index('item')

# display the dataframe
pos_dat

Unnamed: 0_level_0,start,end
item,Unnamed: 1_level_1,Unnamed: 2_level_1
item1a,216277,216286
item1b,310814,310823
item7,355524,355532
item7a,471091,471100
item8,482680,482688


In [29]:
"""
The above dataframe contains the starting and end index of each match for Items 1A, 7, and 7A. 
In the code below, we will save all the text from the starting index of `item1a` till the starting index
of `item1b` into a variable called `item_1a_raw`. 

Similarly, save all the text from the starting index of `item7` till the starting index of `item7a` 
into a variable called `item_7_raw`. Finally,  save all the text from the starting index of `item7a` 
till the starting of `item8` into a variable called `item_7a_raw`. We can accomplish all of this by making the
correct slices of `document['10-K']`.
"""
# Item 1A: from the start of its heading up to the start of the Item 1B heading
item_1a_raw = document['10-K'][pos_dat.loc['item1a', 'start']:pos_dat.loc['item1b', 'start']]

# Item 7: from the start of its heading up to the start of the Item 7A heading
item_7_raw = document['10-K'][pos_dat.loc['item7', 'start']:pos_dat.loc['item7a', 'start']]

# Item 7A: from the start of its heading up to the start of the Item 8 heading
item_7a_raw = document['10-K'][pos_dat.loc['item7a', 'start']:pos_dat.loc['item8', 'start']]

In [30]:
# Display the first 1000 characters of the raw (still HTML) Item 1A slice
item_1a_raw[0:1000]

'>Item 1A.&#160;&#160;&#160;&#160;Risk Factors</span></div><div style="margin-top:9pt;text-align:justify"><span style="color:#000000;font-family:\'Helvetica\',sans-serif;font-size:9pt;font-weight:400;line-height:120%">The Company&#8217;s business, reputation, results of operations, financial condition and stock price can be affected by a number of factors, whether currently known or unknown, including those described below. When any one or more of these risks materialize from time to time, the Company&#8217;s business, reputation, results of operations, financial condition and stock price can be materially and adversely affected.</span></div><div style="margin-top:12pt;text-align:justify"><span style="color:#000000;font-family:\'Helvetica\',sans-serif;font-size:9pt;font-weight:400;line-height:120%">Because of the following factors, as well as other factors affecting the Company&#8217;s results of operations and financial condition, past financial performance should not be considered to

In [34]:
# Beautify
# Parse the raw Item 1A slice into a BeautifulSoup object using the lxml parser
item_1a_content = BeautifulSoup(item_1a_raw, features='lxml')
# get_text() drops the HTML tags and keeps the content. The "\n\n" separator is
# optional: it puts a blank line between text fragments so the output reads more cleanly.
item_1a_text = item_1a_content.get_text(separator="\n\n")
print(item_1a_text)

>Item 1A.    Risk Factors

The Company’s business, reputation, results of operations, financial condition and stock price can be affected by a number of factors, whether currently known or unknown, including those described below. When any one or more of these risks materialize from time to time, the Company’s business, reputation, results of operations, financial condition and stock price can be materially and adversely affected.

Because of the following factors, as well as other factors affecting the Company’s results of operations and financial condition, past financial performance should not be considered to be a reliable indicator of future performance, and investors should not use historical trends to anticipate results or trends in future periods. This discussion of risk factors contains forward-looking statements.

This section should be read in conjunction with Part II, Item 7, “Management’s Discussion and Analysis of Financial Condition and Results of Operations” and the con