In [79]:
# Imports
import pandas as pd
import requests
import re

In [103]:
"""
Flow is as follows:
1. ticker.txt is read into a pandas DataFrame to build the ticker-to-CIK lookup table
2. PublicCompany is the class a user calls. It takes a ticker in __init__ and maps it to the company's CIK
3. FilingRequester manages the request headers and URL assembly to find and download the most recent 10-K
4. PublicFiling is created with the requested filing and can build a parsed version from the raw filing
5. PublicFiling finds the relevant Item 1A section with a regex and returns its text
6. SECDataManager is intended to orchestrate threaded calls for multiple tickers, plus the aggregation and output of those calls
"""
class SECDataManager:
    """
    Placeholder for the orchestration layer.

    The intent is to run threaded calls while staying under the 10 calls/s max of the
    SEC Edgar website. No orchestration logic exists yet: instances carry no state.
    """
    def __init__(self):
        """Create an empty manager; there is nothing to initialise yet."""

    
class PublicCompany:
    """
    A PublicCompany is the entity that represents a publicly-traded company.
    A public company will most likely have multiple public filings, so each filing
    is represented by its own PublicFiling object.
    For the 10-K search case, the only PublicFiling created is the most recent 10-K
    filing, stored on ``most_recent_10k``.

    In the future, this class can be extended to managing filings based on the accession number / URL

    Parameters:
        ticker: Ticker symbol used to look up the CIK (case-insensitive).
            Required when ``company_cik`` is not given.
        debug: Flag forwarded to the PublicFiling that is created.
        company_cik: CIK to use directly. When truthy, the ticker file is not read.
        filepath: Tab-separated file with a ticker column and a CIK column, no header row.

    Raises:
        ValueError: If the CIK has to be looked up and the ticker is missing or does
            not match exactly one row of the ticker file.

    Note:
        Constructing an instance builds a PublicFiling without a raw filing, which
        in turn creates a FilingRequester and therefore performs HTTP requests.
    """
    def __init__(self, ticker=None, debug=False, company_cik=None, filepath='ticker.txt'):
        self.ticker = ticker
        self.debug = debug
        self.filepath = filepath
        if company_cik:
            self.company_cik = company_cik
        else:
            self.get_company_cik()
        self.most_recent_10k = PublicFiling(company_cik=self.company_cik, ticker=self.ticker, debug=self.debug)

    def __repr__(self):
        """
        Returns representation of the object
        """
        return f"{self.__class__.__name__}(ticker='{self.ticker}', company_cik='{self.company_cik}')"

    def get_company_cik(self):
        """
        Get CIK from ticker reference file and store it on ``self.company_cik``.

        The CIK is stored as a string left-padded with zeros to 10 characters.

        Raises:
            ValueError: If no ticker was supplied, or the ticker does not match
                exactly one row of the reference file.
        """
        if not self.ticker:
            # Without this guard the lookup below fails with an unhelpful
            # AttributeError on ``None.lower()``.
            raise ValueError("A ticker is required when company_cik is not provided")

        # dtype=str + keep_default_na=False: keep every cell as literal text so that
        # tickers spelled like pandas' NA markers (e.g. "NA", "NULL") stay searchable.
        df = pd.read_csv(self.filepath, sep='\t', names=["Ticker", "CIK"],
                         dtype=str, keep_default_na=False)
        df['CIK'] = df['CIK'].str.strip().str.zfill(10)

        wanted = str(self.ticker).strip().lower()
        is_match = df['Ticker'].str.strip().str.lower() == wanted
        try:
            # .item() raises ValueError unless exactly one row matched.
            self.company_cik = df.loc[is_match, 'CIK'].item()
        except ValueError as err:
            raise ValueError("Ticker not recognized") from err

class FilingRequester():
    """
    Assembles the SEC request URL for a filing and downloads its raw text.

    After construction the instance exposes ``accession_number``, ``request_url``
    and ``raw_filing`` (the response body as text).
    """
    def __init__(self, company_cik, ticker=None, debug=False, accession_number=None,
                 request_url=None, headers=None, filing_options=None, timeout=30):
        """
        FilingRequester manages the assembly of the request URL and the request itself.

        Parameters:
            company_cik: CIK inserted into the submissions URL and the archive URL.
            ticker: Stored on the instance; not used for any request.
            debug: Stored on the instance; not used for any request.
            accession_number: Accession number of the filing. When None, the most
                recent filing matching ``filing_options`` is looked up over HTTP.
            request_url: URL to download as-is. When None, the URL is assembled
                from the CIK and the accession number.
            headers: HTTP headers for every request; a default User-Agent/From
                pair is used when omitted.
            filing_options: Values accepted when filtering the filings list;
                defaults to ``["10-K"]``.
            timeout: Seconds passed to ``requests.get`` so a stalled connection
                cannot hang forever.

        Raises:
            requests.HTTPError: If a request returns an error status.
            IndexError: If no filing matches ``filing_options``.
        """
        # Set up request headers
        self.headers = headers or {
            'User-Agent': 'evolvConsulting ward.rushton@evolvconsulting.com',
            'From': 'ward.rushton@evolvconsulting.com'}

        self.company_cik = company_cik
        self.ticker = ticker
        self.debug = debug
        self.timeout = timeout
        self.filing_options = filing_options or ["10-K"]
        if accession_number is None:
            self.get_most_recent_accession_number()
        else:
            self.accession_number = accession_number

        if request_url is not None:
            self.request_url = request_url
        else:
            self.format_url_from_cik()
        # retrieve_filing_from_url() stores the text on self.raw_filing itself.
        # Its result must not be assigned here: doing so used to overwrite the
        # downloaded filing with None whenever request_url was supplied.
        self.retrieve_filing_from_url()

    def format_url_from_cik(self):
        """
        Assemble the .TXT access URL from SEC website and store it on ``self.request_url``.
        Requires ``self.company_cik`` and ``self.accession_number`` to be set.
        """
        base_url = "https://www.sec.gov/Archives/edgar/data/"
        # The folder segment is the accession number without dashes; the file
        # name keeps the dashes.
        formatted_accession_number = self.accession_number.replace('-', '')
        self.request_url = f"{base_url}{self.company_cik}/{formatted_accession_number}/{self.accession_number}.txt"

    def fetch_filings_list(self):
        """
        Fetches a list of filings for the company specified by its CIK.

        Stores the rows of ``filings.recent`` whose ``primaryDocDescription`` is
        one of ``self.filing_options`` on ``self.company_filings``.

        Raises:
            requests.HTTPError: If the request returns an error status.
        """
        base_url = "https://data.sec.gov/submissions/CIK"
        response = requests.get(f"{base_url}{self.company_cik}.json",
                                headers=self.headers, timeout=self.timeout)
        response.raise_for_status()

        filings_dict = response.json()['filings']['recent']
        filings_df = pd.DataFrame(filings_dict)
        # NOTE(review): primaryDocDescription is presumably free text; if the
        # payload also carries a form-type column, filtering on that may be more
        # reliable - confirm against the API documentation.
        self.company_filings = filings_df[filings_df['primaryDocDescription'].isin(self.filing_options)]

    def get_most_recent_accession_number(self):
        """
        Retrieves the most recent accession number for the specified filings.

        Takes the first matching row and stores it on ``self.accession_number``.

        Raises:
            IndexError: If no filing matches ``self.filing_options``.
        """
        self.fetch_filings_list()
        if self.company_filings.empty:
            raise IndexError(
                f"No filings matching {self.filing_options} found for CIK {self.company_cik}")
        # assumes the API lists filings newest-first - TODO confirm
        self.accession_number = self.company_filings['accessionNumber'].iloc[0]

    def retrieve_filing_from_url(self):
        """
        Retrieves the raw filing text from the SEC website.

        Stores the response text on ``self.raw_filing`` and also returns it.

        Raises:
            requests.HTTPError: If the request returns an error status.
        """
        response = requests.get(self.request_url, headers=self.headers, timeout=self.timeout)
        response.raise_for_status()
        self.raw_filing = response.text
        return self.raw_filing
    
class PublicFiling:
    """
    One filing of a company: holds the raw submission text, extracts the 10-K
    document from it and locates item sections inside that document.

    Attributes set by the class:
        raw_filing: Full raw submission text.
        raw_filing_dict: Raw text of extracted documents, keyed by document type.
        filing_dict: Beautified text of extracted documents, keyed by document type.
        filing_df: DataFrame of item-heading matches (None until built).
        item_1a: Unbeautified Item 1A slice (set by ``get_item_1a`` on success).
    """
    def __init__(self, ticker=None, debug=False, company_cik=None,
                 raw_filing=None, accession_number=None, filing_dict=None):
        """
        Create class that manages an instance of a PublicFiling
        Then use filingRequester to manage URL assembly
        PublicFiling should manage parsing and section match (consider subclassing Items)

        Parameters:
            ticker: Ticker symbol, kept for display only.
            debug: When true, ``dedupe_match_patterns`` prints the first matches.
            company_cik: CIK handed to FilingRequester when a download is needed.
            raw_filing: Raw submission text. When None, it is downloaded through
                FilingRequester (HTTP requests).
            accession_number: Accession number of the filing; resolved by
                FilingRequester when None and a download is needed.
            filing_dict: Optional pre-parsed documents keyed by document type.
        """
        self.company_cik = company_cik
        self.ticker = ticker
        self.debug = debug
        self.accession_number = accession_number
        self.raw_filing_dict = {}
        self.filing_dict = filing_dict or {}
        self.filing_df = None
        if raw_filing is None:
            self.filing_request = FilingRequester(company_cik=self.company_cik, debug=self.debug, accession_number=self.accession_number)
            self.accession_number = self.filing_request.accession_number
            self.raw_filing = self.filing_request.raw_filing
        else:
            # A caller-supplied filing used to be dropped, leaving the instance
            # without a raw_filing attribute.
            self.raw_filing = raw_filing

    def __repr__(self):
        return f"{self.__class__.__name__}(ticker='{self.ticker}', company_cik='{self.company_cik}', accession_number='{self.accession_number}')"

    def head(self, chars=2000):
        """Print the first ``chars`` characters of the raw filing."""
        print(self.raw_filing[0:chars])

    def get_item(self):
        """
        Not implemented yet. Planned steps:

        find_regex_matches_in_doc()
        Dedupe_regex_matches_to_drop_TOC_matches()
        get_requested_item_match()
        beautify_text()
        """
        pass

    def get_matching_document_sections(self):
        """
        Identifies and extracts sections of interest from the raw filing text.

        Stores the raw 10-K document on ``self.raw_filing_dict["10-K"]`` and its
        beautified form on ``self.filing_dict["10-K"]``. If several documents are
        typed 10-K, the last one wins.

        Raises:
            KeyError: If the raw filing contains no document of type 10-K.
        """

        doc_start_pattern = re.compile(r'<DOCUMENT>')
        doc_end_pattern = re.compile(r'</DOCUMENT>')
        type_pattern = re.compile(r'<TYPE>[^\n]+')

        doc_start_indices = [match.end() for match in doc_start_pattern.finditer(self.raw_filing)]
        doc_end_indices = [match.start() for match in doc_end_pattern.finditer(self.raw_filing)]

        for start, end in zip(doc_start_indices, doc_end_indices):
            # Look for the type tag inside this document only, so the type can
            # never be paired with the wrong document.
            type_match = type_pattern.search(self.raw_filing, start, end)
            if type_match is None:
                continue
            # strip(): the tag line may end with '\r' or trailing spaces.
            doc_type = type_match.group()[len('<TYPE>'):].strip()
            if doc_type == "10-K":
                self.raw_filing_dict[doc_type] = self.raw_filing[start:end]

        if "10-K" not in self.raw_filing_dict:
            raise KeyError("No document of type 10-K found in the raw filing")
        self.filing_dict["10-K"] = self.beautify_text(self.raw_filing_dict["10-K"])

    def dedupe_match_patterns(self):
        """
        Finds item headings (1A, 1B, 7, 7A, 8) in the beautified 10-K and prepares
        a DataFrame with the findings.

        Despite the name, repeated labels (for example table-of-contents entries)
        are kept; rows stay in document order.

        Returns:
            DataFrame indexed by normalised label (e.g. ``item1a``) with ``start``
            and ``end`` character offsets into ``self.filing_dict["10-K"]``. The
            same frame is stored on ``self.filing_df``.
        """
        if "10-K" not in self.filing_dict:
            self.get_matching_document_sections()

        # Write the regex
        dedupe_regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

        # Matches
        matches = dedupe_regex.finditer(self.filing_dict["10-K"])

        # Passing columns up front keeps this working when nothing matched.
        self.filing_df = pd.DataFrame(
            [(x.group(), x.start(), x.end()) for x in matches],
            columns=['item', 'start', 'end'])

        if not self.filing_df.empty:
            # Normalise labels: lower-case, then drop HTML space entities, any
            # whitespace (including the non-breaking space \xa0), dots and '>'.
            self.filing_df['item'] = (
                self.filing_df['item']
                .str.lower()
                .str.replace(r'&#160;|&nbsp;|\s|\.|>', '', regex=True))

        # Display the dataframe
        if self.debug:
            print(self.filing_df.head())

        #Set index to Item
        self.filing_df.set_index('item', inplace=True)
        return self.filing_df

    def _locate_item_1a(self):
        """Return (start, end) offsets of Item 1A; raise KeyError when not found."""
        labelled = list(zip(self.filing_df.index, self.filing_df['start']))
        item_1a_starts = [int(pos) for label, pos in labelled if label == 'item1a']
        item_1b_starts = [int(pos) for label, pos in labelled if label == 'item1b']
        if not item_1a_starts:
            raise KeyError('item1a')
        # Walk backwards so table-of-contents entries near the top are skipped,
        # and take the latest Item 1A heading that is followed by an Item 1B.
        for begin in reversed(item_1a_starts):
            following = [pos for pos in item_1b_starts if pos > begin]
            if following:
                return begin, min(following)
        raise KeyError('item1b')

    def get_item_1a(self):
        """
        Extracts the "Item 1A" section from the filing, if available.

        Builds the parsed 10-K and the item table first when they are missing.

        Returns:
            The beautified Item 1A text, or None (after printing a message) when
            the section boundaries cannot be found.
        """
        if "10-K" not in self.filing_dict:
            self.get_matching_document_sections()
        if getattr(self, 'filing_df', None) is None:
            self.dedupe_match_patterns()
        try:
            item_1a_start, item_1b_start = self._locate_item_1a()
        except KeyError as e:
            print(f"Item 1A not found. Error: {e}")
            return None
        self.item_1a = self.filing_dict['10-K'][item_1a_start:item_1b_start]
        return self.beautify_text(self.item_1a)

    def beautify_text(self, text):
        """
        Cleans and formats the text for better readability.

        Parses ``text`` with BeautifulSoup (lxml parser) and returns the
        prettified markup; tags are kept, not stripped.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(text, 'lxml')
        #return soup.get_text()
        return soup.prettify()
        


In [104]:
# Build the company object for ticker 'unh'; the constructor also creates the
# PublicFiling for its most recent 10-K.
# NOTE(review): the variable is named `apple` although the ticker requested is 'unh'.
apple = PublicCompany(ticker='unh')


In [105]:
# Extract the 10-K document from the raw filing; fills raw_filing_dict['10-K']
# and filing_dict['10-K'] on the filing object.
apple.most_recent_10k.get_matching_document_sections()

In [106]:
# Size, in characters, of the beautified 10-K document.
len(apple.most_recent_10k.filing_dict['10-K'])

3524948

In [91]:
# Pattern for item headings 1A, 1B, 7, 7A and 8: either '>Item <n>' (space,
# &#160; or &nbsp; as separator, optional trailing dot) or upper-case 'ITEM <n>'.
dedupe_regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')

# Lazy iterator over every heading match in the beautified 10-K; it can only be
# consumed once.
matches = dedupe_regex.finditer(apple.most_recent_10k.filing_dict['10-K'])

In [95]:
# Display the raw (not beautified) 10-K document text.
# NOTE(review): the recorded output below shows regex match objects, so it
# appears to come from a different version of this cell - confirm and re-run.
apple.most_recent_10k.raw_filing_dict['10-K']

<re.Match object; span=(91187, 91194), match='ITEM 1A'> <re.Match object; span=(157417, 157424), match='ITEM 1B'> <re.Match object; span=(167089, 167095), match='ITEM\xa07'> <re.Match object; span=(206268, 206275), match='ITEM 7A'> <re.Match object; span=(210476, 210482), match='ITEM 8'>


In [120]:
# Print the whole beautified 10-K document.
# NOTE(review): the recorded output below is a single number, which does not
# look like the output of this statement - confirm and re-run.
print(apple.most_recent_10k.filing_dict['10-K'], )

3524948


In [131]:

# Load the tab-separated ticker/CIK reference file (no header row), left-pad
# every CIK with zeros to 10 characters, and keep the first 25 tickers.
df = pd.read_csv('ticker.txt', names=["Ticker", "CIK"], sep='\t')
df['CIK'] = df['CIK'].map(lambda cik: f"{cik:0>10}")
ticker_list = df['Ticker'].tolist()[:25]

In [133]:
# Download and parse the most recent 10-K for every ticker in ticker_list.
# html_forms collects the raw 10-K document text, raw_text_forms the beautified
# version; both lists stay index-aligned. Tickers that fail are skipped
# (best effort) but recorded in failed_tickers instead of vanishing silently.
html_forms = []
raw_text_forms = []
failed_tickers = []
for ticker in ticker_list:
    try:
        company = PublicCompany(ticker=ticker)
        filing = company.most_recent_10k
        filing.get_matching_document_sections()
        raw_document = filing.raw_filing_dict['10-K']
        parsed_document = filing.filing_dict['10-K']
    except Exception as err:
        # `except Exception` (not a bare `except`) so Ctrl-C still stops the loop.
        failed_tickers.append((ticker, repr(err)))
        continue
    # Append only after both values were obtained, so the lists cannot drift apart.
    html_forms.append(raw_document)
    raw_text_forms.append(parsed_document)

if failed_tickers:
    print(f"Skipped {len(failed_tickers)} ticker(s): {failed_tickers}")

In [135]:
# Pair each raw 10-K with its beautified version, one row per filing.
# Note: this rebinds `df`, replacing the ticker table loaded earlier.
df = pd.DataFrame(list(zip(html_forms,raw_text_forms)), columns=['html_forms', 'raw_text_forms'])

In [137]:
# Write the collected filings to output.csv in the current working directory.
df.to_csv("output.csv")