In [1]:
from langchain.document_loaders import DirectoryLoader



In [2]:
loader = DirectoryLoader('sample_filings', show_progress=True)

In [3]:
pages = loader.load()

 25%|███████████▎                                 | 1/4 [01:05<03:17, 65.92s/it]


In [4]:
len(pages)

1

In [5]:
# pages[0].page_content[0:500]

___

In [18]:
import re
from langchain.text_splitter import TextSplitter
import pandas as pd
from bs4 import BeautifulSoup


In [43]:
class CustomTextSplitter(TextSplitter):

    def split_text(self, document):
        text = document.page_content 
        
        # Extract metadata
        filed_as_of_date_match = re.search(r'FILED AS OF DATE:\s*(\d+)', text)
        filed_as_of_date = filed_as_of_date_match.group(1) if filed_as_of_date_match else None
        
        company_name_match = re.search(r'COMPANY CONFORMED NAME:\s*([^\n]+)', text)
        company_name = company_name_match.group(1).strip() if company_name_match else None

        sic_match = re.search(r'STANDARD INDUSTRIAL CLASSIFICATION:\s*([^\n]+)', text)
        sic = sic_match.group(1).strip() if sic_match else None

        state_match = re.search(r'STATE:\s*([^\n]+)', text)
        state = state_match.group(1).strip() if state_match else None
        
        # Regex to find <DOCUMENT> tags
        doc_start_pattern = re.compile(r'<DOCUMENT>')
        doc_end_pattern = re.compile(r'</DOCUMENT>')
        type_pattern = re.compile(r'<TYPE>[^\n]+')

        
        doc_start_is = [x.end() for x in doc_start_pattern.finditer(text)]
        doc_end_is = [x.start() for x in doc_end_pattern.finditer(text)]
        doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(text)]
        
        
        document = {}

        # Create a loop to go through each section type and save only the 10-K section in the dictionary
        for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
            if doc_type == '10-K':
                document[doc_type] = text[doc_start:doc_end]
                
                
        regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|2|3|4|5|6|7|7A|8|9A|9B|9|[1][0-5])\.{0,1})|(ITEM(\s|&#160;|&nbsp;)(1A|1B|2|3|4|5|6|7|7A|8|9A|9B|9|[1][0-5])\.{0,1})')
        matches = regex.finditer(document['10-K'])
        # Create the dataframe
        test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

        test_df.columns = ['item', 'start', 'end']
        test_df['item'] = test_df.item.str.lower()
        
        # Get rid of unnesesary charcters from the dataframe
        test_df.replace('&#160;',' ',regex=True,inplace=True)
        test_df.replace('&nbsp;',' ',regex=True,inplace=True)
        test_df.replace(' ','',regex=True,inplace=True)
        test_df.replace('\.','',regex=True,inplace=True)
        test_df.replace('>','',regex=True,inplace=True)
        
        # Drop duplicates
        pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')
        
        sections = ['item1a','item1b','item2','item3','item4','item5','item6','item7',
                 'item8','item9a','item9b','item9','item10','item11','item12','item13','item14',
                 'item15']        
        
        cleaned_sections = []

        for index, value in enumerate(sections):
            if value == 'item15':
                break
            cur_start = pos_dat[pos_dat['item'] == value]['start'].values[0]
            next_start = pos_dat[pos_dat['item'] == sections[index+1]]['start'].values[0]
            
            section = document['10-K'][cur_start:next_start]
                        
            # Clean up the date using BeautifulSoup
            content = BeautifulSoup(section, "lxml")

            # Append metadata with each section
            metadata = {
                "company": company_name,
                "date": filed_as_of_date,
                "sic": sic,
                "state": state
                
            }

            cleaned_section = {
                "content": content.get_text("\n\n"),
                "metadata": metadata
            }

            cleaned_sections.append(cleaned_section)

        return cleaned_sections
    

In [44]:
type(pages[0].page_content)

str

In [45]:
# Convert the Langchain Document to text
# document_text = pages[0].page_content
custom_splitter = CustomTextSplitter()

# Split the document's text into sections, clean up the date, and append metadata
sections = custom_splitter.split_text(pages[0])

['item1a', 'item1b', 'item2', 'item3', 'item4', 'item5', 'item6', 'item7', 'item8', 'item9a', 'item9b', 'item9', 'item10', 'item11', 'item12', 'item13', 'item14', 'item15']


In [61]:
sections[15]

{'content': '>Item 13.\xa0\xa0\xa0\xa0Certain Relationships and Related Transactions, and Director Independence\n\nThe information required by this Item will be included in the 2023 Proxy Statement, and is incorporated herein by reference.',
 'metadata': {'company': 'Apple Inc.',
  'date': '20221028',
  'sic': 'ELECTRONIC COMPUTERS [3571]',
  'state': 'CA'}}