In [199]:
from langchain.document_loaders import DirectoryLoader
import re
from langchain.text_splitter import TextSplitter
import pandas as pd
from bs4 import BeautifulSoup
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [244]:
import warnings
warnings.filterwarnings('ignore')

In [245]:
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load it; the return value is bound to `_` so the
# cell displays nothing.
# NOTE(review): presumably this supplies the API credentials used by the
# OpenAI-backed cells further down — confirm which variables are expected.
_ = load_dotenv(find_dotenv())

## Load Docs

In [144]:
# Loader over the local `sample_filings` directory, with a progress bar enabled.
loader = DirectoryLoader('sample_filings', show_progress=True)

In [145]:
# Load the filings into memory.
# NOTE(review): the saved progress bar stops at 1/5 (about 136 s per file) and
# the next cell reports a single document — confirm whether the load was
# interrupted or four of the five files failed to parse.
pages = loader.load()

 20%|████████▊                                   | 1/5 [02:16<09:05, 136.38s/it]


In [146]:
# How many documents were actually loaded.
len(pages)

1

## Custom Splitting

In [210]:
import re
import pandas as pd
from bs4 import BeautifulSoup

class CustomTextSplitter(TextSplitter):
    """Split one SEC full-submission filing into its 10-K "Item" sections.

    Unlike the base splitter, ``split_text`` takes a document object (anything
    with a ``page_content`` string) and returns a list of dicts of the form
    ``{"content": <plain text>, "metadata": {...}}``, one per item section that
    could be located in the 10-K body.
    """

    # Normalised item keys (lower-case, no whitespace or dots) in filing order.
    # The 1-based position in this tuple is the sort order used by
    # ``get_sections_dataframe``.
    _SECTIONS = (
        'item1', 'item1a', 'item1b', 'item2', 'item3', 'item4', 'item5', 'item6', 'item7',
        'item7a', 'item8', 'item9', 'item9a', 'item9b', 'item9c', 'item10', 'item11', 'item12',
        'item13', 'item14', 'item15', 'item16',
    )

    # Human-readable title stored in each section's metadata.
    _SECTION_TITLES = {
        'item1': 'Business',
        'item1a': 'Risk Factors',
        'item1b': 'Unresolved Staff Comments',
        'item2': 'Properties',
        'item3': 'Legal Proceedings',
        'item4': 'Mine Safety Disclosures',
        'item5': 'Market for Registrant’s Common Equity, Related Stockholder Matters and Issuer Purchases of Equity Securities',
        'item6': 'Selected Financial Data',
        'item7': 'Management’s Discussion and Analysis of Financial Condition and Results of Operations',
        'item7a': 'Quantitative and Qualitative Disclosures About Market Risk',
        'item8': 'Financial Statements and Supplementary Data',
        'item9': 'Changes in and Disagreements With Accountants on Accounting and Financial Disclosure',
        'item9a': 'Controls and Procedures',
        'item9b': 'Other Information',
        'item9c': 'Disclosure Regarding Foreign Jurisdictions that Prevent Inspections',
        'item10': 'Directors, Executive Officers and Corporate Governance',
        'item11': 'Executive Compensation',
        'item12': 'Security Ownership of Certain Beneficial Owners and Management and Related Stockholder Matters',
        'item13': 'Certain Relationships and Related Transactions, and Director Independence',
        'item14': 'Principal Accountant Fees and Services',
        'item15': 'Exhibits and Financial Statement Schedules',
        'item16': 'Form 10-K Summary',
    }

    # Matches ">Item <n>" or "ITEM <n>" headings, where the separator is
    # whitespace or an HTML non-breaking space entity.
    # NOTE: the "." in `1.`, `7.`, `8.` and `9.` is an unescaped wildcard, so
    # those alternatives consume one extra arbitrary character (which is how
    # e.g. "9C" and "16" end up matching). Left as-is to keep matching stable.
    _ITEM_HEADING = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1.|1A|1B|2|3|4|5|6|7.|7A|8.|9A|9B|9.|[1][0-5])\.{0,1})|(ITEM(\s|&#160;|&nbsp;)(1.|1A|1B|2|3|4|5|6|7.|7A|8.|9A|9B|9.|[1][0-5])\.{0,1})')

    def split_text(self, document):
        """Extract filing metadata and the cleaned 10-K item sections.

        Args:
            document: object exposing the raw filing text as ``page_content``.

        Returns:
            list[dict]: ``{"content": str, "metadata": dict}`` per located
            section, in filing order. Empty when the filing contains no
            ``<DOCUMENT>`` block whose ``<TYPE>`` is ``10-K``.
        """
        text = document.page_content

        # Header fields; each value is None when its label is not present.
        metadata = {
            "company": self.extract_metadata(r'COMPANY CONFORMED NAME:\s*([^\n]+)', text),
            "date": self.extract_metadata(r'FILED AS OF DATE:\s*(\d+)', text),
            "sic": self.extract_metadata(r'STANDARD INDUSTRIAL CLASSIFICATION:\s*([^\n]+)', text),
            "state": self.extract_metadata(r'STATE:\s*([^\n]+)', text)
        }

        ten_k_text = self.extract_10k(text)
        if ten_k_text is None:
            # Nothing to split; previously this fell through and raised a
            # TypeError inside the regex search.
            return []

        sections_df = self.get_sections_dataframe(ten_k_text)
        return self.extract_cleaned_sections(ten_k_text, sections_df, metadata)

    def extract_metadata(self, pattern, text):
        """Return the stripped first capture group of ``pattern`` in ``text``, or None."""
        match = re.search(pattern, text)
        return match.group(1).strip() if match else None

    def extract_10k(self, text):
        """Return the body of the first ``<DOCUMENT>`` whose ``<TYPE>`` is ``10-K``.

        The i-th ``<DOCUMENT>``, ``</DOCUMENT>`` and ``<TYPE>`` tags are paired
        positionally. Returns None when no 10-K document is found.
        """
        doc_starts = [m.end() for m in re.finditer(r'<DOCUMENT>', text)]
        doc_ends = [m.start() for m in re.finditer(r'</DOCUMENT>', text)]
        # Strip so trailing spaces or a carriage return do not defeat the comparison.
        doc_types = [m.group(1).strip() for m in re.finditer(r'<TYPE>([^\n]+)', text)]

        for doc_type, doc_start, doc_end in zip(doc_types, doc_starts, doc_ends):
            if doc_type == '10-K':
                return text[doc_start:doc_end]

        return None

    def get_sections_dataframe(self, ten_k_text):
        """Locate item headings in the 10-K text.

        Returns:
            pandas.DataFrame with columns ``item`` (normalised key such as
            ``item1a``), ``start``/``end`` (character offsets of the heading)
            and ``order`` (1-based filing order, NaN for unrecognised keys),
            holding the last occurrence of each item and sorted by ``order``.
        """
        matches = self._ITEM_HEADING.finditer(ten_k_text)
        df = pd.DataFrame([(m.group(), m.start(), m.end()) for m in matches],
                          columns=['item', 'start', 'end'])

        # Normalise headings to keys. A raw string avoids the invalid "\."
        # escape, and stripping all whitespace (not just spaces) covers
        # headings the pattern matched through a tab or newline.
        df['item'] = df['item'].str.lower().str.replace(r'&#160;|&nbsp;|\s|\.|>', '', regex=True)

        # Keep the last occurrence of each item: earlier ones are typically
        # table-of-contents entries rather than the section itself.
        df = df.groupby('item').last().reset_index()

        order = {name: position for position, name in enumerate(self._SECTIONS, start=1)}
        df['order'] = df['item'].map(order)

        return df.sort_values(by=['order']).reset_index(drop=True)

    def extract_cleaned_sections(self, ten_k_text, sections_df, metadata):
        """Slice the 10-K text into item sections and strip the markup.

        Each section runs from its own heading to the heading of the nearest
        later item that was actually found (and lies further into the text),
        or to the end of the text when there is none.

        Args:
            ten_k_text: raw 10-K body.
            sections_df: frame produced by ``get_sections_dataframe``.
            metadata: filing-level metadata copied into every section.

        Returns:
            list[dict]: ``{"content": str, "metadata": dict}`` in filing
            order; ``metadata`` gains a ``title`` entry per section.
        """
        # First recorded start offset for every recognised item.
        starts = {}
        for item, start in zip(sections_df['item'], sections_df['start']):
            if item in self._SECTION_TITLES and item not in starts:
                starts[item] = int(start)

        cleaned_sections = []

        for position, section_name in enumerate(self._SECTIONS):
            current_start = starts.get(section_name)
            if current_start is None:  # section not present in this filing
                continue

            # Skip over absent items instead of running to the end of the
            # document, which would swallow every later section.
            next_start = len(ten_k_text)
            for later_name in self._SECTIONS[position + 1:]:
                later_start = starts.get(later_name)
                if later_start is not None and later_start > current_start:
                    next_start = later_start
                    break

            current_metadata = metadata.copy()
            current_metadata["title"] = self._SECTION_TITLES.get(section_name, "")

            section_content = ten_k_text[current_start:next_start]
            content = BeautifulSoup(section_content, "lxml").get_text("\n\n")

            cleaned_sections.append({
                "content": content,
                "metadata": current_metadata
            })

        return cleaned_sections

  df['item'] = df['item'].str.lower().str.replace('&#160;|&nbsp;| |\.', '', regex=True).str.replace('>', '', regex=False)


In [211]:
# Instantiate the 10-K section splitter defined above with its default settings.
custom_splitter = CustomTextSplitter()

In [212]:
# Split the first (and, per the saved output, only) loaded filing into item sections.
sections = custom_splitter.split_text(pages[0])

In [219]:
# Spot-check the metadata attached to the sixth extracted section.
sections[5]['metadata']

{'company': 'AGILENT TECHNOLOGIES, INC.',
 'date': '20221221',
 'sic': 'LABORATORY ANALYTICAL INSTRUMENTS [3826]',
 'state': 'CA',
 'title': 'Mine Safety Disclosures'}

In [236]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, measured in characters (sections are compared against
# chunk_size with len()).
chunk_size = 1000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# One Document per chunk, each tagged with its section's metadata.
documents = []

for section in sections:
    content = section['content']

    # Sections longer than chunk_size are split; shorter ones are kept whole
    # and untouched.
    if len(content) > chunk_size:
        chunks = text_splitter.split_text(content)
    else:
        chunks = [content]

    for chunk in chunks:
        # Give every Document its own copy of the metadata so that changing
        # one chunk's metadata later cannot silently change its siblings'.
        documents.append(Document(page_content=chunk, metadata=dict(section['metadata'])))

## Embedding and Vector Stores

In [252]:
# Embedding model with default settings.
# NOTE(review): presumably picks up the OpenAI key from the environment loaded
# by the dotenv cell — confirm.
embeddings = OpenAIEmbeddings()

In [253]:
import os

import weaviate

# Credentials are read from the environment (e.g. the .env file loaded by the
# dotenv cell) rather than being embedded in the notebook.
# SECURITY: an API key was previously hardcoded in this cell; treat that key
# as leaked and rotate it.
# Raises KeyError if WEAVIATE_API_KEY is not set.
auth_config = weaviate.AuthApiKey(api_key=os.environ["WEAVIATE_API_KEY"])

# WEAVIATE_URL may override the cluster endpoint; the default is the original one.
client = weaviate.Client(
  url=os.environ.get("WEAVIATE_URL", "https://streamlit-hackathon-fkg18d0d.weaviate.network"),
  auth_client_secret=auth_config
)

In [255]:
# Display the client object to confirm it was constructed.
client

<weaviate.client.Client at 0x2887cf8e0>

In [256]:
# Build the Weaviate-backed vector store from the chunked documents, using the
# embedding model and client created above.
vector_store = Weaviate.from_documents(documents, embeddings, client=client)

In [257]:
# Run a plain similarity search for the test question to check retrieval on its own.
query = "When was Agilent Technologies incorporated?"
docs = vector_store.similarity_search(query)

In [258]:
# Number of chunks returned by the similarity search.
len(docs)

4

In [259]:
# Chat model used to answer questions; pinned to a specific model snapshot with a low temperature.
chat_model = ChatOpenAI(model_name="gpt-3.5-turbo-0613", temperature=0.2)

In [260]:
from langchain.chains import RetrievalQA

# Build a retrieval QA chain that uses the vector store as its retriever, then
# run the test question through it; the returned dict is the cell's output.
qa_chain = RetrievalQA.from_chain_type(chat_model,retriever=vector_store.as_retriever())
qa_chain({"query": query})

{'query': 'When was Agilent Technologies incorporated?',
 'result': 'Agilent Technologies Inc. was incorporated in May 1999.'}