In [1]:
from dotenv import load_dotenv
load_dotenv("../.env")

True

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

## `Document Ingestion`

#### Convert the docs into markdown

 - Converting the documents into markdown format improves the LLM understanding since it is a machine-friendly format.

In [3]:
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
import os
from pathlib import Path

def get_markdown(source, target_dir):

    doc_filename = Path(source).stem
    os.makedirs(target_dir, exist_ok=True)

    md_filename = os.path.join(target_dir, f"{doc_filename}.md")

    loader = DoclingLoader(file_path=source, export_type=ExportType.MARKDOWN)
    docs = loader.load()

    with open(md_filename, "w") as f:
        f.write(docs[0].page_content)
        print(f"File {md_filename} created")

    return md_filename

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
os.getcwd()

'c:\\Desktop\\AgenticRAG-Project\\code'

In [5]:
def process_documents(input_dir, output_dir):
    for root, dirs, files in os.walk(input_dir):
        # print(root, dirs, files)

        relative_path = os.path.relpath(root, input_dir)
        target_dir = os.path.join(output_dir, relative_path)
        os.makedirs(target_dir, exist_ok=True)

        for file in files:
            
            if file.lower().endswith((".pdf", ".docx", ".doc", ".pptx", ".ppt", ".xlsx", ".xls")):
                source = os.path.join(root, file)
                print(f"Processing file: {source} ->> {target_dir}")

                md_filename = get_markdown(source, target_dir)

                print(f"File {md_filename} created")

input_dir = "../dataset/docs"
output_dir = "../dataset/markdown"

process_documents(input_dir, output_dir)

Processing file: ../dataset/docs\amazon\amazon-10-q-q3-2024.pdf ->> ../dataset/markdown\amazon
File ../dataset/markdown\amazon\amazon-10-q-q3-2024.md created
File ../dataset/markdown\amazon\amazon-10-q-q3-2024.md created
Processing file: ../dataset/docs\facebook\Downloadable-BS-Q3-24.xlsx ->> ../dataset/markdown\facebook
File ../dataset/markdown\facebook\Downloadable-BS-Q3-24.md created
File ../dataset/markdown\facebook\Downloadable-BS-Q3-24.md created
Processing file: ../dataset/docs\facebook\Downloadable-PL-Q3-24.xlsx ->> ../dataset/markdown\facebook
File ../dataset/markdown\facebook\Downloadable-PL-Q3-24.md created
File ../dataset/markdown\facebook\Downloadable-PL-Q3-24.md created
Processing file: ../dataset/docs\facebook\Earnings-Presentation-Q3-2024.pdf ->> ../dataset/markdown\facebook
File ../dataset/markdown\facebook\Earnings-Presentation-Q3-2024.md created
File ../dataset/markdown\facebook\Earnings-Presentation-Q3-2024.md created
Processing file: ../dataset/docs\facebook\Meta-0

#### Loading all the markdown files in the directory

In [6]:
def read_markdown_files(input_dir):
    company_files = []

    for root, dirs, files in os.walk(input_dir):
        # Get the company name from the folder structure
        relative_path = os.path.relpath(root, input_dir)
        company_name = os.path.basename(relative_path)

        for file in files:
            if file.endswith('.md'):
                file_path = os.path.join(root, file)
                company_files.append({'company': company_name, 'file_path': file_path})

    return company_files

In [7]:
input_dir = '../dataset/markdown'
read_markdown_files(input_dir)

[{'company': 'amazon',
  'file_path': '../dataset/markdown\\amazon\\amazon-10-q-q3-2024.md'},
 {'company': 'facebook',
  'file_path': '../dataset/markdown\\facebook\\Downloadable-BS-Q3-24.md'},
 {'company': 'facebook',
  'file_path': '../dataset/markdown\\facebook\\Downloadable-PL-Q3-24.md'},
 {'company': 'facebook',
  'file_path': '../dataset/markdown\\facebook\\Earnings-Presentation-Q3-2024.md'},
 {'company': 'facebook',
  'file_path': '../dataset/markdown\\facebook\\Meta-09-30-2024-Exhibit-99-1_FINAL.md'},
 {'company': 'facebook',
  'file_path': '../dataset/markdown\\facebook\\META-Q3-2024-Earnings-Call-Transcript.md'},
 {'company': 'facebook',
  'file_path': '../dataset/markdown\\facebook\\META-Q3-2024-Follow-Up-Call-Transcript.md'},
 {'company': 'google',
  'file_path': '../dataset/markdown\\google\\goog-10-q-q3-2024.md'}]

#### Document Chunking

In [8]:
from langchain_text_splitters import MarkdownHeaderTextSplitter


def get_markdown_splits(md_filename):
    with open(md_filename, "r", encoding="utf-8") as f:
        markdown_content = f.read()

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on, strip_headers=False)
    md_header_splits = markdown_splitter.split_text(markdown_content)

    return md_header_splits

In [9]:
input_dir = '../dataset/markdown'
md_files_with_companies =  read_markdown_files(input_dir)

In [10]:
documents = []
for md_file in md_files_with_companies:
    company_name = md_file['company']
    md_file_path = md_file['file_path']

    md_header_splits = get_markdown_splits(md_file_path)

    for doc in md_header_splits:
        doc.metadata['company'] = company_name

    documents.extend(md_header_splits)

In [11]:
documents

[Document(metadata={'Header 2': 'UNITED STATES', 'company': 'amazon'}, page_content='## UNITED STATES'),
 Document(metadata={'Header 2': 'SECURITIES AND EXCHANGE COMMISSION', 'company': 'amazon'}, page_content='## SECURITIES AND EXCHANGE COMMISSION  \nWashington, D.C. 20549  \n\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_  \nFORM 10-Q  \n\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_  \n(Mark One)  \n☒  \nQUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934  \nFor the quarterly period ended September 30, 2024  \nor  \n☐  \nTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934  \nFor the transition period from            to             .  \nCommission File No. 000-22513  \n\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_\\_'),
 Document(me

In [13]:
## Total chunks created
print(f"Total chunks created: {len(documents)}")
# len(documents)

Total chunks created: 464


### Document Vector Embedding

In [19]:
from langchain_ollama import OllamaEmbeddings

import faiss
from langchain_community.vectorstores import FAISS 
from langchain_community.docstore.in_memory import InMemoryDocstore


embeddings = OllamaEmbeddings(model="nomic-embed-text", base_url="http://localhost:11434")

index = faiss.IndexFlatL2(len(embeddings.embed_query("test")))
vector_store = FAISS(embedding_function= embeddings,
                     index=index,
                     docstore=InMemoryDocstore(),
                     index_to_docstore_id={})


vector_store.add_documents(documents)

vector_store.index.ntotal, vector_store.index.d

(464, 768)