# Testing AzureAIDocumentIntelligenceLoader in LangChain

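Run this section only once: it sends the PDF to the Azure AI Document Intelligence service for parsing, and the parsed result is saved to `azure_ai_parse_result.md` so the later sections can reuse it.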

In [1]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from dotenv import load_dotenv
import os

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

file_path = "/home/yixin0829/minmod/minmod-poc/data/raw/mvt_zinc/reports_processed/Bongará Zn 3-2019.pdf"
# Index os.environ directly so that a missing setting fails right here with a
# KeyError naming the variable, instead of handing None to the loader.
endpoint = os.environ["AZURE_PDF_PARSING_ENDPOINT"]
key = os.environ["AZURE_API_KEY"]
# "prebuilt-layout" is the Document Intelligence model requested for parsing.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint, api_key=key, file_path=file_path, api_model="prebuilt-layout"
)

# Parse the PDF; the result is a list of Document objects.
documents = loader.load()

In [7]:
# Number of Document objects returned by loader.load().
len(documents)

1

In [8]:
# Echo the loaded documents to inspect the parsed content.
documents

[Document(page_content='TECHNICAL REPORT ON THE BONGARÁ ZINC PROJECT YAMBRASBAMBA DISTRICT, AMAZONAS REGION, NORTHERN PERU FOR ZINC ONE RESOURCES INC.\n\nprepared by:\n\nAlbert W. (Al) Workman, P.Geo. Senior Geologist, and Vice-President, Operations and\n\nJohn Reddick, P.Geo. Senior Associate Resource Geologist\n\nEffective Date: 11 March, 2019 Toronto, Canada\n\n<figure>\n\n![](figures/0)\n\n<!-- FigureContent="Watts, Griffis and McOuat Geological and Mining Consultants" -->\n\n</figure>\n\n<figure>\n\n![](figures/1)\n\n<!-- FigureContent="Watts, Griffis and McOuat" -->\n\n</figure>\n\n\n| TABLE OF CONTENTS | Page |\n| - | - |\n| 1. SUMMARY .... | 1 |\n| 2. INTRODUCTION. ....... .......... | 9 |\n| 2.1 GENERAL | 9 |\n| 2.2 TERMS OF REFERENCE | 11 |\n| 2.3 SOURCES OF INFORMATION | 11 |\n| 2.3 UNITS AND CURRENCY | 12 |\n| 2.4 RISK FACTORS | 12 |\n| 3. RELIANCE ON OTHER EXPERTS .......... ..... | 14 |\n| 4. PROPERTY DESCRIPTION AND LOCATION | 15 |\n| 4.1 GENERAL LOCATION | 15 |\n| 4.2 P

In [16]:
# Persist the parsed markdown so it can be reloaded from disk without calling
# Azure again. The text contains non-ASCII characters (e.g. "Bongará"), so pin
# the encoding instead of relying on the platform default.
with open("./azure_ai_parse_result.md", "w", encoding="utf-8") as f:
    f.write(documents[0].page_content)

# Markdown splitter

In [1]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Read the markdown file produced by the Azure AI parse. The report contains
# non-ASCII text, so decode explicitly as UTF-8 rather than with the platform
# default encoding.
with open("./azure_ai_parse_result.md", "r", encoding="utf-8") as f:
    markdown_document = f.read()

# Split only on level-2 and level-3 headers; each entry is
# (markdown header prefix, metadata key).
# NOTE(review): the level-1 rule is intentionally disabled. The original note
# suggests the parsed output marks top-level sections as numbered text ("2\.")
# rather than with "#" - confirm against the parsed file.
headers_to_split_on = [
    # ("#", "Header 1"),  # In markdown, 2.\ is used for header 1
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)
# Preview the first five splits.
md_header_splits[:5]

[Document(page_content='TECHNICAL REPORT ON THE BONGARÁ ZINC PROJECT YAMBRASBAMBA DISTRICT, AMAZONAS REGION, NORTHERN PERU FOR ZINC ONE RESOURCES INC.  \nprepared by:  \nAlbert W. (Al) Workman, P.Geo. Senior Geologist, and Vice-President, Operations and  \nJohn Reddick, P.Geo. Senior Associate Resource Geologist  \nEffective Date: 11 March, 2019 Toronto, Canada  \n<figure>  \n![](figures/0)  \n<!-- FigureContent="Watts, Griffis and McOuat Geological and Mining Consultants" -->  \n</figure>  \n<figure>  \n![](figures/1)  \n<!-- FigureContent="Watts, Griffis and McOuat" -->  \n</figure>  \n| TABLE OF CONTENTS | Page |\n| - | - |\n| 1. SUMMARY .... | 1 |\n| 2. INTRODUCTION. ....... .......... | 9 |\n| 2.1 GENERAL | 9 |\n| 2.2 TERMS OF REFERENCE | 11 |\n| 2.3 SOURCES OF INFORMATION | 11 |\n| 2.3 UNITS AND CURRENCY | 12 |\n| 2.4 RISK FACTORS | 12 |\n| 3. RELIANCE ON OTHER EXPERTS .......... ..... | 14 |\n| 4. PROPERTY DESCRIPTION AND LOCATION | 15 |\n| 4.1 GENERAL LOCATION | 15 |\n| 4.2 PRO

In [2]:
# Check which class the splitter returns for each split.
type(md_header_splits[0])

langchain_core.documents.base.Document

In [3]:
import tiktoken
import numpy as np

# Build the tokenizer once: it is the same for every split, so looking it up
# inside the loop only repeats work.
encoding = tiktoken.get_encoding(encoding_name="cl100k_base")

# Token count of each markdown header split, in split order.
token_counts = [
    len(encoding.encode(split.page_content)) for split in md_header_splits
]

# The "original file" figure is the sum of the per-split counts.
print(f"original file token count: {np.sum(token_counts)}")
# calculate the average token count in each md header split
print(f"avg token count: {round(np.mean(token_counts))}")
print(f"max token count: {np.max(token_counts)}")
print(f"min token count: {np.min(token_counts)}")


original file token count: 118629
avg token count: 1276
max token count: 14147
min token count: 16


# Vector stores

In [4]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores.chroma import Chroma

# Toggle: True reopens the Chroma index already persisted on disk; False
# embeds the markdown header splits and persists a new index.
LOAD_CACHED_EMBEDDINGS = True
embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")

# Single location used for both loading and persisting the index.
chroma_dir = "/home/yixin0829/minmod/minmod-poc/.chroma_db"

if not LOAD_CACHED_EMBEDDINGS:
    # Build the vector database from the documents and persist it.
    vector_db = Chroma.from_documents(
        md_header_splits, embedding_function, persist_directory=chroma_dir
    )
else:
    # Reopen the index stored on disk.
    vector_db = Chroma(
        persist_directory=chroma_dir, embedding_function=embedding_function
    )

# Expose the vector database through the retriever interface.
retriever = vector_db.as_retriever()

In [7]:
# Write the retrieved documents to a file for inspection.
# Run the query before opening the file so that a failed retrieval does not
# open/create the output file.
retrieved_docs = retriever.invoke("mineral resources and reserves tonnage")

# Append mode is kept as in the original, so repeated runs accumulate in the
# same file. The encoding is pinned because the report text is non-ASCII.
with open("./azure_retrieval_result.txt", "a", encoding="utf-8") as f:
    f.writelines(doc.page_content for doc in retrieved_docs)

# Build parallel extraction chain

In [11]:
import sys
sys.path.append("/home/yixin0829/minmod/minmod-poc/")
from pydantic.v1 import BaseModel
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from src.schema.mineral_site import (
    BasicInfo,
    DepositTypeCandidates,
    LocationInfo,
    MineralInventory,
    MineralSite,
)

# Prompt template text for the extraction chain. It has two placeholders:
# {query} (the information of interest to extract) and {format_instructions}
# (presumably filled from the output parser - confirm where the chain is built).
SYS_PROMPT: str = """You extract information of interest from a given mineral report in structured JSON formats. The information of interest includes {query}.

{format_instructions}"""

def extraction_chain_factory(query: str, output_schema: BaseModel):
    prompt = ChatPromptTemplate.from_template(SYS_PROMPT)
    output_parser = Py
