# Testing AzureDocLoader in LangChain

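Run the cells in this section only once: each run re-submits the PDF to Azure AI Document Intelligence and overwrites the saved Markdown output.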

In [1]:
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from dotenv import load_dotenv
import os

# Load environment variables (the Azure credentials below) from a .env file.
load_dotenv()

endpoint = os.getenv("AZURE_DI_ENDPOINT")
key = os.getenv("AZURE_DI_API_KEY")
file_path = "data/raw/mvt_zinc/reports_processed/Bleiberg Pb Zn 5-2017.pdf"

# Configure the loader for a single PDF, requesting the "prebuilt-layout" model.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=endpoint,
    api_key=key,
    file_path=file_path,
    api_model="prebuilt-layout",
)

# Run the parse; the result is a list of LangChain Document objects.
documents = loader.load()

In [2]:
len(documents)

1

In [3]:
documents

[Document(page_content='MAMMOTH GEOLOGICAL LTD. 2446 Bidston Road, Mill Bay, B.C. Canada V0R 2P4\n\nPhone: (250) 743-8228 Fax: (250) 743-4430 email : mammothgeo@shaw.ca\n\nTECHNICAL REPORT BLEIBERG PROPERTY\n\nLocated in the (Bad Bleiberg Area, Southern Austria)\n\nMap Sheet BLEIBERG FB050 WGS 84 Zone 33N 394000E 5164500N\n\nFOR\n\nTasca Resources Ltd. Suite 830 - 1100 Melville Street Vancouver, British Columbia V6E 4A6\n\nR. Tim Henneberry, P.Geo. February 15, 2017 Revised May 8, 2017\n\n-2- SUMMARY\n===\n\nThe Bleiberg property consists of 116 exploration licenses totaling 6,582.4 hectares, 130 kilometres southeast of the city of Salzburg, Austria. Tasca Resources Ltd. is earning a 100% interest in 116 exploration licenses from Samarium Borealis Corporation of Vancouver, British Columbia by making a $60,000 cash payment on TSX Venture Exchange approval and a further $60,000 cash payment within two years of TSX Venture Exchange approval. There are no additional payments or royalties.\

In [4]:
# Save the parsed report as Markdown for later inspection.
# UTF-8 is set explicitly so non-ASCII characters in the report text are
# written the same way regardless of the platform's default encoding.
with open("./Bleiberg_Pb_Zn_5-2017.md", "w", encoding="utf-8") as f:
    f.write(documents[0].page_content)

In [6]:
def extract_pdf(file_path, output_dir="data/asset/parsed_pdf_azure"):
    """Parse one PDF with Azure AI Document Intelligence and save it as Markdown.

    The output file is named after the PDF: the base name without its
    extension, spaces replaced by underscores, plus ``.md``. It is written
    into ``output_dir`` and overwrites any existing file of the same name.

    Relies on the module-level ``endpoint`` and ``key`` credentials.

    Args:
        file_path: Path of the PDF to parse.
        output_dir: Directory that receives the ``.md`` file. It is created
            if it does not exist.

    Returns:
        None. The result is the file written to disk.
    """
    # Create the target directory before calling the remote service, so a bad
    # output location fails fast instead of after the parse has been run.
    os.makedirs(output_dir, exist_ok=True)

    loader = AzureAIDocumentIntelligenceLoader(
        api_endpoint=endpoint, api_key=key, file_path=file_path, api_model="prebuilt-layout"
    )
    documents = loader.load()

    stem = os.path.splitext(os.path.basename(file_path))[0].replace(" ", "_")
    output_file = os.path.join(output_dir, stem + ".md")

    # Only the first document is saved.
    # NOTE(review): assumes the loader returns the whole PDF as a single
    # document (as observed for the Bleiberg report) - confirm for other modes.
    # UTF-8 is explicit because report text can contain non-ASCII characters.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(documents[0].page_content)

# PDFs to parse in this batch run.
# Commented-out entries are skipped by the loop below.
# NOTE(review): presumably those were parsed in an earlier run - confirm
# before re-enabling, since each entry triggers a new Azure parse.
pdf_w_ground_truth = [
    # "data/raw/mvt_zinc/reports_processed/Bleiberg Pb Zn 5-2017.pdf",
    # "data/raw/mvt_zinc/reports_processed/Bongará Zn 3-2019.pdf",
    # "data/raw/mvt_zinc/reports_failed/Daniels Harbour Zn 12-2017.pdf",
    # "data/raw/mvt_zinc/reports_processed/Hakkari Zn 3-2010.pdf",
    # "data/raw/mvt_zinc/reports_processed/Hakkari Zn 7-2013.pdf",
    "data/raw/mvt_zinc/reports_processed/Hakkira Zn 4-2011.pdf",
    "data/raw/mvt_zinc/reports_processed/Mehdiabad Zn 3-2005.pdf",
    "data/raw/mvt_zinc/reports_failed/Prairie Creek Zn Pb Ag 9-2017 FS.pdf",
    "data/raw/mvt_zinc/reports_processed/Reocin Zn 3-2002.pdf",
]

# Parse each PDF in turn, printing the path first so progress is visible.
for pdf in pdf_w_ground_truth:
    print("processing: ", pdf)
    extract_pdf(pdf)

processing:  data/raw/mvt_zinc/reports_processed/Hakkira Zn 4-2011.pdf
processing:  data/raw/mvt_zinc/reports_processed/Mehdiabad Zn 3-2005.pdf
processing:  data/raw/mvt_zinc/reports_failed/Prairie Creek Zn Pb Ag 9-2017 FS.pdf
processing:  data/raw/mvt_zinc/reports_processed/Reocin Zn 3-2002.pdf


# Markdown splitter

In [1]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Read the markdown file produced by the Azure AI parse step.
# UTF-8 is set explicitly: the report contains non-ASCII text (e.g. "BONGARÁ"),
# which would be mis-decoded on platforms whose default encoding is not UTF-8.
with open("./azure_ai_parse_result.md", "r", encoding="utf-8") as f:
    markdown_document = f.read()

# Heading levels to split on, paired with the metadata key for each level.
headers_to_split_on = [
    # Level-1 splitting is disabled. Original note: "In markdown, 2.\ is used
    # for header 1".
    # NOTE(review): presumably top-level sections appear in the parsed file as
    # escaped numbers (e.g. "2\.") rather than "#" headings - confirm.
    # ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)
# Trailing expression so the notebook displays the first five chunks.
md_header_splits[:5]

[Document(page_content='TECHNICAL REPORT ON THE BONGARÁ ZINC PROJECT YAMBRASBAMBA DISTRICT, AMAZONAS REGION, NORTHERN PERU FOR ZINC ONE RESOURCES INC.  \nprepared by:  \nAlbert W. (Al) Workman, P.Geo. Senior Geologist, and Vice-President, Operations and  \nJohn Reddick, P.Geo. Senior Associate Resource Geologist  \nEffective Date: 11 March, 2019 Toronto, Canada  \n<figure>  \n![](figures/0)  \n<!-- FigureContent="Watts, Griffis and McOuat Geological and Mining Consultants" -->  \n</figure>  \n<figure>  \n![](figures/1)  \n<!-- FigureContent="Watts, Griffis and McOuat" -->  \n</figure>  \n| TABLE OF CONTENTS | Page |\n| - | - |\n| 1. SUMMARY .... | 1 |\n| 2. INTRODUCTION. ....... .......... | 9 |\n| 2.1 GENERAL | 9 |\n| 2.2 TERMS OF REFERENCE | 11 |\n| 2.3 SOURCES OF INFORMATION | 11 |\n| 2.3 UNITS AND CURRENCY | 12 |\n| 2.4 RISK FACTORS | 12 |\n| 3. RELIANCE ON OTHER EXPERTS .......... ..... | 14 |\n| 4. PROPERTY DESCRIPTION AND LOCATION | 15 |\n| 4.1 GENERAL LOCATION | 15 |\n| 4.2 PRO

In [2]:
type(md_header_splits[0])

langchain_core.documents.base.Document

In [3]:
import tiktoken
import numpy as np

# Token statistics for the header-split chunks, using the cl100k_base encoding.
# The encoding lookup does not depend on the chunk, so it is done once here
# rather than on every loop iteration.
encoding = tiktoken.get_encoding(encoding_name="cl100k_base")
token_counts = [len(encoding.encode(chunk.page_content)) for chunk in md_header_splits]

# This total is the sum over the split chunks.
# NOTE(review): it may differ slightly from a count over the raw file if the
# splitter drops header lines from the chunk text - confirm.
print(f"original file token count: {np.sum(token_counts)}")
# calculate the average token count in each md header split
print(f"avg token count: {round(np.mean(token_counts))}")
print(f"max token count: {np.max(token_counts)}")
print(f"min token count: {np.min(token_counts)}")


original file token count: 118629
avg token count: 1276
max token count: 14147
min token count: 16


# Vector stores

In [4]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores.chroma import Chroma

embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")
LOAD_CACHED_EMBEDDINGS = True  # load cached embeddings from disk
# One definition of the on-disk Chroma location, shared by both branches so
# the load path and the build path cannot drift apart.
CHROMA_PERSIST_DIR = "/home/yixin0829/minmod/minmod-poc/.chroma_db"

if LOAD_CACHED_EMBEDDINGS:
    # Reopen the store persisted by an earlier run.
    # NOTE(review): if the directory is missing this presumably yields an empty
    # store rather than an error - confirm, and set the flag to False to rebuild.
    vector_db = Chroma(
        persist_directory=CHROMA_PERSIST_DIR,
        embedding_function=embedding_function,
    )
else:
    # Embed the header-split chunks and persist a new store.
    vector_db = Chroma.from_documents(
        md_header_splits,
        embedding_function,
        persist_directory=CHROMA_PERSIST_DIR,
    )

# Create a retriever from the vector database
retriever = vector_db.as_retriever()

In [7]:
# Write the retrieved documents to a file for inspection.
# Retrieval runs before the file is opened, so a failed query does not create
# or touch the output file.
retrieved_docs = retriever.invoke("mineral resources and reserves tonnage")

# Append mode keeps results from earlier runs. UTF-8 is explicit because the
# report text can contain non-ASCII characters.
with open("./azure_retrieval_result.txt", "a", encoding="utf-8") as f:
    for doc in retrieved_docs:
        f.write(doc.page_content)
        # Blank line between chunks so the boundaries are visible when reading.
        f.write("\n\n")

# Build parallel extraction chain

In [11]:
import sys
sys.path.append("/home/yixin0829/minmod/minmod-poc")
from pydantic.v1 import BaseModel
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from src.schema.mineral_site import (
    BasicInfo,
    DepositTypeCandidates,
    LocationInfo,
    MineralInventory,
    MineralSite,
)

# Prompt template for the extraction chains. It has two placeholders:
#   {query}               - description of the information to extract
#   {format_instructions} - presumably the output parser's formatting
#                           instructions; confirm against the chain factory
SYS_PROMPT: str = """You extract information of interest from a given mineral report in structured JSON formats. The information of interest includes {query}.

{format_instructions}"""

def extraction_chain_factory(query: str, output_schema: BaseModel):
    prompt = ChatPromptTemplate.from_template(SYS_PROMPT)
    output_parser = Py
