In [6]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/mock/Bongará_Zn_3-2019_mock.txt")
doc = loader.load()
doc

[Document(page_content='# TECHNICAL REPORT\n# ON THE\n# BONGARÁ ZINC PROJECT\n# YAMBRASBAMBA DISTRICT, AMAZONAS REGION,\n# NORTHERN PERU\n# FOR ZINC ONE RESOURCES INC.\nprepared by:\n\nAlbert W. (Al) Workman, P.Geo.\n\nSenior Geologist, and Vice-President, Operations\n\nand\n\nJohn Reddick, P.Geo.\n\nSenior Associate Resource Geologist\n\nEffective Date: 11 March, 2019 Toronto, Canada\n\n# 1.  SUMMARY\nOn 26 May 2017, Forrester Metals Inc. (“Forrester”) of Toronto, Ontario Canada accepted the proposal of Watts, Griffis and McOuat Limited ("WGM") to prepare a technical report that is compliant with Canada’s security rule National Instrument 43-101 (“NI 43-101”) for the Bongará zinc-oxide project (the “Project”). On June 1, 2017, Zinc One Resources Inc. (“Zinc One”) of Vancouver, Canada acquired all of the issued and outstanding shares of Forrester Metals Inc. As a result, this report has been prepared for Zinc One.\n\nThe area of interest is located in the Yambrasbamba District of Bonga

## Chunking

In [7]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

In [12]:
texts = text_splitter.create_documents([doc[0].page_content])
print(type(texts[0]))
print(texts[0].page_content[:1000])

Created a chunk of size 1052, which is longer than the specified 1000
Created a chunk of size 2128, which is longer than the specified 1000
Created a chunk of size 1613, which is longer than the specified 1000
Created a chunk of size 1319, which is longer than the specified 1000
Created a chunk of size 1643, which is longer than the specified 1000
Created a chunk of size 1529, which is longer than the specified 1000
Created a chunk of size 1027, which is longer than the specified 1000
Created a chunk of size 1077, which is longer than the specified 1000
Created a chunk of size 1328, which is longer than the specified 1000
Created a chunk of size 2357, which is longer than the specified 1000


<class 'langchain_core.documents.base.Document'>
# TECHNICAL REPORT
# ON THE
# BONGARÁ ZINC PROJECT
# YAMBRASBAMBA DISTRICT, AMAZONAS REGION,
# NORTHERN PERU
# FOR ZINC ONE RESOURCES INC.
prepared by:

Albert W. (Al) Workman, P.Geo.

Senior Geologist, and Vice-President, Operations

and

John Reddick, P.Geo.

Senior Associate Resource Geologist

Effective Date: 11 March, 2019 Toronto, Canada

# 1.  SUMMARY
On 26 May 2017, Forrester Metals Inc. (“Forrester”) of Toronto, Ontario Canada accepted the proposal of Watts, Griffis and McOuat Limited ("WGM") to prepare a technical report that is compliant with Canada’s security rule National Instrument 43-101 (“NI 43-101”) for the Bongará zinc-oxide project (the “Project”). On June 1, 2017, Zinc One Resources Inc. (“Zinc One”) of Vancouver, Canada acquired all of the issued and outstanding shares of Forrester Metals Inc. As a result, this report has been prepared for Zinc One.


## Embeddings

In [14]:
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os

load_dotenv()

embeddings_model = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
embeddings = embeddings_model.embed_documents([text.page_content for text in texts])

len(embeddings), len(embeddings[0])


(38, 1536)

In [17]:
basic_info_query: str = """the mineral site's name"""

location_info_query: str = (
    """the mineral site's location, coordinate reference system used, the country and state or province where the mineral site is located in"""
)

mineral_inventory_query: str = (
    """the identified mineral resources or reserves in each mineral zone including information like the mineral commodity type (e.g. indicated, inferred), ore unit and tonnage, grade, cutoff grade, date of the last assessment, and mineral zone"""
)

deposit_type_query: str = """the mineral site's deposit type(s)"""

embedded_query = embeddings_model.embed_query(mineral_inventory_query)
embedded_query[:10]


[0.0021897615937167573,
 -0.011089922437218408,
 -0.0059600032803637576,
 -0.023547823688793768,
 -0.013759473038028347,
 0.016256364807494407,
 -0.01896575925251768,
 -0.0543738255983642,
 -0.014662604519702771,
 -0.02980333703261077]

## Vector Store

In [30]:
from langchain_community.vectorstores import Chroma
db = Chroma.from_documents(texts, OpenAIEmbeddings())

In [34]:
docs = db.similarity_search(mineral_inventory_query)
print(doc[0].page_content)

IndexError: list index out of range

# Testing minmod code

In [15]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

with open("../data/asset/parsed_result_mock/Bongará_Zn_3-2019_mock.txt", "r") as f:
    doc = f.read()

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)
md_header_splits = md_splitter.split_text(doc)
md_header_splits

[Document(page_content='prepared by:  \nAlbert W. (Al) Workman, P.Geo.  \nSenior Geologist, and Vice-President, Operations  \nand  \nJohn Reddick, P.Geo.  \nSenior Associate Resource Geologist  \nEffective Date: 11 March, 2019 Toronto, Canada', metadata={'Header 1': 'FOR ZINC ONE RESOURCES INC.'}),
 Document(page_content='On 26 May 2017, Forrester Metals Inc. (“Forrester”) of Toronto, Ontario Canada accepted the proposal of Watts, Griffis and McOuat Limited ("WGM") to prepare a technical report that is compliant with Canada’s security rule National Instrument 43-101 (“NI 43-101”) for the Bongará zinc-oxide project (the “Project”). On June 1, 2017, Zinc One Resources Inc. (“Zinc One”) of Vancouver, Canada acquired all of the issued and outstanding shares of Forrester Metals Inc. As a result, this report has been prepared for Zinc One.  \nThe area of interest is located in the Yambrasbamba District of Bongará Province within the Amazonas Department in north-central Peru. The Project is l

In [16]:
chunk_size = 1000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

text_splits = text_splitter.split_documents(md_header_splits)
text_splits


[Document(page_content='prepared by:  \nAlbert W. (Al) Workman, P.Geo.  \nSenior Geologist, and Vice-President, Operations  \nand  \nJohn Reddick, P.Geo.  \nSenior Associate Resource Geologist  \nEffective Date: 11 March, 2019 Toronto, Canada', metadata={'Header 1': 'FOR ZINC ONE RESOURCES INC.'}),
 Document(page_content='On 26 May 2017, Forrester Metals Inc. (“Forrester”) of Toronto, Ontario Canada accepted the proposal of Watts, Griffis and McOuat Limited ("WGM") to prepare a technical report that is compliant with Canada’s security rule National Instrument 43-101 (“NI 43-101”) for the Bongará zinc-oxide project (the “Project”). On June 1, 2017, Zinc One Resources Inc. (“Zinc One”) of Vancouver, Canada acquired all of the issued and outstanding shares of Forrester Metals Inc. As a result, this report has been prepared for Zinc One.', metadata={'Header 1': '1.  SUMMARY'}),
 Document(page_content='The area of interest is located in the Yambrasbamba District of Bongará Province within t

In [17]:
db = Chroma.from_documents(text_splits, OpenAIEmbeddings())

In [18]:
basic_info_query: str = """the mineral site's name"""

location_info_query: str = (
    """the mineral site's location, coordinate reference system used, the country and state or province where the mineral site is located in"""
)

mineral_inventory_query: str = (
    """the identified mineral resources or reserves in each mineral zone including information like the mineral commodity type (e.g. indicated, inferred), ore unit and tonnage, grade, cutoff grade, date of the last assessment, and mineral zone"""
)

deposit_type_query: str = """the mineral site's deposit type(s)"""

docs = db.similarity_search(mineral_inventory_query)
docs

[Document(page_content='- The Mineral Resources were estimated using the Canadian Institute of Mining, Metallurgy and Petroleum (“CIM”) Standards on Mineral Resources and Reserves, Definitions and Guidelines prepared by the CIM Standing Committee on Reserve Definitions and adopted by CIM Council May 10, 2014; and mandated in National Instrument 43-101 (“NI 43-101”).\n- Figures may not total due to rounding.\nThe mineralized trend on which the zinc-bearing zones are located represents an excellent target for additional drilling. In WGM’s opinion, in-fill drilling with the objective of upgrading the Inferred Mineral Resources to Indicated Mineral Resources is warranted. Zinc grade variability is sufficient however, that not all Inferred Resources will be up-graded.  With continuing exploration drilling in the Mina Chica area, some enlargement of the deposit is possible. Exploration drilling along the trend between Mina Chica and Campo Ceilo and', metadata={'Header 1': '1.  SUMMARY'}),
 D

In [19]:
db.similarity_search(basic_info_query)

[Document(page_content='During the site visit, Mr. Workman examined the Mina Grandé Centro and Norté workings (Day 1), the Mine Chica and Bongarita prospects (Day 2), and the Mina Grandé Sur area (Day 3). The Mina Grandé Centro and Norté workings and Mina Chica were outlined in detail by GPS track at the time they were examined and check sampled by WGM. All track and sample location (waypoint) data were collected using the PSAD-56 datum and imported into Google Earth software. All sites reported an estimated precision error of less than 3 metres. The fourth day was spent at the drying facility near the village of Yambrasbamba in a failed effort to examine Bongará drill core prior to beginning the return trip to Tarapoto (most of the core was lost and remaining core was unusable. A total of 16 samples were collected of mineralization in outcrop for multi-element analysis by ICP, quantitative ore-grade analysis for zinc, lead and silver and a determination of major elements (oxides) by f

In [20]:
db.similarity_search(location_info_query)

[Document(page_content='collected of mineralization in outcrop for multi-element analysis by ICP, quantitative ore-grade analysis for zinc, lead and silver and a determination of major elements (oxides) by fusion XRF. Water content data and specific gravity determinations were requested on all samples. The samples were delivered by WGM to the SGS laboratory in Lakefield, Ontario. WGM’s observations and the results from these samples confirmed the type of mineralization present, its geological setting and the general grade of the mineralization.', metadata={'Header 1': '2.  INTRODUCTION', 'Header 2': '2.1 GENERAL'}),
 Document(page_content='collected of mineralization in outcrop for multi-element analysis by ICP, quantitative ore-grade analysis for zinc, lead and silver and a determination of major elements (oxides) by fusion XRF. Water content data and specific gravity determinations were requested on all samples. The samples were delivered by WGM to the SGS laboratory in Lakefield, On

In [21]:
db.similarity_search(deposit_type_query)

[Document(page_content='Although sulphide mineralization is absent in the Bongará area due to weathering and oxidation effects, zinc-lead-silver mineralization is thought to conform to the general provisions of the Mississippi Valley-Type (“MVT”) model for stratabound, carbonate-hosted deposits. These epigenetic deposits occur in all regions of the earth, and in many areas they occur in clusters that define significant mineralized districts. Hypogene sulphide mineralization typically occurs in platform sequence rocks associated with extensional basins. Major MTV districts include the Silesia of eastern Europe, the Viburnum Trend in SE Missouri, the Pine Point area in Canada’s Northwest Territories and the Tri-State area in the United States. According to the US Geological Survey, the median deposit size of 7.0 Mt grading 6% zinc, 1.9% lead, 0.23% copper and 32.5 g silver per tonne. The main ore minerals comprising these deposits are sphalerite and galena, and they are associated with p