In [1]:
## Data ingestion
# Read the local UTF-8 text file into a list of LangChain Document objects.

from langchain_community.document_loaders import TextLoader

loader = TextLoader(file_path="content.txt", encoding="utf-8")
text_documents = loader.load()

# Last expression of the cell: echo the loaded documents as the cell output.
text_documents



[Document(metadata={'source': 'content.txt'}, page_content="A blockchain is a distributed database or ledger shared across a computer network's nodes. They are best known for their crucial role in cryptocurrency systems, maintaining a secure and decentralized record of transactions, but they are not limited to cryptocurrency uses. Blockchains can be used to make data in any industry immutable—meaning it cannot  be altered.\n\nSince a block can’t be changed, the only trust needed is at the point where a user or program enters data. This reduces the need for trusted third parties, such as auditors or other humans, who add costs and can make mistakes.\n\nYou might be familiar with spreadsheets or databases. A blockchain is somewhat similar because it is a database where information is entered and stored. The key difference between a traditional database or spreadsheet and a blockchain is how the data is structured and accessed.\n\nA blockchain consists of programs called scripts that cond

In [None]:
import os
from dotenv import load_dotenv

# Load settings (including OPENAI_API_KEY) from a local .env file into
# os.environ. The key must come from .env or the shell environment and must
# never be written into the notebook itself: a literal assignment here would
# overwrite the value just loaded and leak the secret to anyone who sees the file.
load_dotenv()

# OpenAIEmbeddings() is constructed later without an explicit key, so it relies
# on this environment variable. Fail fast with a readable message instead of
# letting a later API call fail with an authentication error.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [3]:
import os

# langchain_community logs "USER_AGENT environment variable not set" when this
# loader is used without one. Provide a default before the import below (the
# check appears to run when the loader module is first imported - confirm).
# setdefault keeps any value already supplied by the shell or by .env.
os.environ.setdefault("USER_AGENT", "blockchain-notebook")

from langchain_community.document_loaders import WebBaseLoader
import bs4

## load the article's HTML page, keeping only the selected sections
url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC7004292/"

# SoupStrainer restricts parsing to elements carrying one of these CSS classes,
# so the rest of the page is not turned into document text.
loader = WebBaseLoader(
    web_path=(url),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("front-matter", "st1", "st2", "st3")
        )
    ),
)
text_docs = loader.load()
text_docs

USER_AGENT environment variable not set, consider setting it to identify your requests.


[Document(metadata={'source': 'https://pmc.ncbi.nlm.nih.gov/articles/PMC7004292/'}, page_content='\nBlockchain Technologies: Opportunities for Solving Real-World Problems in Healthcare and Biomedical Sciences\nTaghreed Justinia\nTaghreed Justinia\n\n1College of Public Health & Health Informatics, King Saud bin Abdulaziz University for Health Sciences/ King Abdullah International Medical Research Center, Jeddah, KSA\nFind articles by Taghreed Justinia\n\n\n1\n\n\nAuthor information\nArticle notes\nCopyright and License information\n\n\n\n\n1College of Public Health & Health Informatics, King Saud bin Abdulaziz University for Health Sciences/ King Abdullah International Medical Research Center, Jeddah, KSA\n\n✉Corresponding author: Taghreed Justinia, PhD, Regional Director IT Services, Technology & Health Informatics, Joint-Appointment Assistant Professor Department of Health Informatics, King Saud bin Abdulaziz University for Health Sciences, PO. Box 9515, Jeddah, 21423, Phone: +966 12 

In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Parse the PDF and keep the result in a named variable, as the text and web
# loaders do, so later cells can reuse it without re-reading the file or
# relying on Out[N]. The recorded output carries per-page metadata
# ('page', 'total_pages').
loader = PyPDFLoader("blockchain.pdf")
pdf_docs = loader.load()

# Still display the parsed pages as the cell output.
pdf_docs

[Document(metadata={'producer': 'iTextSharp 4.0.7 (based on iText 2.0.7)', 'creator': 'PyPDF', 'creationdate': '2017-06-08T12:30:26-07:00', 'moddate': '2017-06-08T12:30:26-07:00', 'title': 'An Overview of Blockchain Technology: Architecture, Consensus, and Future Trends', 'author': 'Zibin Zheng, Shaoan Xie, Hongning Dai, Xiangping Chen, Huaimin Wang', 'keywords': 'Blockchain, decentralization, consensus, scalability', 'subject': '2017 IEEE 6th International Congress on Big Data', 'rgid': 'PB:318131748_AS:546346524725248@1507270570125', 'source': 'blockchain.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/318131748\nAn Overview of Blockchain Technology: Architecture, Consensus, and Future\nTrends\nConference Paper · June 2017\nDOI: 10.1109/BigDataCongress.2017.85\nCITATIONS\n5,036\nREADS\n419,678\n5 authors, including:\nZibin Zheng\nSun Yat-sen University\n6

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: chunk_size=1000 with chunk_overlap=100, measured by the
# splitter's default length function.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)

# Only the documents loaded from content.txt (text_documents) are split here.
documents = text_splitter.split_documents(text_documents)

# Preview the first five chunks.
documents[:5]

In [6]:
## vector embeddings - vector store
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Embed only the first 20 chunks with OpenAI embeddings and index them in a
# Chroma vector store; the remaining chunks are not indexed.
db = Chroma.from_documents(
    documents[:20],
    embedding=OpenAIEmbeddings(),
)

In [None]:
## vector search
# Ask the Chroma store for the indexed chunks most similar to the question.
query = "What is block chain?"
result = db.similarity_search(query)

# Show the matching chunks as the cell output.
result