## Below are the imports needed

In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader

import config


## Load documents
Load the documents to run question answering over. If you want to do this over your own documents, this is the section you should replace.

In [2]:
# Load the PDF that the question-answering section works on.
# The path is relative; point it at a different file to query another document.
loader = PyPDFLoader("challenges.pdf")
documents = loader.load()

## Split documents

Split documents into small chunks. This is so we can find the most relevant chunks for a query and pass only those into the LLM.

In [3]:
# Chunking parameters: a size limit of 1000 with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

# Break the loaded documents into the chunks that get embedded and retrieved later.
texts = text_splitter.split_documents(documents)


In [4]:
# Sanity check: how many chunks the splitter produced.
print(len(texts))

46


## Initialize ChromaDB

Create embeddings for each chunk and insert into the Chroma vector database.

In [5]:
# Read the OpenAI API key from the local `config` module; the cells below pass it
# explicitly to every OpenAI-backed object.
openai_api_key = config.api_key

In [6]:
# Embedding client used to vectorise every chunk.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Build the Chroma index exactly once. Calling `Chroma.from_documents` a second
# time with the same inputs would embed every chunk again and create a second,
# redundant store.
vectordb = Chroma.from_documents(texts, embeddings)

# `docsearch` is the name the retrieval chain uses; it is the same store as `vectordb`.
docsearch = vectordb


Using embedded DuckDB without persistence: data will be transient
Using embedded DuckDB without persistence: data will be transient


## Create the chain

Initialize the chain we will use for question answering.

In [7]:
# Question-answering chain: a "stuff" chain over an OpenAI LLM, fed by the
# retriever obtained from the Chroma store built above.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(openai_api_key=openai_api_key),
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)


## Ask questions

In [9]:
# Ask a question through the QA chain. The bare `qa.run(query)` expression is
# the last statement, so its return value is shown as the cell output.
query = "What is the authors key point with this article?"
qa.run(query)

' The key point of this article is that companies should consider a single, integrated platform to achieve the highest business value for their data integration projects, rather than relying on a series of disparate tools.'

Summarization condenses multiple longer documents into a shorter summary. This can be useful for distilling long documents into their core pieces of information.

The recommended way to get started using a summarization chain is:

In [10]:
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate

# LLM used by the summarization chain, configured with the same API key as the QA section.
# No splitter is created here: the summarization cell constructs the
# `text_splitter` it actually uses, so an extra assignment would be dead code.
llm = OpenAI(openai_api_key=openai_api_key)

In [11]:
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document

# Load the PDF to summarize.
# NOTE: this rebinds `loader`, `documents`, `text_splitter` and `texts` from the
# question-answering section above.
loader = PyPDFLoader("Mode-2021-Modern-Data-Architecture.pdf")
documents = loader.load()

# Same chunking parameters as the QA section: size limit 1000, no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(documents)

# Map-reduce summarization chain over the shared `llm`.
chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
)

# Run over every chunk; the returned summary is displayed as the cell output.
chain.run(texts)

' This article discusses how to upgrade data architecture to make faster and smarter business decisions. It provides advice and guidance on leveraging modern data architectures, modular stacks, and pre-built connectors. It also covers important aspects such as global governance, vendor lock-in, and scalability, as well as discussing company-specific examples of successful data stack upgrades. It explains how to create a data-driven culture by introducing self-serve tools, promoting data innovation, and fostering collaboration between stakeholders. Finally, it introduces Mode and Sisu as advanced analytics solutions.'