## Ingesting PDF

In [3]:
# %pip install onnx==1.16.1

In [5]:
# %pip install --q unstructured langchain
# %pip install --q "unstructured[all-docs]"

In [7]:
from langchain_community.document_loaders import UnstructuredPDFLoader
# from langchain_community.document_loaders import OnlinePDFLoader

In [9]:
# Path of the PDF to ingest, relative to the notebook's working directory.
local_path = "barilla.pdf"

# Build the loader for that file, then read it into `data`.
loader = UnstructuredPDFLoader(
    file_path=local_path,
)
data = loader.load()

In [21]:
# Sanity check: display the first 1000 characters of the first loaded
# document's text (data[0].page_content).
data[0].page_content[:1000]

'LA GIOIA DEL CIBO PER UNA VITA MIGLIORE\n\nRAPPORTO DI SOSTENIBILITÀ 2022\n\nIl rapporto di sostenibilità rappresenta lo strumento per condividere il percorso intrapreso da Barilla per un presente e un futuro migliore.\n\nInclude il dettaglio dei risultati raggiunti e il percorso definito per gli anni a venire, in linea con l’Agenda 2030 e gli Obiettivi di Sviluppo Sostenibile delle Nazioni Unite.\n\nI risultati presenti in questo rapporto sono stati raggiunti grazie alle Persone Barilla, alla loro instancabile passione e dedizione. Le informazioni e i dati riportati sono relativi al Gruppo Barilla e si riferiscono al periodo 1 gennaio – 31 dicembre 2022, salvo diversamente indicato.\n\nLA GIOIA DEL CIBO PER UNA VITA MIGLIORE\n\nRIUNIRE LE PERSONE ATTORNO ALLA GIOIA DEL BUON CIBO E RENDERE LA QUALITÀ LA SCELTA PER UNA VITA MIGLIORE, DAL SINGOLO AL PIANETA.\n\nÈ COSÌ CHE NUTRIAMO IL FUTURO, OGGI.\n\nR A P P O RTO D I S O S T E N I B I L I TÀ 2022\n\nINTRO\n\nLA GIOIA DEL CIBO PER UNA V

## Vector Embeddings

In [45]:
# !ollama pull nomic-embed-text
# !ollama pull mistral

In [47]:
# List the models available in the local Ollama install; the cells below
# rely on "nomic-embed-text" (embeddings) and "phi3" (chat) being present.
!ollama list

NAME                       ID              SIZE      MODIFIED     
llama3.2:latest            a80c4f17acd5    2.0 GB    2 months ago    
phi3:latest                4f2222927938    2.2 GB    2 months ago    
nomic-embed-text:latest    0a109f422b47    274 MB    2 months ago    
mistral:latest             f974a74358d6    4.1 GB    2 months ago    


In [49]:
# %pip install --q chromadb
# %pip install --q langchain-text-splitters

In [51]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [53]:
# Break the loaded document(s) into overlapping character chunks:
# at most 7500 characters per chunk, 100 characters shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=7500,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(data)

In [55]:
# Embed every chunk with the local "nomic-embed-text" Ollama model and index
# the vectors in a Chroma collection named "local-rag".
vector_db = Chroma.from_documents(
    collection_name="local-rag",
    embedding=OllamaEmbeddings(
        model="nomic-embed-text",
        show_progress=True,
    ),
    documents=chunks,
)

OllamaEmbeddings: 100%|█████████████████████████| 47/47 [00:25<00:00,  1.85it/s]


In [32]:
# SAVE
# Build a Chroma collection that is written to disk under PERSIST_DIRECTORY.
#
# The previous version passed the in-memory `vector_db` store as
# `embedding_function`; that argument must be an Embeddings object, and the
# collection was never given any documents, so nothing was saved.
# Note: this re-embeds `chunks`, since the persisted collection is built
# independently of the in-memory "local-rag" collection.
PERSIST_DIRECTORY = './'

vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=True),
    collection_name="my_collection",
    persist_directory=PERSIST_DIRECTORY,
)

In [30]:
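# LOAD
# Reopen the persisted collection. The same embedding model used when saving
# must be supplied so that queries are embedded consistently.
# PERSIST_DIRECTORY = './'
# vector_db = Chroma(
#     collection_name="my_collection",
#     embedding_function=OllamaEmbeddings(model="nomic-embed-text"),
#     persist_directory=PERSIST_DIRECTORY,
# )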

## Retrieval

In [57]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [59]:
# LLM from Ollama
local_model = "phi3"
llm = ChatOllama(model=local_model)

In [61]:
# Prompt that asks the LLM to rephrase the user's question five ways; it is
# handed to MultiQueryRetriever so each variant is used for retrieval.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

In [63]:
# RAG prompt: the model must answer using only the retrieved context.
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Multi-query retriever: the LLM rewrites the question via QUERY_PROMPT and
# each rewrite is run against the Chroma-backed retriever.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [65]:
# RAG pipeline: the incoming question is sent to the retriever (filling
# "context") and passed through unchanged (filling "question"); the filled
# prompt goes to the LLM and the reply is parsed to a plain string.
chain = (
    {
        "context": retriever,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [67]:
# Ask a fixed question so the notebook runs end-to-end under
# "Restart Kernel -> Run All" instead of blocking on interactive input().
# Edit `question` to query the document about something else.
question = "Summarise this document in 20 words or less."
chain.invoke(question)

 Summarise this document in 20 words or less.


OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00,  4.80it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 44.75it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 39.11it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 41.71it/s]
OllamaEmbeddings: 100%|███████████████████████████| 1/1 [00:00<00:00, 44.10it/s]


"Barilla's GRI report details sustainability objectives for packaging, waste management and climate impact reduction goals by 2030; full methodologies pending refinement. (19 words)\n"

In [23]:
# Delete all collections in the db
# vector_db.delete_collection()