Imports

In [1]:
import chromadb
import requests
from bs4 import BeautifulSoup

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from langchain.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM


Function to proccess the data from the web

In [2]:
def fetch_web_data(url):
    response = requests.get(url)
    
    # Using BeautifulSoup to analise the HTML response
    soup = BeautifulSoup(response.content, 'html.parser')
    
    text = soup.get_text()
    
    # Divide the text by paragraphs
    parrafos = text.split('\n')
    
    # Remove spaces at the start and end of paragraphs and remove empty paragraphs
    textos = [parrafo.strip() for parrafo in parrafos if parrafo.strip()]
    
    return textos

Select model for embeddings

In [3]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


Create the vector store with Chroma

In [4]:
vector_store = Chroma.from_texts(
    texts = fetch_web_data('https://en.wikipedia.org/wiki/Spain'),
    collection_name = "info_about_Spain",
    embedding = embeddings,
    persist_directory = "./chroma_info_about_Spain",
)

Define retriever

In [25]:
retriever = vector_store.as_retriever(
    search_type="similarity",  search_kwargs={"k": 5} #number of results
)

(TEST) Retrieve data in the vector store

In [26]:
retriever.invoke('Where is Spain geographically?')

[Document(metadata={}, page_content="Spain,[f] officially the Kingdom of Spain,[a][g] is a country in Southwestern Europe with territories in North Africa.[11][h] Featuring the southernmost point of continental Europe, it is the largest country in Southern Europe and the fourth-most populous European Union member state. Spanning across the majority of the Iberian Peninsula, its territory also includes the Canary Islands, in the Eastern Atlantic Ocean, the Balearic Islands, in the Western Mediterranean Sea, and the autonomous cities of Ceuta and Melilla, in Africa. Peninsular Spain is bordered to the north by France, Andorra, and the Bay of Biscay; to the east and south by the Mediterranean Sea and Gibraltar; and to the west by Portugal and the Atlantic Ocean. Spain's capital and largest city is Madrid, and other major urban areas include Barcelona, Valencia, Seville, Zaragoza, Málaga, Murcia and Palma de Mallorca."),
 Document(metadata={}, page_content="Spain,[f] officially the Kingdom

RAG Chain

In [31]:
# Prompt
template = """Answer the question based mostly on the following context:
{context}
If you can't respond with the given context, avoid responding.

Question: {question}

Answer: always in english. Be concise.
"""
prompt = ChatPromptTemplate.from_template(template)

model = OllamaLLM(model="tinyllama", base_url='http://localhost:11434')

# Define the chain
chain = prompt | model

In [33]:
chain.invoke({"context": retriever, "question": "Who's the current king of Spain as of 2024?"})

'Human: According to various sources and news outlets, the current King of Spain is Felipe VI.'