In [1]:
from langchain_community.document_loaders import WikipediaLoader
from pprint import pprint

docs = WikipediaLoader(query="Philippines", load_max_docs=10).load()



  lis = BeautifulSoup(html).find_all('li')


In [2]:
docs[0].page_content

'The Philippines, officially the Republic of the Philippines, is an archipelagic country in Southeast Asia. In the western Pacific Ocean, it consists of 7,641 islands, with a total area of roughly 300,000 square kilometers, which are broadly categorized in three main geographical divisions from north to south: Luzon, Visayas, and Mindanao. The Philippines is bounded by the South China Sea to the west, the Philippine Sea to the east, and the Celebes Sea to the south. It shares maritime borders with Taiwan to the north, Japan to the northeast, Palau to the east and southeast, Indonesia to the south, Malaysia to the southwest, Vietnam to the west, and China to the northwest. It is the world\'s twelfth-most-populous country, with diverse ethnicities and cultures. Manila is the country\'s capital, and its most populated city is Quezon City. Both are within Metro Manila.\nNegritos, the archipelago\'s earliest inhabitants, were followed by waves of Austronesian peoples. The adoption of animis

In [3]:
from llama_index.core import Document
from llama_index.core.node_parser import TokenTextSplitter

documents = [Document(text=t.page_content) for t in docs]
splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=20,
    separator=" ",
)
nodes = splitter.get_nodes_from_documents(documents)

In [4]:
def generate_embeddings(nodes, client, model):
    # Generate embeddings for documents using Ollama
    for doc in nodes:
        response = client.embeddings(prompt=doc.text, model=model)
        doc.embedding = response["embedding"]
    return nodes

In [5]:
from ollama import Client
import numpy as np

client = Client(
  host='http://localhost:11434',
)

# Dense Embeddings

In [6]:
nodes_embed = generate_embeddings(nodes, client, "mxbai-embed-large")

In [7]:
nodes_embed[0].embedding

[-0.5891844034194946,
 0.3102233111858368,
 0.0727478563785553,
 -0.03656458109617233,
 0.33659636974334717,
 -0.38290661573410034,
 -0.4496491253376007,
 -0.27139002084732056,
 0.5983230471611023,
 0.1258460283279419,
 0.5757721066474915,
 -0.39552223682403564,
 -0.9689514636993408,
 -0.09035152196884155,
 -1.022643804550171,
 -1.2376970052719116,
 -0.702851414680481,
 0.019742459058761597,
 -0.20672331750392914,
 -0.15566599369049072,
 -0.3715038299560547,
 -0.0318799763917923,
 -1.2580573558807373,
 -0.6751013994216919,
 0.3595142364501953,
 1.3346638679504395,
 0.9218445420265198,
 0.5368465185165405,
 0.7146484851837158,
 0.6321157813072205,
 -0.2696060836315155,
 0.048550039529800415,
 -0.1202419102191925,
 -0.8502581715583801,
 0.4238317310810089,
 0.6534920930862427,
 0.5739920735359192,
 -0.6135654449462891,
 0.23043470084667206,
 -0.49844592809677124,
 0.27546554803848267,
 -1.3400096893310547,
 0.7578151226043701,
 0.7222528457641602,
 -0.08286533504724503,
 -0.1496649533510

In [8]:
import numpy as np
import faiss
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import TextNode, NodeWithScore

class FAISSVectorStoreRetriever(BaseRetriever):
    def __init__(self, faiss_index, documents, embeddings):
        """
        Initialize the FAISS retriever.
        :param faiss_index: The FAISS index containing precomputed embeddings.
        :param documents: List of document chunks.
        :param embeddings: Precomputed embeddings corresponding to the document chunks.
        """
        self.faiss_index = faiss_index
        self.documents = documents
        self.embeddings = embeddings
    
    def _retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the top-k nearest neighbors using the FAISS index.
        :param query_embedding: The embedding of the query.
        :param top_k: Number of top results to retrieve.
        """
        distances, indices = self.faiss_index.search(query_embedding, top_k)
        retrieved_docs = [
            NodeWithScore(node=self.documents[idx], score=1 - dist)
            for idx, dist in zip(indices[0], distances[0])
            if idx != -1
        ]
        return retrieved_docs

In [9]:
embeddings = np.array([np.array(node.embedding) for node in nodes])
normalized_embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

In [10]:
normalized_embeddings

array([[-0.03334666,  0.01755802,  0.00411738, ..., -0.02551431,
        -0.00793307,  0.01246072],
       [-0.03658021,  0.00447482,  0.0013547 , ..., -0.02792677,
         0.00736049,  0.00222   ],
       [-0.0237645 ,  0.00843221, -0.00544313, ..., -0.00790275,
        -0.00217547, -0.02226688],
       ...,
       [-0.01706763,  0.05241317,  0.05986589, ..., -0.00799051,
         0.00219205, -0.02886305],
       [-0.00894084,  0.01840906,  0.04863895, ..., -0.05129886,
         0.00118929,  0.00528911],
       [ 0.00473639,  0.02983571,  0.02153322, ..., -0.05732238,
        -0.01283234,  0.00747058]])

In [11]:
embedding_dim = normalized_embeddings[0].shape[0]
faiss_index = faiss.IndexFlatIP(embedding_dim)  # Inner-product similarity
faiss_index.add(normalized_embeddings)

In [12]:
retriever = FAISSVectorStoreRetriever(faiss_index, nodes, embeddings)

In [13]:
query = "Who is Magellan"
response = client.embeddings(prompt=query, model="mxbai-embed-large")
query_embedding = np.array([response["embedding"]])
query_embedding /= np.linalg.norm(query_embedding, axis=1, keepdims=True)

In [14]:
query_embedding

array([[ 0.03457201, -0.0085359 , -0.00864213, ..., -0.04921663,
        -0.00600831, -0.00288387]])

In [15]:
top_k_docs = retriever._retrieve(query_embedding, top_k=5)

In [16]:
# Output retrieved documents
for doc in top_k_docs:
    print(f"Retrieved Document: {doc.node.text}, Score: {doc.score}\n")

Retrieved Document: The history of the Philippines from 1565 to 1898 is known as the Spanish colonial period, during which the Philippine Islands were ruled as the Captaincy General of the Philippines within the Spanish East Indies, initially under the Viceroyalty of New Spain, based in Mexico City, until the independence of the Mexican Empire from Spain in 1821. This resulted in direct Spanish control during a period of governmental instability there.
The first documented European contact with the Philippines was made in 1521 by Ferdinand Magellan in his circumnavigation expedition, during which he was killed in the Battle of Mactan. Forty-four years later, a Spanish expedition led by Miguel López de Legazpi left modern Mexico and began the Spanish conquest of the Philippines in the late 16th century. Legazpi's expedition arrived in the Philippines in 1565, a year after an earnest intent to colonize the country, which was during the reign of Philip II of Spain, whose name has remained

In [17]:
def summarize_each_chunk(nodes, client, model, query):
    chunks = [doc.node.text for doc in nodes]
    summaries = []
    
    for i, chunk in enumerate(chunks):
        prompt = f"""
        Based on the query: "{query}", summarize the following text in at most one paragraph. 
        Preserve key points that are relevant to the query and remove redundant or unrelated information.
        And lastly do not answer the query itself, just focus on summarization.
        Do not add new information, focus only on the text provided.
        
        Text:
        {chunk}
        
        Summary (relevant to the query):
        """
        
        response = client.generate(model=model, prompt=prompt)
        
        summary = response['response'].strip()
        summaries.append(summary)
        print(f"Chunk {i+1} Summary (Relevant to Query):\n{summary}\n")

    return summaries

In [18]:
summaries = summarize_each_chunk(top_k_docs, client, model='llama3.2:1b', query=query)

Chunk 1 Summary (Relevant to Query):
Here is a summary of the text in one paragraph that focuses on preserving key points relevant to "Who is Magellan":

The history of the Philippines from 1565 to 1898 was under Spanish colonial rule, initially as the Captaincy General of the Philippines. Ferdinand Magellan, a Portuguese navigator and explorer, led a Spanish expedition to circumnavigate the globe in 1521, during which he was killed by warriors at Mactan. This event sparked an earnest intent to colonize the country, but it took 44 years for the first documented European contact with the Philippines. The Spanish colonial period ended with Spain's defeat in the Spanish-American War and Treaty of Paris on December 10, 1898.

Chunk 2 Summary (Relevant to Query):
Here is a summary of the text in one paragraph, focusing on relevant points and removing redundant information:

The Philippines is an archipelagic country located in Southeast Asia with 7,641 islands, spanning roughly 300,000 squa

In [19]:
summaries

['Here is a summary of the text in one paragraph that focuses on preserving key points relevant to "Who is Magellan":\n\nThe history of the Philippines from 1565 to 1898 was under Spanish colonial rule, initially as the Captaincy General of the Philippines. Ferdinand Magellan, a Portuguese navigator and explorer, led a Spanish expedition to circumnavigate the globe in 1521, during which he was killed by warriors at Mactan. This event sparked an earnest intent to colonize the country, but it took 44 years for the first documented European contact with the Philippines. The Spanish colonial period ended with Spain\'s defeat in the Spanish-American War and Treaty of Paris on December 10, 1898.',
 "Here is a summary of the text in one paragraph, focusing on relevant points and removing redundant information:\n\nThe Philippines is an archipelagic country located in Southeast Asia with 7,641 islands, spanning roughly 300,000 square kilometers. It shares maritime borders with various countries

In [20]:
def generate_response_with_notice(summaries, query, client, model="llama3.2"):
    # Combine summaries into context block
    context = "\n".join(summaries)
    
    # Create prompt to answer based on summarized text
    prompt = f"""
    Use the following summarized information to answer the query accurately and concisely. 
    DO NOT USE BACKGROUND KNOWLEDGE OUTSIDE THE CONTEXT PROVIDED.
    If the information is not sufficient to fully address the query, respond ONLY with:
    "The available information is insufficient to provide a complete answer to this query."

    Summarized Context:
    {context}
    
    Query:
    {query}
    
    Response:
    """
    
    # Send the prompt to Ollama
    response = client.generate(
        model=model,
        prompt=prompt
    )
    
    return response['response'].strip()

In [21]:
generate_response_with_notice(summaries, query, client)

'Ferdinand Magellan was a Portuguese navigator and explorer who led a Spanish expedition to circumnavigate the globe in 1521.'

# Parse Embeddings

In [22]:
from llama_index.retrievers.bm25 import BM25Retriever
import Stemmer

In [23]:
bm25_retriever = BM25Retriever.from_defaults(
    nodes=nodes,
    similarity_top_k=5,
    stemmer=Stemmer.Stemmer("english"),
    language="english",
)

In [24]:
retrieved_nodes = bm25_retriever.retrieve(query)

In [25]:
retrieved_nodes

[NodeWithScore(node=TextNode(id_='7f46abff-6a96-4bdf-9550-b5a752d700f2', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='47ff8b58-24a8-4e53-a924-dbdec56426d4', node_type='4', metadata={}, hash='66f218248bc2b8ec4e1f2be178be4a52b83e6c5ee0fe8adb8bc4a415d3cae143'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='5bba35a1-86fa-4f30-83f0-3ff51ff68f6e', node_type='1', metadata={}, hash='fb83765c7b707231f29134931ee3f861f67f1b8126c0282a12d1ddad43235d62')}, metadata_template='{key}: {value}', metadata_separator='\n', text='The history of the Philippines from 1565 to 1898 is known as the Spanish colonial period, during which the Philippine Islands were ruled as the Captaincy General of the Philippines within the Spanish East Indies, initially under the Viceroyalty of New Spain, based in Mexico City, until the independence of the Mexican Empire from Spain in 1821. This result

In [26]:
summaries_2 = summarize_each_chunk(retrieved_nodes, client, model='llama3.2:1b', query=query)

Chunk 1 Summary (Relevant to Query):
Here's a summary of the text in one paragraph:

The history of the Philippines from 1565 to 1898 is known as the Spanish colonial period, during which Spain ruled the islands under the Viceroyalty of New Spain. Ferdinand Magellan was the first European to make contact with the Philippines in 1521 but was killed in a battle at Mactan. A subsequent expedition led by Miguel López de Legazpi arrived later and began the Spanish conquest of the Philippines, which ended with the defeat of Spain by the United States in the Spanish-American War and the Treaty of Paris on December 10, 1898, marking the beginning of American colonial rule.

Chunk 2 Summary (Relevant to Query):
The text describes Magellan's career, specifically his election as president and subsequent tenure, which spanned multiple terms with varying durations. The key point is that Ferdinand Marcos held the office for 20 years and 57 days, while Sergio Osmeña was the shortest serving president

In [27]:
generate_response_with_notice(summaries_2, query, client)

'Ferdinand Magellan was the first European to make contact with the Philippines in 1521. He led an expedition that began later, under Miguel López de Legazpi, who eventually conquered the islands and established Spanish rule. However, there is no mention of Magellan holding a presidency or any elected office beyond his leadership of this expedition.'

# Hybrid

In [28]:
results = {'faiss': top_k_docs, 'bm25':retrieved_nodes}

In [29]:
from llama_index.core.retrievers import QueryFusionRetriever
x = QueryFusionRetriever
ranked_results = QueryFusionRetriever._reciprocal_rerank_fusion(x, results)

In [30]:
print(ranked_results[0].text)

The history of the Philippines from 1565 to 1898 is known as the Spanish colonial period, during which the Philippine Islands were ruled as the Captaincy General of the Philippines within the Spanish East Indies, initially under the Viceroyalty of New Spain, based in Mexico City, until the independence of the Mexican Empire from Spain in 1821. This resulted in direct Spanish control during a period of governmental instability there.
The first documented European contact with the Philippines was made in 1521 by Ferdinand Magellan in his circumnavigation expedition, during which he was killed in the Battle of Mactan. Forty-four years later, a Spanish expedition led by Miguel López de Legazpi left modern Mexico and began the Spanish conquest of the Philippines in the late 16th century. Legazpi's expedition arrived in the Philippines in 1565, a year after an earnest intent to colonize the country, which was during the reign of Philip II of Spain, whose name has remained attached to the cou

In [31]:
summaries_3 = summarize_each_chunk(ranked_results[:5], client, model='llama3.2:1b', query=query)

Chunk 1 Summary (Relevant to Query):
Here is a summary of the text in one paragraph:

The history of the Philippines from 1565 to 1898 can be divided into two periods: the Spanish colonial period (1543-1821) and the American colonial era that began after the Spanish-American War. Ferdinand Magellan, a Portuguese navigator who led an expedition to circumnavigate the globe, arrived in the Philippines in 1521 and was killed by warriors of datu Lapulapu at the Battle of Mactan in 1529. Over the next century, various Spanish expeditions explored and named the islands, including Miguel López de Legazpi's expedition in 1565 that led to the beginning of Spanish colonization. The Spanish colonial period came to an end with Spain's defeat in the Spanish-American War, followed by the signing of the Treaty of Paris on December 10, 1898, marking the beginning of American colonial rule.

Chunk 2 Summary (Relevant to Query):
Here is a summary of the text in one paragraph:

A Spanish expedition led by

In [32]:
generate_response_with_notice(summaries_3, query, client)

'The available information is insufficient to provide a complete answer to this query.'