In [1]:
from langchain_community.document_loaders import WikipediaLoader
from pprint import pprint

docs = WikipediaLoader(query="Philippines", load_max_docs=10).load()



  lis = BeautifulSoup(html).find_all('li')


In [2]:
docs[0].page_content

'The Philippines, officially the Republic of the Philippines, is an archipelagic country in Southeast Asia. In the western Pacific Ocean, it consists of 7,641 islands, with a total area of roughly 300,000 square kilometers, which are broadly categorized in three main geographical divisions from north to south: Luzon, Visayas, and Mindanao. The Philippines is bounded by the South China Sea to the west, the Philippine Sea to the east, and the Celebes Sea to the south. It shares maritime borders with Taiwan to the north, Japan to the northeast, Palau to the east and southeast, Indonesia to the south, Malaysia to the southwest, Vietnam to the west, and China to the northwest. It is the world\'s twelfth-most-populous country, with diverse ethnicities and cultures. Manila is the country\'s capital, and its most populated city is Quezon City. Both are within Metro Manila.\nNegritos, the archipelago\'s earliest inhabitants, were followed by waves of Austronesian peoples. The adoption of animis

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
chunks = text_splitter.split_documents(documents=docs)
len(chunks)

90

In [4]:
chunks_docs = [x.page_content for x in chunks]
chunks_docs[0]

'The Philippines, officially the Republic of the Philippines, is an archipelagic country in Southeast Asia. In the western Pacific Ocean, it consists of 7,641 islands, with a total area of roughly 300,000 square kilometers, which are broadly categorized in three main geographical divisions from north to south: Luzon, Visayas, and Mindanao. The Philippines is bounded by the South China Sea to the west, the Philippine Sea to the east, and the Celebes Sea to the south. It shares maritime borders'

In [5]:
from ollama import Client
import numpy as np

client = Client(
  host='http://localhost:11434',
)

In [6]:
def generate_embeddings(documents, client, model):
    # Generate embeddings for documents using Ollama
    embeddings = []
    for doc in documents:
        response = client.embeddings(prompt=doc, model=model)
        embeddings.append(response["embedding"])
    return np.array(embeddings)

In [151]:
embeddings = generate_embeddings(chunks_docs, client, "mxbai-embed-large")

In [152]:
embeddings

array([[-0.57230604,  0.64173025, -0.35715616, ..., -0.57842422,
        -0.23363476,  0.23840278],
       [-0.63292491,  0.66969293, -0.40974909, ..., -0.41128689,
        -0.56955785, -0.21378343],
       [-0.25046343,  1.35836792,  0.26955152, ..., -0.86843258,
        -0.1213906 , -0.57189274],
       ...,
       [-0.67305088,  1.19187093,  0.6693086 , ..., -0.18486166,
         0.17004541, -0.20220765],
       [-0.47561765,  0.72825933,  0.53640741, ..., -0.0581817 ,
        -0.01174569, -0.21620721],
       [-0.21668237,  0.58151639,  0.67202467, ..., -0.28565156,
        -0.42977393, -0.51442134]])

In [9]:
from llama_index.vector_stores.faiss import FaissVectorStore
import faiss

def create_faiss_vector_store(embedding_size: int):
    # Initialize FAISS with dimensions of the embedding model
    faiss_index = faiss.IndexFlatIP(embedding_size)  # Inner-product similarity
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    return vector_store

In [10]:
embedding_dim = embeddings[0].shape[0]
embedding_dim 

1024

In [72]:
# Retrievers
from llama_index.core.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)

In [69]:
from llama_index.core import Document, VectorStoreIndex, load_index_from_storage, StorageContext
from llama_index.core.schema import TextNode, NodeWithScore

In [59]:
vector_store = create_faiss_vector_store(embedding_size=embedding_dim)

In [60]:
nodes = [
    TextNode(text=chunk, embedding=embedding)
    for chunk, embedding in zip(chunks_docs, embeddings)
]

In [115]:
class FAISSVectorStoreRetriever(BaseRetriever):
    def __init__(self, faiss_index, documents, embeddings):
        """
        Initialize the FAISS retriever.
        :param faiss_index: The FAISS index containing precomputed embeddings.
        :param documents: List of document chunks.
        :param embeddings: Precomputed embeddings corresponding to the document chunks.
        """
        self.faiss_index = faiss_index
        self.documents = documents
        self.embeddings = embeddings
    
    def _retrieve(self, query_embedding, top_k=5):
        """
        Retrieve the top-k nearest neighbors using the FAISS index.
        :param query_embedding: The embedding of the query.
        :param top_k: Number of top results to retrieve.
        """
        distances, indices = self.faiss_index.search(np.array(query_embedding), top_k)
        retrieved_docs = [
            NodeWithScore(node=self.documents[idx], score=1 - dist)
            for idx, dist in zip(indices[0], distances[0])
            if idx != -1
        ]
        return retrieved_docs

In [154]:
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

In [156]:
embeddings

array([[-0.03054146,  0.03424632, -0.01905985, ..., -0.03086796,
        -0.01246806,  0.01272251],
       [-0.03408776,  0.036068  , -0.02206807, ..., -0.02215089,
        -0.03067497, -0.01151384],
       [-0.01419157,  0.07696685,  0.01527313, ..., -0.04920649,
        -0.00687815, -0.03240417],
       ...,
       [-0.03798815,  0.06727124,  0.03777693, ..., -0.01043391,
         0.00959765, -0.01141295],
       [-0.02692873,  0.04123291,  0.03037055, ..., -0.00329416,
        -0.00066502, -0.01224132],
       [-0.01202815,  0.03228028,  0.03730445, ..., -0.01585667,
        -0.02385698, -0.0285558 ]])

In [155]:
faiss_index = faiss.IndexFlatIP(embedding_dim)
faiss_index.add(embeddings)

In [160]:
retriever = FAISSVectorStoreRetriever(faiss_index, nodes, embeddings)

In [168]:
np.array([embeddings[0]])

array([[-0.03054146,  0.03424632, -0.01905985, ..., -0.03086796,
        -0.01246806,  0.01272251]])

In [171]:
query_embedding = np.array([embeddings[0]])
query_embedding

array([[-0.03054146,  0.03424632, -0.01905985, ..., -0.03086796,
        -0.01246806,  0.01272251]])

In [172]:
query_embedding /= np.linalg.norm(query_embedding, axis=1, keepdims=True)

In [173]:
query_embedding

array([[-0.03054146,  0.03424632, -0.01905985, ..., -0.03086796,
        -0.01246806,  0.01272251]])

In [169]:
# Perform a retrieval
query_embedding = np.array([embeddings[0]])  # Example query embedding
top_k_docs = retriever._retrieve(query_embedding, top_k=5)

In [170]:
# Output retrieved documents
for doc in top_k_docs:
    print(f"Retrieved Document: {doc.node.text}, Score: {doc.score}")

Retrieved Document: The Philippines, officially the Republic of the Philippines, is an archipelagic country in Southeast Asia. In the western Pacific Ocean, it consists of 7,641 islands, with a total area of roughly 300,000 square kilometers, which are broadly categorized in three main geographical divisions from north to south: Luzon, Visayas, and Mindanao. The Philippines is bounded by the South China Sea to the west, the Philippine Sea to the east, and the Celebes Sea to the south. It shares maritime borders, Score: 0.0
Retrieved Document: with Taiwan to the north, Japan to the northeast, Palau to the east and southeast, Indonesia to the south, Malaysia to the southwest, Vietnam to the west, and China to the northwest. It is the world's twelfth-most-populous country, with diverse ethnicities and cultures. Manila is the country's capital, and its most populated city is Quezon City. Both are within Metro Manila., Score: 0.19139140844345093
Retrieved Document: The Philippines is an eme

In [180]:
query = ["What is Asia"]
query_embedding = generate_embeddings(query,client, model="mxbai-embed-large")
query_embedding /= np.linalg.norm(query_embedding, axis=1, keepdims=True)

In [181]:
query_embedding

array([[-0.02969048,  0.0487471 , -0.03912626, ..., -0.06983893,
        -0.01220416, -0.03056643]])

In [182]:
retrieved_docs = retriever._retrieve(query_embedding)
context = "\n".join([doc.node.text for doc in retrieved_docs])

In [183]:
# Output retrieved documents
for doc in retrieved_docs:
    print(f"Retrieved Document: {doc.node.text}, Score: {doc.score}")

Retrieved Document: The Philippines, officially the Republic of the Philippines, is an archipelagic country in Southeast Asia. In the western Pacific Ocean, it consists of 7,641 islands, with a total area of roughly 300,000 square kilometers, which are broadly categorized in three main geographical divisions from north to south: Luzon, Visayas, and Mindanao. The Philippines is bounded by the South China Sea to the west, the Philippine Sea to the east, and the Celebes Sea to the south. It shares maritime borders, Score: 0.39185839891433716
Retrieved Document: with Taiwan to the north, Japan to the northeast, Palau to the east and southeast, Indonesia to the south, Malaysia to the southwest, Vietnam to the west, and China to the northwest. It is the world's twelfth-most-populous country, with diverse ethnicities and cultures. Manila is the country's capital, and its most populated city is Quezon City. Both are within Metro Manila., Score: 0.4099165201187134
Retrieved Document: The Philip

In [199]:
prompt = f"be elaborate but must be strict with context. if it is not in the context, answer that it lacks evidence. Context: {context}\n\nQuestion: {query}\n\nAnswer:"
prompt

"be elaborate but must be strict with context. if it is not in the context, answer that it lacks evidence. Context: The Philippines, officially the Republic of the Philippines, is an archipelagic country in Southeast Asia. In the western Pacific Ocean, it consists of 7,641 islands, with a total area of roughly 300,000 square kilometers, which are broadly categorized in three main geographical divisions from north to south: Luzon, Visayas, and Mindanao. The Philippines is bounded by the South China Sea to the west, the Philippine Sea to the east, and the Celebes Sea to the south. It shares maritime borders\nwith Taiwan to the north, Japan to the northeast, Palau to the east and southeast, Indonesia to the south, Malaysia to the southwest, Vietnam to the west, and China to the northwest. It is the world's twelfth-most-populous country, with diverse ethnicities and cultures. Manila is the country's capital, and its most populated city is Quezon City. Both are within Metro Manila.\nThe Phi

In [200]:
response = client.generate(model='llama3.2', prompt=prompt)
print(response.response)

A question that falls squarely within the context of the provided information!

Asia, also known as the Asian continent or East Asia, refers to the largest and most populous of the world's seven continents. It is bounded by the Pacific Ocean to the east, the Arctic Ocean to the north, and the Indian Ocean to the south.

The Philippines, being an archipelagic country in Southeast Asia, shares maritime borders with several countries including Taiwan, Japan, Palau, Indonesia, Malaysia, Vietnam, and China, as mentioned earlier.

In this context, Asia is not just a geographical location but also encompasses various cultures, languages, economies, and natural resources, making it a vital part of the global community.


GenerateResponse(model='llama3.2', created_at='2024-12-20T15:30:37.029113262Z', done=True, done_reason='stop', total_duration=183494895, load_duration=31063977, prompt_eval_count=449, prompt_eval_duration=20000000, eval_count=11, eval_duration=130000000, response='The answer is "Philippines" (PH).', context=[128006, 9125, 128007, 271, 38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 271, 128009, 128006, 882, 128007, 271, 2014, 25, 578, 26363, 11, 19073, 279, 5545, 315, 279, 26363, 11, 374, 459, 5438, 82179, 13070, 3224, 304, 36664, 13936, 13, 763, 279, 19001, 16867, 22302, 11, 433, 17610, 315, 220, 22, 11, 23525, 30100, 11, 449, 264, 2860, 3158, 315, 17715, 220, 3101, 11, 931, 9518, 41668, 11, 902, 527, 44029, 71974, 304, 2380, 1925, 54001, 37601, 505, 10411, 311, 10007, 25, 82739, 263, 11, 7811, 98440, 11, 323, 23754, 3444, 78, 13, 578, 26363, 374, 62215, 555, 279, 4987, 5734, 15379, 311, 279, 9909, 11, 279, 57281, 15379, 311, 279, 11226, 11, 323, 279, 23519, 9620, 15379, 311, 279, 

In [206]:
def rag_query(query, retriever, client, model="mxbai-embed-large"):
    # Generate query embedding
    query_embedding = generate_embeddings([query],client, model)
    query_embedding /= np.linalg.norm(query_embedding, axis=1, keepdims=True)
    # Retrieve relevant documents
    retrieved_docs = retriever._retrieve(query_embedding)
    context = "\n".join([doc.node.text for doc in retrieved_docs])
    for doc in retrieved_docs:
        print(f"Retrieved Document: {doc.node.text}, Score: {doc.score}")
    
    # Generate response
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    response = client.generate(model='llama3.2', prompt=prompt)
    return response

# Example usage
query = "Tell me more about magellan"
response = rag_query(query, retriever, client, model="mxbai-embed-large")
print("RAG Response:", response.response)

Retrieved Document: The arrival of Ferdinand Magellan, a Portuguese explorer leading a fleet for Castile, marked the beginning of Spanish colonization. In 1543, Spanish explorer Ruy López de Villalobos named the archipelago Las Islas Filipinas in honor of King Philip II of Castile. Spanish colonization via New Spain, beginning in 1565, led to the Philippines becoming ruled by the Crown of Castile, as part of the Spanish Empire, for more than 300 years. Catholic Christianity became the dominant religion, and Manila, Score: 0.4096744656562805
Retrieved Document: During his 1542 expedition, Spanish explorer Ruy López de Villalobos named the islands of Leyte and Samar "Felipinas" after the Prince of Asturias, later Philip II of Castile. Eventually, the name "Las Islas Filipinas" would be used for the archipelago's Spanish possessions.: 6  Other names, such as "Islas del Poniente" (Western Islands), "Islas del Oriente" (Eastern Islands), Ferdinand Magellan's name, and "San Lázaro" (Islands 