In [None]:
import requests
from bs4 import BeautifulSoup
import sqlite3
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.llms.llm import LLM

# imports
from llama_index.embeddings.gemini import GeminiEmbedding

from llama_index.llms.openai import OpenAI
from llama_index.llms.litellm import LiteLLM
import os

from llama_index.core.embeddings import BaseEmbedding

from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex, SimpleKeywordTableIndex
from llama_index.core import SummaryIndex
from llama_index.core import ComposableGraph
from llama_index.llms.openai import OpenAI
from llama_index.core.response.notebook_utils import display_response
from llama_index.core import Settings
from dotenv import load_dotenv
from base.rag import BaseRag
from base.logger import Logger
from llama_index.core import StorageContext
from llama_index.graph_stores.nebula import NebulaGraphStore

from pathlib import Path
from llama_index.core.node_parser import SentenceSplitter
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.index_store.redis import RedisIndexStore

# Load environment variables from the shared env file.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable so the notebook can run on other machines.
load_dotenv("/home/t/atest/.global_env")

logger = Logger().get_logger()

# Redis connection settings, overridable through the environment.
REDIS_HOST = os.getenv("REDIS_HOST", "127.0.0.1")
# os.getenv returns a str whenever the variable is set, so convert explicitly:
# REDIS_PORT is now always an int instead of "str if set, int if defaulted".
REDIS_PORT = int(os.getenv("REDIS_PORT", "6379"))

In [None]:

# LiteLLM model identifiers; the first entry is the one actually used below.
models = ["gemini/gemini-2.0-flash"]

# Instantiate the embedding model and the LLM (temperature 0).
embed_model = GeminiEmbedding()
llm = LiteLLM(model=models[0], temperature=0)

In [None]:
# Assign the LLM and the embedding model onto llama-index's `Settings` object.
Settings.llm = llm
Settings.embed_model = embed_model

In [None]:
# Connection settings for NebulaGraph. `setdefault` keeps any value that is
# already present in the environment instead of overwriting it with these
# local-development defaults.
os.environ.setdefault("NEBULA_USER", "root")
os.environ.setdefault("NEBULA_PASSWORD", "nebula")  # default is "nebula"
os.environ.setdefault("NEBULA_ADDRESS", "127.0.0.1:9669")  # assumes NebulaGraph is installed locally

space_name = "llamaindex"
# Defaults; these can be omitted when creating from an empty knowledge graph.
edge_types = ["relationship"]
rel_prop_names = ["relationship"]
tags = ["entity"]

In [None]:

# graph_store = NebulaGraphStore(
#     space_name=space_name,
#     edge_types=edge_types,
#     rel_prop_names=rel_prop_names,
#     tags=tags,
# )
# storage_context = StorageContext.from_defaults(graph_store=graph_store)

In [None]:




DEFAULT_CHUNK_SIZE = 2048
DEFAULT_CHUNK_OVERLAP = 200


class SiteRag(BaseRag):
    """RAG helper that chunks the files of a directory into a docstore.

    The constructor assigns the LLM, the optional embedding model and the
    chunking parameters onto llama-index's ``Settings`` object. ``_ingest``
    reads a directory, splits the documents into nodes and adds them to the
    docstore of the supplied storage context.

    NOTE(review): ``BaseRag.__init__`` is not called here; confirm that this
    is intentional.
    """

    def __init__(self, llm: LLM, storage: StorageContext, embed_model: BaseEmbedding = None,
                 chunk_size: int = DEFAULT_CHUNK_SIZE, chunk_overlap: int = DEFAULT_CHUNK_OVERLAP):
        """Store the collaborators and push the configuration onto ``Settings``.

        Args:
            llm: Language model assigned to ``Settings.llm``.
            storage: Storage context whose docstore receives the ingested nodes.
            embed_model: Optional embedding model; when given it is assigned
                to ``Settings.embed_model``.
            chunk_size: Chunk size used when splitting documents.
            chunk_overlap: Overlap between consecutive chunks.
        """
        self.reader = None
        self.llm = llm

        # Always define the attribute so reading it never raises AttributeError
        # when no embedding model was supplied.
        self.embed_model = embed_model
        if embed_model:
            Settings.embed_model = self.embed_model

        # Keep the chunking parameters so _ingest can split with exactly these
        # values rather than with SentenceSplitter's own defaults.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        Settings.llm = self.llm
        Settings.chunk_size = chunk_size
        Settings.chunk_overlap = chunk_overlap

        self.storage_context = storage

    def _ingest(self, dir: Path) -> None:
        """Read every file under ``dir``, split it and store the nodes.

        Args:
            dir: Directory passed to ``SimpleDirectoryReader`` as ``input_dir``.
        """
        self.reader = SimpleDirectoryReader(input_dir=dir)
        documents = self.reader.load_data()

        # Use the configured chunking parameters; a bare SentenceSplitter()
        # would ignore the values passed to the constructor.
        splitter = SentenceSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
        nodes = splitter.get_nodes_from_documents(documents)

        self.storage_context.docstore.add_documents(nodes)
        # Logs the total number of entries in the docstore, not only the ones
        # added by this call.
        logger.info(f"{len(self.storage_context.docstore.docs)} Ingested.")

    # TODO: querying is not implemented yet; the earlier draft is kept below.
    # def _query(self, query:str, n:int)->str:
    #     self.query_engine = self.storage_context.as_query_engine()
    #     list_response = self.query_engine.query("What is a summary of this document?")

    #     pass

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.agent.workflow import FunctionAgent
from llama_index.llms.openai import OpenAI
from llama_index.readers.wikipedia import WikipediaReader
import asyncio
import os


In [None]:
# Load the two Wikipedia pages that serve as the sample documents.
documents = WikipediaReader().load_data(
    pages=["avengers (2012)", "Ironman (2008)"],
)

In [None]:
documents[0]

In [None]:
transformations = [
    
]

In [None]:
# Build the vector index from the loaded documents. An empty transformations
# list is passed literally; the `transformations` variable defined in the
# previous cell is not used here.
index = VectorStoreIndex.from_documents(documents, transformations=[])


In [None]:

# Create a RAG tool using LlamaIndex
# documents = SimpleDirectoryReader("data").load_data()
query_engine = index.as_query_engine()

In [None]:
out = query_engine.query("what does  Asgardian Loki  do")

In [None]:
dir(out)

In [None]:
out.response

In [None]:
sn = out.source_nodes[0]

In [None]:
dir(sn)

In [None]:
sn.score

In [None]:
sn

In [None]:
from llama_index.readers.remote import RemoteReader

In [None]:
r = RemoteReader()

In [None]:
p  = r.load_data(url = 'https://en.wikipedia.org/wiki/Nick_Fury_(Marvel_Cinematic_Universe)')

In [None]:
print(dir(p))

In [None]:
import html2text
# h = html2text.HTML2Text()
# h.ignore_links=True
# h.ignore_tables=True
# h.ignore_mailto_links=True
# h.ignore_emphasis=True

In [None]:
import logging

# NOTE(review): this rebinds the module-level name `logger`, replacing the
# project logger obtained from `Logger().get_logger()` in the first cell. Any
# code that looks up `logger` at call time uses this stdlib logger from here on.
logger = logging.getLogger('notebook')

In [None]:
def get_url_docs(url):
    """Load ``url`` and return its documents with HTML converted to plain text.

    The page is loaded with ``RemoteReader``. Each returned document's content
    is then replaced by an ``html2text`` rendering configured to ignore links,
    tables, mailto links and emphasis markup.

    Args:
        url: Address of the page to load.

    Returns:
        The list of documents returned by ``RemoteReader.load_data``. If the
        conversion of a document fails, the failure is logged and that
        document keeps its original content instead of being set to ``None``.
    """
    reader = RemoteReader()

    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_tables = True
    converter.ignore_mailto_links = True
    converter.ignore_emphasis = True

    docs = reader.load_data(url=url)
    for doc in docs:
        try:
            plain_text = converter.handle(doc.text)
        except Exception:
            # Best effort: keep the raw content rather than storing None, which
            # would break anything that later reads the document text.
            # `exception` records the traceback alongside the message.
            logger.exception(f'Failed to get web content for {url}')
            continue
        doc.set_content(plain_text)

    return docs

In [None]:
# dir(p[0])

In [None]:
out = get_url_docs('https://en.wikipedia.org/wiki/Nick_Fury_(Marvel_Cinematic_Universe)')

In [None]:
# load_dotenv('../../.global_env')

In [None]:
out

In [None]:
from functools import cache
from nest_asyncio import apply
# nest_asyncio.apply() patches asyncio so the event loop can be re-entered
# while the notebook's own loop is already running.
# NOTE(review): presumably required for the `use_async=True` index build in
# this notebook; confirm.
apply()

In [None]:
from functools import lru_cache
from llama_index.core.base.base_query_engine import BaseQueryEngine
from itertools import chain


# Manual dictionary cache of query engines, keyed by the integer returned from
# `_hash_urls` (lru_cache cannot be used directly because list arguments are
# unhashable).
_query_engine_cache: dict[int, BaseQueryEngine] = {}

def _hash_urls(urls: list[str]) -> int:
    """Create a hashable key from the list of URLs."""
    return hash(frozenset(urls))  # Using frozenset so order doesn't matter

def get_query_engine(urls: list[str]) -> BaseQueryEngine:
    """Build a query engine over the pages at ``urls``, or reuse a cached one.

    The cache key comes from ``_hash_urls``. On a miss, every URL is loaded
    with ``get_url_docs``, the resulting documents are indexed as nodes in a
    ``VectorStoreIndex``, and the engine is stored in the cache before being
    returned.
    """
    key = _hash_urls(urls)

    # Cache hit: hand back the engine built earlier for this set of URLs.
    try:
        return _query_engine_cache[key]
    except KeyError:
        pass

    # Cache miss: gather the documents of every page into one flat list.
    page_docs = []
    for url in urls:
        page_docs.extend(get_url_docs(url))

    vector_index = VectorStoreIndex(use_async=True, nodes=page_docs, transformations=[])
    engine = vector_index.as_query_engine()

    _query_engine_cache[key] = engine
    return engine

In [None]:
from llama_index.core.base.response.schema import RESPONSE_TYPE
from llama_index.core.schema import NodeWithScore



def get_answers(urls: list[str], query: str) -> RESPONSE_TYPE:
    """Answer ``query`` using the query engine built for ``urls``.

    Args:
        urls: Pages to build (or reuse from cache) the query engine for.
        query: Natural-language question passed to the query engine.

    Returns:
        The response object returned by the query engine's ``query`` method.
        This is not a ``NodeWithScore``, as the previous annotation claimed;
        callers in this notebook read ``.response`` and ``.source_nodes`` from
        such objects.
    """
    query_engine = get_query_engine(urls)
    return query_engine.query(query)

In [None]:
urls = ['https://en.wikipedia.org/wiki/Nick_Fury_(Marvel_Cinematic_Universe)', 'https://en.wikipedia.org/wiki/Maria_Hill', 'https://en.wikipedia.org/wiki/Avengers:_Age_of_Ultron']

In [None]:
documents = list(chain.from_iterable(get_url_docs(url) for url in urls))


In [None]:
# NOTE(review): `r1` is not defined in any visible cell, so on a fresh kernel
# this line raises NameError. The secondary chunking regex must be defined
# before this cell; TODO confirm where it was meant to come from.
spli = SentenceSplitter(secondary_chunking_regex=r1)

# Split the loaded documents into nodes with the splitter above.
ss = spli.get_nodes_from_documents(documents)

In [None]:
# NOTE(review): `r1` is not defined in any visible cell, so this fails with
# NameError on a fresh kernel. This also rebinds `transformations`, which an
# earlier cell set to an empty list.
transformations = [SentenceSplitter(secondary_chunking_regex=r1)]

In [None]:
ii = VectorStoreIndex(nodes=documents[:1], transformations=transformations)

In [None]:
out = get_answers(urls = urls,
                  query = "what is maria hill 's full name",
                  )

In [None]:
out.response

In [None]:
q= get_query_engine(urls)

In [None]:
out = q.query('who plays maria hill in live actionmoview')

In [None]:
dir(out)

In [None]:
out.response