In [None]:
import pathlib

In [None]:
# Load the source list: strip each line and keep only those starting with "https".
urls = [
    stripped
    for stripped in (line.strip() for line in pathlib.Path("./sources.txt").read_text().splitlines())
    if stripped.startswith("https")
]

# Split into arXiv links and everything else, preserving file order.
ARXIV_PREFIX = "https://arxiv"
arxiv_urls = [url for url in urls if url.startswith(ARXIV_PREFIX)]
web_urls = [url for url in urls if not url.startswith(ARXIV_PREFIX)]

In [None]:
# arxiv_article = await load_arxiv_article(arxiv_urls[1])
# arxiv_chunks = chunk_doc(arxiv_article, length_fn, 256)
# print(arxiv_chunks[0].as_str)

In [None]:
# web_article = await load_web_content(web_urls[0])
# web_chunks = chunk_doc(web_article, length_fn, 256)
# print(web_chunks[0].as_str)

In [None]:
# print(web_article.content)

In [None]:
import httpx

# Pinned CDN build of Mozilla's Readability library.
readability_script_url = 'https://cdnjs.cloudflare.com/ajax/libs/readability/0.5.0/Readability.js'
response = httpx.get(readability_script_url)
# Fail fast on a non-2xx reply: otherwise the CDN's error page would be
# spliced into the script below and injected into every crawled page as JS.
response.raise_for_status()
readability_js_content = response.text

# Script to inject into a page: the Readability library source followed by an
# IIFE that parses a clone of the document and returns a minimal HTML string
# (title + extracted content), or "" when no article could be extracted.
# Doubled braces are literal JS braces escaped for str.format.
readability_js = """
{readability_js_content}
(function() {{
    var documentClone = document.cloneNode(true);
    var reader = new Readability(documentClone);
    var article = reader.parse();
    if (article) {{
        return `<html><body><h1>${{article.title}}</h1>${{article.content}}</body></html>`;
    }} else {{
        return "";
    }}
}})();
""".format(readability_js_content=readability_js_content)


In [None]:
import hashlib
import json
import os
import time
from pathlib import Path

from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from playwright.async_api import Error

In [None]:
from crawl4ai import AsyncWebCrawler
from playwright.async_api import Page, Browser



class CSPCompliantAsyncPlaywrightCrawlerStrategy(AsyncPlaywrightCrawlerStrategy):
    """Playwright crawl strategy that opens every browser context with ``bypass_csp=True``.

    Only ``crawl`` is overridden. Session bookkeeping, hooks, ``smart_wait`` and
    the configuration attributes read here (``browser``, ``user_agent``,
    ``proxy``, ``headers``, ``use_cached_html``, ``js_code``, ``verbose``) are
    expected to be provided by the parent class.
    """

    @staticmethod
    def _html_cache_path(url: str) -> str:
        """Return the cache file path for ``url``: ``~/.crawl4ai/cache/<md5 of the URL>``."""
        return os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())

    async def _new_csp_bypassing_context(self):
        """Create a context with CSP bypass, the configured user agent/proxy and extra headers."""
        context = await self.browser.new_context(
            bypass_csp=True,
            user_agent=self.user_agent,
            proxy={"server": self.proxy} if self.proxy else None,
        )
        await context.set_extra_http_headers(self.headers)
        return context

    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """Load ``url`` in a CSP-bypassing context and return its HTML, headers and status.

        Recognised keyword arguments:
            session_id: reuse (or create and register) the context/page stored
                under this id; such pages are left open after the crawl.
            js_only: skip navigation and operate on the already-loaded page.
            js_code / js: a script (str) or list of scripts to evaluate; falls
                back to ``self.js_code``.
            wait_for: condition forwarded to ``self.smart_wait``.
            timeout: timeout in ms for ``wait_for`` (default 30000).

        Returns:
            AsyncCrawlResponse built from the page HTML (or from the on-disk
            cache when ``self.use_cached_html`` is set and an entry exists).

        Raises:
            playwright Error: re-raised with the URL prepended to the message.
            RuntimeError: when the ``wait_for`` condition fails.
        """
        response_headers = {}
        status_code = None

        self._cleanup_expired_sessions()
        session_id = kwargs.get("session_id")
        if session_id:
            context, page, _ = self.sessions.get(session_id, (None, None, None))
            if not context:
                context = await self._new_csp_bypassing_context()
                page = await context.new_page()
                self.sessions[session_id] = (context, page, time.time())
        else:
            context = await self._new_csp_bypassing_context()
            page = await context.new_page()

        try:
            if self.verbose:
                print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")

            if self.use_cached_html:
                cache_file_path = self._html_cache_path(url)
                if os.path.exists(cache_file_path):
                    # Read with the same encoding the cache is written with.
                    with open(cache_file_path, "r", encoding="utf-8") as f:
                        html = f.read()
                    # Response headers and status code live in a sidecar file.
                    with open(cache_file_path + ".meta", "r", encoding="utf-8") as f:
                        meta = json.load(f)
                    return AsyncCrawlResponse(
                        html=html,
                        response_headers=meta.get("response_headers", {}),
                        status_code=meta.get("status_code"),
                    )

            if not kwargs.get("js_only", False):
                await self.execute_hook('before_goto', page)
                response = await page.goto(url, wait_until="domcontentloaded", timeout=60000)
                await self.execute_hook('after_goto', page)

                # goto() may return None for some navigations; keep the defaults then.
                if response is not None:
                    status_code = response.status
                    response_headers = response.headers
            else:
                status_code = 200
                response_headers = {}

            await page.wait_for_selector('body')
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

            js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
            if js_code:
                if isinstance(js_code, str):
                    await page.evaluate(js_code)
                elif isinstance(js_code, list):
                    for js in js_code:
                        await page.evaluate(js)

                await page.wait_for_load_state('networkidle')
                # The on_execution_started hook only fires when some JS was run.
                await self.execute_hook('on_execution_started', page)

            # wait_for is either a JS predicate, e.g.
            #   "() => document.querySelector('#my-element') !== null"
            # or a CSS selector, e.g. "#my-element"; smart_wait handles both.
            wait_for = kwargs.get("wait_for")
            if wait_for:
                try:
                    await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
                except Exception as e:
                    raise RuntimeError(f"Wait condition failed: {str(e)}") from e

            html = await page.content()
            page = await self.execute_hook('before_return_html', page, html)

            if self.verbose:
                print(f"[LOG] ✅ Crawled {url} successfully!")

            if self.use_cached_html:
                cache_file_path = self._html_cache_path(url)
                # The cache directory may not exist yet on a fresh machine.
                os.makedirs(os.path.dirname(cache_file_path), exist_ok=True)
                with open(cache_file_path, "w", encoding="utf-8") as f:
                    f.write(html)
                # Store response headers and status code next to the HTML.
                with open(cache_file_path + ".meta", "w", encoding="utf-8") as f:
                    json.dump({
                        "response_headers": response_headers,
                        "status_code": status_code
                    }, f)

            return AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
        except Error as e:
            raise Error(f"Failed to crawl {url}: {str(e)}") from e
        finally:
            if not session_id:
                # One-off crawls own their context: closing it also closes its
                # pages, so nothing is leaked even if a hook replaced `page`.
                await context.close()


async def on_browser_created(browser: Browser):
    """Hook for ``on_browser_created``: log, then open and immediately close a throwaway context.

    The context is created with ``bypass_csp=True`` and a single page; both are
    closed again before returning, so nothing created here outlives the hook.
    """
    print("[HOOK] on_browser_created")
    probe_context = await browser.new_context(bypass_csp=True)
    probe_page = await probe_context.new_page()
    await probe_page.close()
    await probe_context.close()

async def on_execution_started(page: Page):
    """Hook for ``on_execution_started``: replace the page with its Readability extract.

    Evaluates ``readability_js`` in the page and, when it yields HTML, swaps the
    page content for that simplified article markup. The injected script
    returns an empty string when no article could be parsed; in that case the
    original page is left untouched instead of being blanked out.
    """
    print("[HOOK] on_execution_started")
    result = await page.evaluate(readability_js)
    if not result:
        # Nothing extracted: keep the original DOM rather than wiping it.
        return
    await page.set_content(html=result)


crawler_strategy = AsyncPlaywrightCrawlerStrategy(verbose=True)
crawler_strategy.set_hook('on_browser_created', on_browser_created)
crawler_strategy.set_hook('on_execution_started', on_execution_started)


async with AsyncWebCrawler(verbose=True, crawler_strategy=crawler_strategy) as crawler:
    pages = await crawler.arun("https://wandb.ai/site/articles/rag-techniques/", js_code=readability_js, bypass_cache=True)


In [None]:
print(pages.markdown)


In [None]:
from utils import html_to_md, load_html_content

# Convert the crawled HTML to markdown and print it.
# NOTE(review): `pages` was crawled from the hard-coded wandb URL, while
# web_urls[20] is what gets passed as the URL argument here — TODO confirm the
# two are meant to match (also assumes sources.txt yields at least 21 web URLs).
print(html_to_md(load_html_content(pages.html, web_urls[20])["content"]))

In [None]:
from utils import load_html_content, html_to_md
# NOTE(review): `main_content` is not assigned in any visible cell, so this
# cell presumably relies on leftover kernel state — confirm before re-running.
article_contents = load_html_content(main_content, web_urls[15])
# pop() removes "content" from article_contents, leaving the remaining keys behind.
content = html_to_md(article_contents.pop("content"))

In [None]:
print(content)

In [None]:
from utils import html_to_md

In [None]:
import html2text
from readabilipy import simple_json_from_html_string
# Extract the page with readabilipy (Readability enabled) and print its
# "plain_content" rendered through html2text.
# NOTE(review): `result` is not assigned in any visible cell — confirm its origin.
print(html2text.html2text(simple_json_from_html_string(result.html, use_readability=True)["plain_content"]))

In [None]:
from joblib import Parallel, delayed

# Convert every result to markdown in parallel, using all cores (n_jobs=-1).
# NOTE(review): `convert_to_md` only appears as commented-out code in the next
# cell, and `results` is first assigned in the retrieval cell further down, so
# this cell will not run on a fresh kernel as written — confirm.
md_results = Parallel(n_jobs=-1)(delayed(convert_to_md)(result.model_dump(mode="json")) for result in results)

In [None]:

# def convert_to_md(result):
#     try:
#         article = simple_json_from_html_string(result["html"], use_readability=False)
#         content= f"<html><body><h1>{article['title']}</h1>{article['plain_content']}</body></html>"
#         return html_to_md(content)
#     except Exception as e:
#         return result["markdown"]


In [None]:
print(html_to_md(main_content))

In [None]:
print(arxiv_urls)

In [None]:
arxiv_url = "https://arxiv.org/html/2410.01201v1"
from scraper import load_arxiv_article
arxiv_article = await load_arxiv_article(arxiv_url)

In [None]:
print(arxiv_article.content)

In [None]:
# NOTE(review): bare dict expression — it is neither assigned nor used.
# `tool_call_id` and `tool_function_name` are first assigned in a later cell,
# and a later cell builds `tool_message` with the same keys but
# content=context, so this looks like a leftover draft — confirm and remove.
{
            "role":"tool", 
            "tool_call_id":tool_call_id, 
            "name": tool_function_name, 
            "content":results
        }

In [None]:
from dotenv import load_dotenv
load_dotenv()


In [21]:
# System prompt for the RAG tutor. It tells the model to research via the
# `SearchRetrieve` function first and then answer conversationally, with an
# <answer>…</answer> template as the example. The prompt notes that the reply
# is used to generate speech output.
SYSTEM_PROMPT = """You are an AI tutor specializing in Retrieval Augmented Generation (RAG) and Large Language Model (LLM) applications. 
Your role is to act as a conversational companion for AI engineers, providing guidance and explanations about concepts related to RAG. 
Your goal is to help users understand RAG concepts, gain insights and clarity, and develop intuition about the underlying principles their queries relate to.

When a user presents a query, follow these steps:

1. Use the `SearchRetrieve` function to research information relevant to the user's query and the underlying concepts
2. Once you receive the search results, carefully analyze and synthesize the most pertinent information. Focus on details that directly address the user's query and provide insights into RAG and LLM applications.

3. Craft a detailed response to the user's query. Your response should:
   a. Draw from the retrieved data to provide a detailed and accurate answer to the user's query
   b. Provide clear explanations of RAG concepts
   c. Offer insights into LLM applications
   d. Help the user understand the concepts in a friendly, conversational manner
   e. Use analogies or examples where appropriate to illustrate complex ideas
   f. Address any misconceptions or unclear points in the user's query

4. Present your response in a conversational tone, as if you're speaking directly to the user. Aim to make the explanation engaging and accessible, while maintaining technical accuracy.
5. If there are related concepts or topics that might be of interest to the user based on their query, briefly mention these and explain how they connect to the main topic.
6. Conclude your response by summarizing the key points and offering to clarify any remaining questions the user might have.
7. Be precise and concise and use paragraphs to structure your response. We will use this response to generate a speech output.

Begin with a brief acknowledgment of the user's query, then provide your detailed explanation. End with an invitation for further questions. For example:

<answer>
I understand you're asking about [brief restatement of the query].

[Your detailed explanation, insights, and examples here]

To sum up, [brief summary of key points]. Is there any part of this explanation you'd like me to elaborate on further?
</answer>

Remember, your goal is to help the user gain a deeper understanding of RAG and LLM concepts, so focus on providing clear, insightful, and practical information that an AI engineer would find valuable."""

In [22]:
import instructor
from litellm import completion
from pydantic import BaseModel, Field

client = instructor.from_litellm(completion)


# Structured-output schema for the model's search tool call.
# Documented with comments rather than a docstring so the generated schema
# (which can pick up the class docstring as its description) stays unchanged.
class SearchRetrieve(BaseModel):
    # Required: up to three natural-language queries for the knowledge engine.
    queries: list[str] = Field(
        title="Search Queries",
        description="A list of detailed and diverse natural language queries to search for relevant information from a knowledge engine",
        max_length=3,
    )
    # Optional: hits to fetch per query (defaults to 3).
    limit: int = Field(
        default=3,
        title="Limit",
        description="The number of search results to retrieve for each query.",
    )


# Conversation so far: the tutor system prompt plus a single user question.
system_message = {"role": "system", "content": SYSTEM_PROMPT}
user_message = {"role": "user", "content": "What is contextual retrieval ?"}
messages = [system_message, user_message]

# Ask for a SearchRetrieve tool call; the raw completion is kept as well so
# the assistant message and tool-call id can be read from it afterwards.
response, raw_response = client.chat.completions.create_with_completion(
    messages=messages,
    model="gpt-4o",
    response_model=SearchRetrieve,
)
print(response, raw_response)


  Expected `PromptTokensDetails` but got `dict` with value `{'audio_tokens': None, 'cached_tokens': 0}` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_json(
  Expected `PromptTokensDetails` but got `dict` with value `{'audio_tokens': None, 'cached_tokens': 0}` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


queries=['contextual retrieval definition', 'contextual retrieval in RAG', 'contextual retrieval applications'] limit=3 ModelResponse(id='chatcmpl-AJYtxQsLhN6DnX8G9j5Jh3cKSgTev', choices=[Choices(finish_reason='stop', index=0, message=Message(content=None, role='assistant', tool_calls=[ChatCompletionMessageToolCall(function=Function(arguments='{"queries":["contextual retrieval definition","contextual retrieval in RAG","contextual retrieval applications"]}', name='SearchRetrieve'), id='call_YpXEGgecab9SjJVqu0nfMqIn', type='function')], function_call=None))], created=1729225757, model='gpt-4o-2024-08-06', object='chat.completion', system_fingerprint='fp_6b68a8204b', usage=CompletionUsage(completion_tokens=49, prompt_tokens=1354, total_tokens=1403, completion_tokens_details=None, prompt_tokens_details=None), service_tier=None)


In [23]:
# Pull the assistant message out of the raw completion and read the id and
# function name of its first tool call from the JSON-dumped form.
model_response = raw_response.choices[0].message
model_response_dict = model_response.model_dump(mode="json")
tool_calls = model_response_dict["tool_calls"]
first_tool_call = tool_calls[0]
tool_call_id = first_tool_call["id"]
tool_function_name = first_tool_call["function"]["name"]

In [24]:
from src.retrieval.docstore import HybridRetriever
import json
retriever = HybridRetriever.load()
tasks = [retriever.invoke(query, limit=response.limit) for query in response.queries]
import asyncio
results = await asyncio.gather(*tasks)
docs = [doc for result in results for doc in result]
deduped_docs = {}
for doc in docs:
    deduped_docs[doc.content] = doc
docs = list(deduped_docs.values())
context = json.dumps([doc.as_str for doc in docs])

In [10]:
model_response

Message(content=None, role='assistant', tool_calls=[ChatCompletionMessageToolCall(function=Function(arguments='{"queries":["contextual retrieval in RAG","contextual retrieval techniques","importance of context in information retrieval systems"]}', name='SearchRetrieve'), id='call_89ynZGC7emXrgNeEDQ9PbHKz', type='function')], function_call=None)

In [25]:
# Tool-role reply that answers the model's SearchRetrieve call with the
# retrieved context (a JSON string).
tool_message = dict(
    role="tool",
    tool_call_id=tool_call_id,
    name=tool_function_name,
    content=context,
)

In [26]:
# Extend the conversation with the assistant's tool call and the tool's reply,
# then ask for the final answer.
# NOTE(review): this mutates `messages` in place, so re-running the cell
# appends the same two messages again.
messages.extend((model_response, tool_message))

output = completion(
    messages=messages,
    model="gpt-4o",
)

In [27]:
answer = output.choices[0].message.content

In [29]:
print(answer)

I understand you're asking about contextual retrieval.

Contextual retrieval is an advanced technique used in Retrieval-Augmented Generation (RAG) systems to enhance the process of retrieving information from a knowledge base. In traditional RAG, retrieving relevant information often involves pulling isolated pieces of data, which can lack sufficient context, leading to suboptimal results. Contextual retrieval addresses this by considering the context of each data chunk relative to the overall document or dataset it belongs to.

The essence of contextual retrieval lies in two main sub-techniques:

1. **Contextual Embeddings:** These are embeddings that include not just the content of a chunk but also additional context that situates it within the entire document. This added layer of context helps improve the quality and relevance of retrieved information. By embedding these contextually enriched chunks, search algorithms can more accurately find and return pertinent data, significantly

In [32]:
from pathlib import Path
from litellm import client, speech
import os 

# Synthesise the answer to speech and write it next to the notebook.
# NOTE(review): `response` rebinds the name that earlier held the
# SearchRetrieve result read by the retrieval cell.
speech_file_path ="speech.mp3"
response = speech(input=answer, voice="alloy", model="openai/tts-1")
response.stream_to_file(speech_file_path)

AttributeError: 'function' object has no attribute 'with_streaming_response'

In [31]:
from IPython.display import Audio
Audio(speech_file_path)