<div style="text-align: center"> <h1>Document Summary and RAG</h1></div>

In [13]:
# !pip install llama_index

In [110]:
# Import all required libraries (llama_index components plus supporting modules)
try:
    from llama_index import (
        SimpleDirectoryReader,
        LLMPredictor,
        ServiceContext,
        get_response_synthesizer,
    )
    from llama_index.indices.document_summary import DocumentSummaryIndex
    from llama_index.llms import OpenAI
    from llama_index.indices.loading import load_index_from_storage
    from llama_index import StorageContext
    from llama_index.indices.document_summary import DocumentSummaryIndexRetriever
    from llama_index.query_engine import RetrieverQueryEngine

    import os
    import openai

    import logging
    import sys
    import json
    import requests
    from pathlib import Path

    import nest_asyncio
    from IPython.display import HTML, display

except ImportError as e:
    print("Error while importing one or more modules:", e)

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Resolve the OpenAI API key without depending on a machine-specific file:
# the OPENAI_API_KEY environment variable takes precedence, and the local JSON
# key file (which must contain an "api_key" entry) is only read as a fallback.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    with open(r'D:\Data Science\OpenAI_key management\key.json') as config_file:
        config = json.load(config_file)
    api_key = config['api_key']

openai.api_key = api_key  # Replace it with your Own Key

In [8]:
# Configure the logging behavior
logging.basicConfig(
    stream=sys.stdout,  # Send log messages to standard output
    level=logging.INFO  # Set log level to INFO
)

# basicConfig normally attaches a stdout handler to the root logger already, so
# adding a second one unconditionally prints every record twice (and once more
# for each re-run of this cell). Reuse the existing stdout handler and only add
# one when basicConfig was a no-op because the root logger had other handlers.
logger = logging.getLogger()
stream_handler = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.StreamHandler)
        and getattr(handler, "stream", None) is sys.stdout
    ),
    None,
)
if stream_handler is None:
    stream_handler = logging.StreamHandler(stream=sys.stdout)
    logger.addHandler(stream_handler)

# Patch asyncio via nest_asyncio; presumably needed because the notebook kernel
# already runs an event loop and the async calls below (use_async=True) must nest in it.
nest_asyncio.apply()

In [19]:
# Helper function to fetch the plain-text extract of a Wikipedia page and save it as a .txt file

def fetch_and_save_wiki_text(title, data_dir="data", timeout=30):
    """
    Download the plain-text extract of a Wikipedia page and save it as
    ``<data_dir>/<title>.txt`` (UTF-8).

    Parameters:
    - title: Title of the English Wikipedia page to fetch
    - data_dir: Directory the text file is written to; created if missing
    - timeout: Seconds to wait for the Wikipedia API before giving up

    Raises:
    - requests.HTTPError: if the API answers with an error status
    - KeyError: if the response carries no text extract for ``title``
    """
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
        },
        timeout=timeout,  # without a timeout a stalled connection hangs the cell
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    payload = response.json()

    # The API keys pages by page id; only one title was requested, so take the first.
    page = next(iter(payload["query"]["pages"].values()))
    if "extract" not in page:
        raise KeyError(f"No text extract returned for Wikipedia page {title!r}")
    wiki_text = page["extract"]

    data_path = Path(data_dir)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    data_path.mkdir(parents=True, exist_ok=True)

    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [22]:
# Wikipedia page titles (air sports) that make up the document collection.
wiki_AirSports_titles = [
    'Aerobatics',
    'Air racing',
    'Gliding',
    'Hang gliding',
    'Parachuting',
]

# Download and save the text of every listed page.
for title in wiki_AirSports_titles:
    fetch_and_save_wiki_text(title)

In [27]:
# Read every saved article back in and collect the resulting documents.
AirSport_docs = []

for sports_title in wiki_AirSports_titles:
    reader = SimpleDirectoryReader(input_files=[f"data/{sports_title}.txt"])
    docs = reader.load_data()
    # Use the page title as the id of the first loaded document.
    docs[0].doc_id = sports_title
    AirSport_docs += docs

In [43]:
# AirSport_docs[0]

In [44]:
# LLM used throughout the notebook: OpenAI's gpt-3.5-turbo with temperature 0,
# so the generated text is as deterministic (least random) as possible.
chatgpt = OpenAI(model="gpt-3.5-turbo", temperature=0)

# Service context bundling the LLM above with a chunk size of 1024.
service_context = ServiceContext.from_defaults(chunk_size=1024, llm=chatgpt)

# Response synthesizer in "tree_summarize" mode with async execution enabled.
response_synthesizer = get_response_synthesizer(
    use_async=True,
    response_mode="tree_summarize",
)

# Helper function to build document Summary and indexes
def build_document_summary_index(docs, service_context, response_synthesizer):
    """
    Create a DocumentSummaryIndex over the given documents.

    Parameters:
    - docs: List of documents the index is built from
    - service_context: Service context forwarded to the index builder
    - response_synthesizer: Response synthesizer forwarded to the index builder

    Returns:
    - The DocumentSummaryIndex returned by DocumentSummaryIndex.from_documents
    """
    index_options = {
        "service_context": service_context,
        "response_synthesizer": response_synthesizer,
    }
    return DocumentSummaryIndex.from_documents(docs, **index_options)

In [88]:
# service_context

In [54]:
# Build the document summary index over all loaded air-sport documents
built_index = build_document_summary_index(
    docs=AirSport_docs,
    service_context=service_context,
    response_synthesizer=response_synthesizer,
)

In [47]:
# Sanity check: show which index object was built
print(built_index)

<llama_index.indices.document_summary.base.DocumentSummaryIndex object at 0x000001EAFA142400>


In [68]:
# Look up the summary stored for the document with id "Air racing"
built_index.get_document_summary("Air racing")

'The provided text is about air racing, a type of motorsport that involves airplanes or other aircraft competing over a fixed course. It provides historical information about the first air races, including notable events and pilots. It also mentions different classes of racing airplanes and notable racing pilots. The text also mentions cultural depictions of air racing in movies and books. \n\nSome questions that this text can answer include:\n- What is air racing?\n- When and where was the first air race held?\n- Who were some of the early racing pilots?\n- What are some notable racing airplanes?\n- How has air racing been depicted in popular culture?\n- Are there any active air races today?\n- What are some different classes of racing airplanes?'

In [82]:
def display_summary_and_questions(text, summary_color="green", questions_color="blue"):
    """
    Render a document summary and its follow-up questions as coloured HTML sections.

    The text is split at the first occurrence of a blank line followed by
    "Some questions that this text can answer". Everything before the marker is
    shown as the summary; the marker and everything after it is shown as the
    questions, one line per question. When the marker is absent the whole text
    is shown as the summary and no questions section is displayed.

    Parameters:
    - text: Combined summary-and-questions text
    - summary_color: Colour of the "Summary:" heading
    - questions_color: Colour of the "Questions:" heading
    """
    from html import escape  # local import: only this helper needs it

    questions_start = '\n\nSome questions that this text can answer'

    marker_pos = text.find(questions_start)
    if marker_pos == -1:
        # find() returns -1 when the marker is missing; slicing with it would
        # silently chop the last character off the summary.
        summary, questions = text, ""
    else:
        summary, questions = text[:marker_pos], text[marker_pos:]

    # The text is model output, so escape it before embedding it in HTML.
    summary_html = escape(summary, quote=False)

    # Questions arrive newline-separated; HTML collapses newlines, so turn each
    # non-empty line into its own <br>-separated row. The ' -' replacement is kept
    # so bullets written inline on one line are still broken apart.
    question_lines = [
        escape(line.strip(), quote=False).replace(' -', '<br>-')
        for line in questions.splitlines()
        if line.strip()
    ]

    formatted_summary = f"<font color='{summary_color}'><u><b>Summary:</b></u></font><br> {summary_html}"
    display(HTML(formatted_summary))

    if question_lines:
        questions_html = "<br>".join(question_lines)
        formatted_questions = f"<font color='{questions_color}'><u><b>Questions:</b></u></font><br>{questions_html}"
        display(HTML(formatted_questions))

# Example usage: fetch the stored summary for the "Hang gliding" document and render it
summary_and_questions_text = built_index.get_document_summary("Hang gliding")
display_summary_and_questions(summary_and_questions_text)


In [107]:
def query_index(response, persist_dir="index"):
    """
    Load the persisted index from disk and answer a question with it.

    Parameters:
    - response: The question to ask. The name is kept for backward
      compatibility; it is the query text, not an answer.
    - persist_dir: Directory the index was persisted to. Defaults to "index",
      the directory this notebook persists the index to. Previously this was
      read from an undefined global, which raised NameError on a fresh kernel.

    Returns:
    - The result of running the question through the loaded index's query engine
    """
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    loaded_index = load_index_from_storage(storage_context)

    query_engine = loaded_index.as_query_engine(
        response_mode="tree_summarize", use_async=True
    )
    return query_engine.query(response)

In [109]:
# Write the index to the "index" directory so it can be reloaded from disk
built_index.storage_context.persist(persist_dir="index")

# Ask a question through query_index and show the answer
response_text = "What records have been set in hang gliding?"
query_response = query_index(response_text)
print(query_response)


Records that have been set in hang gliding include distance covered in cross-country flights, duration of flights, altitude gained, and speed achieved. Some specific examples of these records include Dustin B. Martin's world record for straight distance, covering a distance of 764 km (475 mi) in 2012, Judy Leden's altitude record for a balloon-launched hang glider, reaching a height of 11,800 m (38,800 ft) in 1994, and Leden's record for gain of height, climbing to 3,970 m (13,025 ft) in 1992. These are just a few examples of the records that have been achieved in hang gliding.


In [55]:
# Persist the index to the "index" directory
built_index.storage_context.persist("index")

In [56]:
# Rebuild the storage context from the "index" directory and reload the index from it
storage_context = StorageContext.from_defaults(persist_dir="index")
built_index = load_index_from_storage(storage_context)

In [58]:
# Query engine over the reloaded index, using "tree_summarize" mode with async enabled
query_engine = built_index.as_query_engine(
    response_mode="tree_summarize", use_async=True
)

In [86]:
# Ask the query engine about the Rigid Wing Class
response = query_engine.query("Explain me Rigid Wing Class?")

In [87]:
# Show the answer to the previous query
print(response)

The Rigid Wing Class refers to a specific class of hang gliders that have a rigid structure, as opposed to a flexible wing. These hang gliders have a frame made of materials such as aluminum or composite, which provides more stability and control compared to flexible wing hang gliders. The flight of the hang glider in the Rigid Wing Class is controlled by spoilers, which are typically located on top of the wing. The pilot hangs below the wing without any additional fairing. Rigid wing hang gliders offer better performance compared to other classes and are generally more expensive. They have a higher glide ratio and a wider speed range, allowing for faster and more efficient flight. Rigid wing hang gliders are often used by experienced pilots who are looking for higher performance and more advanced flying capabilities.


In [94]:
# Ask the query engine about hang gliding records
response = query_engine.query("What records have been set in hang gliding?")

In [95]:
# Show the answer to the previous query
print(response)

Various records have been set in hang gliding, including the world record for straight distance, altitude gained, and speed achieved. These records can vary depending on the category and type of hang glider used. Some notable records include Dustin B. Martin holding the world record for straight distance with a distance of 764 km (475 mi) in 2012, Judy Leden holding the altitude record for a balloon-launched hang glider at 11,800 m (38,800 ft), and Judy Leden also holding the gain of height record at 3,970 m (13,025 ft). These records are sanctioned by the FAI (Fédération Aéronautique Internationale).


### LLM-Based Retrieval

In [93]:
# Build a retriever over the document summary index. Only the index is passed;
# the commented-out keyword arguments are not supplied, so they stay at
# whatever defaults the class provides.
retriever = DocumentSummaryIndexRetriever(
    built_index,
    # choice_select_prompt=choice_select_prompt,
    # choice_batch_size=choice_batch_size,
    # format_node_batch_fn=format_node_batch_fn,
    # parse_choice_select_answer_fn=parse_choice_select_answer_fn,
    # service_context=service_context
)

In [96]:
# Retrieve the nodes the retriever selects for this question
retrieved_nodes = retriever.retrieve("What records have been set in hang gliding?")

In [97]:
# Show the score and text of the first retrieved node. The retriever may return
# no nodes at all, so guard the index access instead of raising IndexError.
if retrieved_nodes:
    top_node = retrieved_nodes[0]
    print(top_node.score)
    print(top_node.node.get_text())
else:
    print("No nodes were retrieved for this query.")

10.0
Hang gliding is an air sport or recreational activity in which a pilot flies a light, non-motorised foot-launched heavier-than-air aircraft called a hang glider. Most modern hang gliders are made of an aluminium alloy or composite frame covered with synthetic sailcloth to form a wing. Typically the pilot is in a harness suspended from the airframe, and controls the aircraft by shifting body weight in opposition to a control frame.
Early hang gliders had a low lift-to-drag ratio, so pilots were restricted to gliding down small hills. By the 1980s this ratio significantly improved, and since then pilots have been able to soar for hours, gain thousands of feet of altitude in thermal updrafts, perform aerobatics, and glide cross-country for hundreds of kilometers. The Federation Aeronautique Internationale and national airspace governing organisations control some regulatory aspects of hang gliding. Obtaining the safety benefits of being instructed is highly recommended and indeed a m

### With Query Engine

In [99]:
def run_query(query, retriever):
    """
    Answer a query with a RetrieverQueryEngine built around the given retriever.

    Parameters:
    - query: The question to ask
    - retriever: Retriever handed to the query engine

    Returns:
    - The result of the engine's query call
    """
    # A default response synthesizer is created for each call.
    engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=get_response_synthesizer(),
    )
    return engine.query(query)

In [100]:
# Example usage: run the question through the LLM-based retriever built earlier
# (the former `retriever = retriever` self-assignment was a no-op and is removed)
query = "What records have been set in hang gliding?"
response = run_query(query, retriever)
print(response)

Hang gliding has seen various records set in terms of distance, duration, and altitude. These records showcase the progress and achievements in the sport, as well as the impressive capabilities of modern hang gliders.
