In [None]:
! pip install llama-index

In [1]:
import nest_asyncio

# Jupyter already runs an asyncio event loop; nest_asyncio patches asyncio so
# that libraries which start their own loop can run inside it.
# NOTE(review): presumably needed by llama_index async code paths - confirm it is still required.
nest_asyncio.apply()

import os
import keys

from llama_index import ServiceContext
from llama_index.llms import AzureOpenAI
from llama_index.schema import MetadataMode

from llama_index.embeddings import OpenAIEmbedding

In [2]:
# Connection settings shared by the chat model and the embedding model:
# both are reached through the same endpoint, key and API version.
azure_openai_kwargs = dict(
    api_base="https://raid-ses-openai.openai.azure.com/",
    api_key=keys.gpt_key,
    api_type="azure",
    api_version="2023-05-15",
)

# Chat/completion model (engine "raidGPT", model gpt-4, temperature 0).
# `engine` is presumably the Azure deployment name - TODO confirm.
llm = AzureOpenAI(
    engine="raidGPT",
    model="gpt-4",
    temperature=0.0,
    **azure_openai_kwargs,
)

# Embedding model (engine "swiftfaq-ada002", model text-embedding-ada-002).
# NOTE(review): `temperature` is passed through unchanged from the original
# call; it looks meaningless for an embedding model - confirm it is accepted.
emb_llm = OpenAIEmbedding(
    engine="swiftfaq-ada002",
    model="text-embedding-ada-002",
    temperature=0.0,
    **azure_openai_kwargs,
)

## Experimenting with custom text splitter

In [3]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    EntityExtractor,
    MetadataFeatureExtractor,
)
from llama_index.text_splitter import TokenTextSplitter

# Split on single spaces into chunks of size 512 with an overlap of 128
# (sizes are presumably counted in tokens, going by the splitter's name).
text_splitter = TokenTextSplitter(
    chunk_size=512,
    chunk_overlap=128,
    separator=" ",
)


class CustomExtractor(MetadataFeatureExtractor):
    """Merge each node's title and keywords into a single ``custom`` metadata entry."""

    def extract(self, nodes):
        """Return one ``{"custom": ...}`` dict per node, in input order.

        The value is the node's ``document_title`` metadata, a newline, then
        its ``excerpt_keywords`` metadata. Both keys must already be present
        on every node; a missing key raises ``KeyError``.
        """
        combined_metadata = []
        for node in nodes:
            node_meta = node.metadata
            title_and_keywords = (
                node_meta["document_title"] + "\n" + node_meta["excerpt_keywords"]
            )
            combined_metadata.append({"custom": title_and_keywords})
        return combined_metadata


# Extractors that enrich every node with LLM-generated metadata. The
# commented-out entries are alternatives tried during experimentation.
metadata_extractor = MetadataExtractor(
    extractors=[
        TitleExtractor(nodes=5, llm=llm),
        # EntityExtractor(prediction_threshold=0.5),
        SummaryExtractor(summaries=["prev", "self"], llm=llm),
        # KeywordExtractor(keywords=10, llm=llm),
        # CustomExtractor()
    ],
)

# Metadata extraction is opt-in: the saved run in this notebook shows
# "Extracting summaries" taking over five minutes for 31 nodes.
# The saved node metadata output (document_title, prev_section_summary,
# section_summary) was produced with extraction enabled; set this flag to
# True to reproduce it. With False the parser is built exactly as before,
# i.e. with the text splitter only.
USE_METADATA_EXTRACTOR = False

node_parser_kwargs = {"text_splitter": text_splitter}
if USE_METADATA_EXTRACTOR:
    node_parser_kwargs["metadata_extractor"] = metadata_extractor

node_parser = SimpleNodeParser.from_defaults(**node_parser_kwargs)

In [4]:
from llama_index import SimpleDirectoryReader

# Read every file found under ../data/124 into llama_index documents.
directory_reader = SimpleDirectoryReader('../data/124')
documents = directory_reader.load_data()

In [14]:
# Only documents 10..29 are parsed (presumably to keep the run short - confirm
# before relying on this slice).
sample_documents = documents[10:30]
nodes = node_parser.get_nodes_from_documents(sample_documents)

Extracting summaries: 100%|██████████| 31/31 [05:23<00:00, 10.45s/it]


In [20]:
# Show the metadata dict attached to the third parsed node.
nodes[2].metadata

{'page_label': '12',
 'file_name': 'AP3456 Vol 12 Helicopters.pdf',
 'document_title': 'Understanding the Principles and Techniques of Helicopter Movement and Hovering in Various Environments',
 'prev_section_summary': 'The section is from a document titled "Understanding the Principles and Techniques of Helicopter Movement and Hovering in Various Environments". It includes various figures explaining different aspects of helicopter movement and hovering. Key topics include the effect of long grass on recirculation, recirculation near a building, producing horizontal movement, flapping to equality, control orbit, pitch operating arm movement, relationship of blade position to control orbit position, high and low blade positions, advance angle, dragging hinge, variation in radius of blade CG resulting from flapping, and Hooke’s Joint Effect.',
 'section_summary': "The section discusses the principles of helicopter hovering and horizontal movement, specifically focusing on the take-off an

## Integration with Azure Cognitive Search

In [198]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

from llama_index.vector_stores.cogsearch import (
    IndexManagement,
    CognitiveSearchVectorStore,
)

from llama_index import (
    LangchainEmbedding,
    SimpleDirectoryReader,
    StorageContext,
    ServiceContext,
    VectorStoreIndex,
)

# Azure Cognitive Search connection settings.
service_endpoint = "https://rsaf-cognitive-search-service.search.windows.net"
# NOTE(review): the query cell further down passes index_name="llamaindex-demo"
# as a literal instead of using this variable - confirm which index is intended.
index_name = "rsaf-cognitive-search"
# The API key is read from the local `keys` module rather than written into the notebook.
key = keys.cognitive_key
credential = AzureKeyCredential(key)

## Creating our own nodes from Azure output

In [158]:
from llama_index.query_engine import CustomQueryEngine
from llama_index.retrievers import BaseRetriever
from llama_index.response_synthesizers import get_response_synthesizer, BaseSynthesizer
from llama_index.schema import Node, NodeWithScore

In [None]:
from typing import List

from llama_index import QueryBundle
from llama_index.chat_engine import CondenseQuestionChatEngine, ContextChatEngine

# This cell used to be marked as broken. Problems found and fixed here:
#   * `QueryBundle` and `List` appear in the `_retrieve` signature but were
#     never imported, so evaluating the class body raised NameError.
#   * `ContextChatEngine.from_defaults` received the `CustomRetriever` class
#     itself instead of an instance.
#   * `service_context` was only created in a later cell, so it was undefined
#     when running the notebook top to bottom.
#   * `__init__` did not call the base-class initialiser.
#   * The long text literal was split across two physical lines inside a
#     single-quoted string.
# NOTE(review): not re-run against the live Azure OpenAI deployment - confirm.


class CustomRetriever(BaseRetriever):
    """Stub retriever that ignores the query and returns two hard-coded nodes.

    Useful for exercising a chat engine without a real index behind it.
    """

    def __init__(
        self,
    ) -> None:
        """Initialise the base retriever; this stub takes no parameters."""
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Return the two fixed nodes with fixed scores, whatever the query is."""

        # The two literals below are adjacent string literals that Python joins
        # with nothing in between.
        # NOTE(review): the original literal was broken across two lines at
        # this join - confirm no separator character was lost there.
        doc1 = Node(
            text=(
                'OFFICIAL (CLOSED)   \n13-1 \nOFFICIAL (CLOSED)   PAR T B \n \nCHAPTER 13  \n \nADVANCED TRANSITIONS  \n \nINTRODUCTION  \n \n13.1 Helicopter operations involve landing and taking off from confined \nspaces, high altitudes or at a high AUW. In all of these conditions, it may not be \nfeasible to employ the basic transition technique due  to limitations in the engine \npower available or availability of forward distance. In such scenarios, advanced \ntransition techniques are employed.  \n \n13.2 Objective . The objectives are; (1) determine the power margin available \nthrough hover and forward flight po wer checks, (2) select the appropriate \nadvanced transition technique, (3) conduct a OGE, IGE, cushion creep and \nrunning take off transition to the climb and (4) conduct a transition from forward \nflight to an OGE hover, zero speed and running landing.  \n \n \nPOWER DEMANDS IN A HELICOPTER  \n \n13.3 Prior to  learning advanced transition techniques, an understand ing of  \npower demands in a helicopter  will need to be established:  \n \na. Rotor profile power . This is the power required to drive the main \nrotors at minimum pitch at a c onstant NR in addition to the power required \nto drive the tail rotor and other ancillary equipment. Generally, the rotor \nprofile power remains at about the same value throughout the speed range \nwith minor increases in demand at higher airspeeds.  \n \nb. Induced p ower . To increase rotor thrust for hover or forward \nmotion, the blade pitch must be increased leading to an increase in rotor \ndrag. As such, to maintain NR more power must be produced to overcome \nthe rising drag of the blades. This increase in power is kno wn as induced \npower as it is the power required to overcome the rise in drag  when the \nblades induce air to flow through the rotor. However, induced flow \ndiminishes with forward speed and therefore the induced power.  \n \nc. Parasite power . '
                'Parasite drag, due to t he airframe, increases as \nforward airspeed increases. As such, the disc would need to be tilted \nfurther forward to increase the horizontal component of TRT leading and'
            )
        )
        doc2 = Node(text='power in a helicopter is extremely important')

        return [
            NodeWithScore(node=doc1, score=0.2),
            NodeWithScore(node=doc2, score=10.0),
        ]


# Built here (same arguments as the query cell further down) so this cell does
# not depend on a later cell having been run first.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=emb_llm)

chat_engine = ContextChatEngine.from_defaults(
    retriever=CustomRetriever(),  # an instance, not the class
    verbose=True,
    service_context=service_context,
)

chat_engine.chat("What are advanced transitions?")

### API-friendly version

### This creates a new LlamaIndex-compatible Azure Cognitive Search index

In [197]:
def create_azure_index(index_client, index_name: str, metadata_fields: dict, llm, emb_llm, filepath) -> str:
    """
    Load the documents under ``filepath`` and index them into Azure Cognitive Search.

    The vector store is opened with ``IndexManagement.CREATE_IF_NOT_EXISTS``.
    NOTE(review): an earlier version of this docstring warned that an existing
    index of the same name is deleted; nothing in this function deletes an
    index, and the mode name indicates an existing index is reused - confirm
    against the vector store documentation before relying on either behaviour.

    Args:
        index_client: Azure client passed to the vector store as
            ``search_or_index_client``.
        index_name: Name of the Azure Cognitive Search index to write to.
        metadata_fields: Passed as ``filterable_metadata_field_keys``.
        llm: Language model for the service context.
        emb_llm: Embedding model for the service context.
        filepath: Directory read with ``SimpleDirectoryReader``.

    Returns:
        The string ``"<index_name> created"``.
    """
    vector_store = CognitiveSearchVectorStore(
        search_or_index_client=index_client,
        index_name=index_name,
        filterable_metadata_field_keys=metadata_fields,
        index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
        id_field_key="id",
        chunk_field_key="content",
        embedding_field_key="content_vector",
        metadata_string_field_key="metadata",
        doc_id_field_key="doc_id",
    )

    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    service_context = ServiceContext.from_defaults(llm=llm, embed_model=emb_llm)

    documents = SimpleDirectoryReader(filepath).load_data()

    # Called for its side effect of populating the vector store; the returned
    # index object is not needed by callers of this function.
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        service_context=service_context,
    )

    return f"{index_name} created"

In [None]:
# `create_azure_index` has six required parameters; the previous bare call
# `create_azure_index()` could only raise TypeError.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

create_azure_index(
    index_client=index_client,
    # Same index name that the query cells below search against.
    index_name="llamaindex-demo",
    # NOTE(review): these are the metadata keys visible on the parsed nodes
    # earlier in this notebook; adjust to the fields you need to filter on.
    metadata_fields={"page_label": "page_label", "file_name": "file_name"},
    llm=llm,
    emb_llm=emb_llm,
    # Same directory that was loaded earlier in this notebook.
    filepath="../data/124",
)

### This allows you to query that index

In [200]:
# Create a custom class that calls on search client
class AzureQueryEngine(CustomQueryEngine):
    """Query engine that answers from Azure Cognitive Search results."""

    search_client: SearchClient
    response_synthesizer: BaseSynthesizer

    def custom_query(self, query_str: str):
        """Search for ``query_str`` and synthesize a response from the top 3 hits.

        Each hit's ``content`` field becomes a node and its ``@search.score``
        becomes the node score.
        """
        hits = self.search_client.search(search_text=query_str, top=3)

        scored_nodes = [
            NodeWithScore(node=Node(text=hit["content"]), score=hit['@search.score'])
            for hit in hits
        ]

        return self.response_synthesizer.synthesize(query_str, scored_nodes)

In [203]:
# LLM + embedding configuration handed to the response synthesizer.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=emb_llm)

# Search client bound to the "llamaindex-demo" index.
search_client = SearchClient(
    endpoint=service_endpoint,
    index_name="llamaindex-demo",
    credential=credential,
)

synthesizer = get_response_synthesizer(
    response_mode="compact",
    service_context=service_context,
)
query_engine = AzureQueryEngine(
    search_client=search_client,
    response_synthesizer=synthesizer,
)

response = query_engine.query("What are advanced transitions?")

print(response)

ServiceRequestError: <urllib3.connection.HTTPSConnection object at 0x0000026FBE1A3C10>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed

In [202]:
# Create a custom class that calls on search client
from typing import Optional


class AzureQueryEngine(CustomQueryEngine):
    """Query engine that answers from Azure Cognitive Search results.

    NOTE(review): this redefines the ``AzureQueryEngine`` class declared in an
    earlier cell and shadows it; consider keeping only one definition.
    """

    # Restored as an optional field so that a client passed to the constructor
    # is actually used instead of being silently dropped.
    search_client: Optional[SearchClient] = None
    response_synthesizer: BaseSynthesizer

    def custom_query(self, query_str: str):
        """Search for ``query_str`` and synthesize a response from the top 3 hits.

        Each hit's ``content`` field becomes a node and its ``@search.score``
        becomes the node score.
        """
        client = self.search_client
        if client is None:
            # Backward-compatible fallback: the previous version of this class
            # always read the notebook-level `search_client` variable.
            client = search_client

        hits = client.search(search_text=query_str, top=3)

        scored_nodes = [
            NodeWithScore(node=Node(text=hit["content"]), score=hit['@search.score'])
            for hit in hits
        ]

        return self.response_synthesizer.synthesize(query_str, scored_nodes)