In [46]:
from llama_index.readers.file import PDFReader
loader = PDFReader()
docs = loader.load("national_ai_rd_strategic_plan.pdf")

AttributeError: 'PDFReader' object has no attribute 'load'

In [6]:
from llama_index.core import Document
doc_text = "\n\n".join([d.get_content() for d in docs])
docs = [Document(text=doc_text)]

### creating heirarchies

In [7]:
from llama_index.core.node_parser import HierarchicalNodeParser,get_leaf_nodes

node_parser = HierarchicalNodeParser.from_defaults()

nodes = node_parser.get_nodes_from_documents(docs)
leaf_nodes = get_leaf_nodes(nodes)

nodes_by_id = {node.node_id: node for node in nodes}

### creating datset for elastic search

In [8]:
parent_ids_list = []
for i in range(0,len(leaf_nodes)):
    parent_ids_list.append(leaf_nodes[i].parent_node.node_id)



In [9]:
child_ids_list = []
for i in range(0,len(leaf_nodes)):
    child_ids_list.append(leaf_nodes[i].node_id)


In [10]:
child_contexts_list = []
for i in range(0,len(leaf_nodes)):
    child_contexts_list.append(leaf_nodes[i].text)


In [60]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')



In [12]:
import pandas as pd

df = pd.DataFrame({'parent_id':parent_ids_list,'child_id':child_ids_list,'child_context':child_contexts_list})

df["vectors"] = df["child_context"].apply(lambda x: model.encode(x))


In [13]:
df

Unnamed: 0,parent_id,child_id,child_context,vectors
0,092765e9-ea0a-4e58-912f-9fd518e175f9,50eb8e33-1e4e-4cf6-bb75-dcff016a49b3,October 2016 \n \n \n \n \n \n \n \n \n \nTHE ...,"[0.017220804, -0.0021966165, -0.05922687, -0.0..."
1,092765e9-ea0a-4e58-912f-9fd518e175f9,6a51ec6d-8fe7-407b-b9c9-60007ceadcde,One of the NSTC’s primary objectives is establ...,"[0.01857824, 0.023195736, -0.027405323, -0.006..."
2,092765e9-ea0a-4e58-912f-9fd518e175f9,6b44ffd8-ecc4-4a59-bd3a-7505ba8308c4,More information is available at www.whitehous...,"[-0.019053286, -0.00016339646, -0.004697498, -..."
3,092765e9-ea0a-4e58-912f-9fd518e175f9,ab82f499-eaec-4a01-aab3-72d823c36836,"The mission of OSTP is threefold; first, to \n...","[0.015104515, 0.025904894, -0.0059961327, -0.0..."
4,092765e9-ea0a-4e58-912f-9fd518e175f9,0ac890de-3177-40d5-93d2-9b0c5d248040,More information is \navailable at www.whiteho...,"[0.002918094, 0.02579336, -0.032515164, -0.006..."
...,...,...,...,...
294,28de23eb-2ffb-4cb0-895e-89f53247599f,9becf92f-478b-4b19-9c96-1127f878c76e,‬\n‬\n \n3-D \nThree Dimensional \nAI \nArtifi...,"[0.002823744, 0.015269775, -0.057292424, 0.017..."
295,28de23eb-2ffb-4cb0-895e-89f53247599f,cd202aa7-7476-4173-a5f4-91bc2f10d6dc,to Meaningfully Promote Excellence in \nTechno...,"[0.029960934, 0.010378938, -0.035866287, -0.03..."
296,8c9c1cb1-5241-4e53-9d7a-c87cd9a65c15,92af882a-466c-4f47-b9c6-98c35a603a16,Face Recognition Vendor Test \nGPS \nGlobal Po...,"[0.03137396, -0.011448868, -0.05008454, -0.007..."
297,8c9c1cb1-5241-4e53-9d7a-c87cd9a65c15,0e4ec31c-68bd-4551-be99-46965f94e193,Skills and Abilities \nLAX \nLos Angeles World...,"[0.07035522, 0.08283257, -0.018431637, -0.0102..."


### adding up data in elastic search

In [14]:
indexMapping = {
    "properties":{
        "parent_id":{
            "type":"text"
        },
        "child_id":{
            "type":"text"
        },
        "child_context":{
            "type":"text"
        },
        "vectors":{
            "type":"dense_vector",
            "dims":768,
            "index":True,
            "similarity":"l2_norm"
        }
    }
}

In [15]:
from elasticsearch import Elasticsearch

es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic","=PGMBBMeamIbyzgpOeTB"),
    ca_certs="C:/Users/VANSH KHANEJA/PROJECTS/superteams_projects/ELASTIC SEARCH RAG/elasticsearch-8.14.1/config/certs/http_ca.crt"
)
es.ping()

True

In [16]:
es.ping()

True

In [17]:
es.indices.create(index="ai",mappings= indexMapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'ai'})

In [18]:
record_list = df.to_dict("records")
record_list

[{'parent_id': '092765e9-ea0a-4e58-912f-9fd518e175f9',
  'child_id': '50eb8e33-1e4e-4cf6-bb75-dcff016a49b3',
  'child_context': 'October 2016 \n \n \n \n \n \n \n \n \n \nTHE NATIONAL \nARTIFICIAL INTELLIGENCE \nRESEARCH AND DEVELOPMENT \nSTRATEGIC PLAN \nNational Science and Technology Council \n \nNetworking and Information Technology \nResearch and Development Subcommittee \n \n\n\n \nii \n \n \n\n\n \n \n \n \niii \nAbout the National Science and Technology Council \nThe National Science and Technology Council (NSTC) is the principal means by which the Executive \nBranch coordinates science and technology policy across the diverse entities that make up the Federal \nresearch and development (R&D) enterprise.',
  'vectors': array([ 1.72208045e-02, -2.19661649e-03, -5.92268705e-02, -2.63484474e-02,
         -4.14467379e-02, -8.87932908e-03,  8.30675568e-03,  8.71860946e-04,
         -3.89413745e-03,  1.32682743e-02,  3.57214659e-02, -6.30367221e-03,
          2.60035004e-02,  5.39653

In [19]:
for record in record_list:
    try:
        es.index(index="ai",document=record)
    except Exception as e:
        print(e)

In [20]:
es.count(index="ai")

ObjectApiResponse({'count': 299, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

### fetching reults 

In [90]:
test_query = "What were three waves of AI ?"
def find_matching_parent_ids(input_query):
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('all-mpnet-base-v2')
    vector_of_query = model.encode(input_query)
    query = {
            "field":"vectors",
            "query_vector":vector_of_query,
            "k":25,
            "num_candidates":299,
        }
    results = es.knn_search(index="ai",
                            knn=query,
                            source=["child_context","parent_id"]
                            )
    fetched_ids = []


    for i in results["hits"]["hits"]:
        fetched_ids.append(i['_source']['parent_id'])
    return fetched_ids


fetched_parent_ids = find_matching_parent_ids(test_query)
fetched_parent_ids


['73bb5c7d-221b-4b9a-9794-35f40b64634b',
 '8a438506-babc-4c39-b0d1-c2ce839b4616',
 'b377b083-8557-41b8-bd45-8735c9a7c439',
 'b377b083-8557-41b8-bd45-8735c9a7c439',
 '8a438506-babc-4c39-b0d1-c2ce839b4616',
 '55b1d850-5b7b-4b0e-834a-bd3a074519c2',
 '55b1d850-5b7b-4b0e-834a-bd3a074519c2',
 '55b1d850-5b7b-4b0e-834a-bd3a074519c2',
 '74f53ff8-7d1b-44fc-b23c-7553839856fa',
 '28de23eb-2ffb-4cb0-895e-89f53247599f',
 '6e8e73bf-2d43-4074-8b75-071997551771',
 'abccd3fc-9723-4932-9233-222e11b5c80c',
 '0d2397b1-a0b3-4193-8061-a5434412a6d0',
 '83db6573-7bd0-46d1-9189-9589007b8aaa',
 'c91508c1-7e2c-4957-a3a1-38854c67dee8',
 '2ef3eebb-dfba-412e-a3af-11df658c6777',
 '17e1774f-ea31-47a3-afa7-c1ed06a75c4c',
 'bffaafb3-081e-4d26-a39f-b28f6e8e68b4',
 '4858799f-5610-461a-bcc7-ca3b38a0ff45',
 '28de23eb-2ffb-4cb0-895e-89f53247599f',
 '71efca07-a61c-4a2b-8114-ef819b603cd7',
 '73bb5c7d-221b-4b9a-9794-35f40b64634b',
 'e85456d8-7f10-4486-89b4-f3c9185b1d49',
 'b48e504c-3d5f-4a51-b64d-d4d34f583135',
 '8a438506-babc-

### finding the most common chunks

In [91]:
def most_frequent_parent_ids(list_of_id):
    frequency_dict = {}
    threshold = 5
    for element in list_of_id:
        if element in frequency_dict:
            frequency_dict[element] += 1
        else:
            frequency_dict[element] = 1


    sorted_elements = sorted(frequency_dict.items(), key=lambda item: item[1], reverse=True)
    most_common_ids = []
    for i in range(0,threshold):
        most_common_ids.append(sorted_elements[i][0])
    return most_common_ids


most_common_ids=most_frequent_parent_ids(fetched_parent_ids)
most_common_ids  

['8a438506-babc-4c39-b0d1-c2ce839b4616',
 '55b1d850-5b7b-4b0e-834a-bd3a074519c2',
 '73bb5c7d-221b-4b9a-9794-35f40b64634b',
 'b377b083-8557-41b8-bd45-8735c9a7c439',
 '28de23eb-2ffb-4cb0-895e-89f53247599f']

### GETTING THEIR PARENT context

In [92]:
parent_context_list = []

for y in most_common_ids:
    for i in range(0,len(leaf_nodes)):
        if(y== leaf_nodes[i].parent_node.node_id):
            parent_context_list.append(nodes_by_id[leaf_nodes[i].parent_node.node_id].text)
            break
    

### performing reranking over it

In [107]:
from sentence_transformers import CrossEncoder
rankmodel = CrossEncoder("jinaai/jina-reranker-v1-tiny-en")

query = test_query
results = rankmodel.rank(query, parent_context_list, return_documents=True, top_k=5)

In [94]:
reranked_list = []
for i in range(0,5):
    reranked_list.append(results[i]['corpus_id'])

### collecting the most similar data

In [95]:
context = ""

for i in reranked_list[0:3]:
    context+=parent_context_list[i]+"\n\n\n\n\n\n"


In [96]:
print(context)

In [97]:
from langchain_groq import ChatGroq

llm = ChatGroq(temperature=0, model_name="llama3-70b-8192",groq_api_key="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")

### prompting

In [98]:
from langchain import PromptTemplate
from langchain import LLMChain

prompt_template = PromptTemplate(
    template="These are few Context: {context} for this question Question: {question} base on this context genrate a relevant concise Answer from thi context:",
    input_variables=["context", "question"]
)

In [99]:
llm_chain = LLMChain(llm=llm, prompt=prompt_template)


In [100]:
def generate_answer(context, question):
    input_data = {
        "context": context,
        "question": question
    }
    answer = llm_chain(input_data)
    return answer


## output

In [101]:
from rich import print
print("Your Question:  \n"+test_query+"\n\n"+"Bot Reply:  \n"+generate_answer(context, test_query)['text'])

^C


In [110]:

while True:
    test_query = input("\n\n\n\nEnter your query: ")
    if test_query == "":
      break
    fetched_parent_ids = find_matching_parent_ids(test_query)
    most_common_ids=most_frequent_parent_ids(fetched_parent_ids)
    parent_context_list = []
    for y in most_common_ids:
        for i in range(0,len(leaf_nodes)):
            if(y== leaf_nodes[i].parent_node.node_id):
                parent_context_list.append(nodes_by_id[leaf_nodes[i].parent_node.node_id].text)
                break
    
    query = test_query
    results = rankmodel.rank(query, parent_context_list, return_documents=True, top_k=5)

    reranked_list = []
    for i in range(0,5):
        reranked_list.append(results[i]['corpus_id'])

    context = ""

    for i in reranked_list[0:3]:
        context+=parent_context_list[i]+"\n\n"

    print("Your Question:  \n"+test_query+"\n\n"+"Bot Reply:  \n"+generate_answer(context, test_query)['text'])





Enter your query:  How can AI be made scalable






Enter your query:  What were three waves of AI






Enter your query:  
