In [4]:
from llama_index.readers.file import PyMuPDFReader

loader = PyMuPDFReader()
# docs0 = loader.load_data(file=Path("./data/llama2.pdf"))
docs = loader.load(file_path="Chapter-2-Introduction-to-the-Indian-FS-and-Markets.pdf")

In [5]:
from llama_index.core import Document
doc_text = "\n\n".join([d.get_content() for d in docs])
docs = [Document(text=doc_text)]

### creating heirarchies

In [6]:
from llama_index.core.node_parser import HierarchicalNodeParser,get_leaf_nodes

node_parser = HierarchicalNodeParser.from_defaults()

nodes = node_parser.get_nodes_from_documents(docs)
leaf_nodes = get_leaf_nodes(nodes)

nodes_by_id = {node.node_id: node for node in nodes}

### creating datset for elastic search

In [7]:
parent_ids_list = []
for i in range(0,len(leaf_nodes)):
    parent_ids_list.append(leaf_nodes[i].parent_node.node_id)



In [8]:
child_ids_list = []
for i in range(0,len(leaf_nodes)):
    child_ids_list.append(leaf_nodes[i].node_id)


In [9]:
child_contexts_list = []
for i in range(0,len(leaf_nodes)):
    child_contexts_list.append(leaf_nodes[i].text)


In [10]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')



In [11]:
import pandas as pd

df = pd.DataFrame({'parent_id':parent_ids_list,'child_id':child_ids_list,'child_context':child_contexts_list})

df["vectors"] = df["child_context"].apply(lambda x: model.encode(x))


In [12]:
df

Unnamed: 0,parent_id,child_id,child_context,vectors
0,48f0d003-0870-40b3-90eb-b76476c5de8a,320c93b9-caed-438f-a0df-e32d69b11e2a,INTRODUCTION TO THE INDIAN FINANCIAL SYSTEM AN...,"[-0.00020913576, -0.056976017, -0.036726363, -..."
1,48f0d003-0870-40b3-90eb-b76476c5de8a,0eb66f96-4f64-449d-9158-858cd8b41a7f,Indian Financial System \n263 \nFinancial Inst...,"[-0.009912057, -0.022014251, -0.03827713, -0.0..."
2,48f0d003-0870-40b3-90eb-b76476c5de8a,e0a780a5-a329-493a-8725-d15468ee3680,markets \n \nDescribe the structure and funct...,"[-0.009698601, -0.0136891855, -0.036911573, -0..."
3,48f0d003-0870-40b3-90eb-b76476c5de8a,3d5f5291-8d1b-429e-8686-21cee1b0b209,instruments for hedging foreign exchange \n \...,"[0.008545979, -0.038368683, -0.017907752, -0.0..."
4,48f0d003-0870-40b3-90eb-b76476c5de8a,4d831538-715e-4fed-a0d1-b2691b3471b5,This book or any portion thereof shall not be ...,"[0.02008367, -0.08584135, -0.011335132, -0.049..."
...,...,...,...,...
416,f6972d84-804c-4d54-ae6b-b6aee7ec50d2,ed5103ea-6254-499d-ae30-79f2b907ca76,Being a nation-wide commodity exchange having ...,"[0.018725704, -0.042945366, -0.036561977, -0.0..."
417,d45188f8-fb96-4b2b-9c17-ae4b624c7e2f,74277ea8-63b5-4649-8763-02d7ec0e3f70,This was followed by establishment of futures ...,"[-0.0074731037, -0.0023180048, -0.01791432, -0..."
418,d45188f8-fb96-4b2b-9c17-ae4b624c7e2f,d6c4426c-9668-4e1a-8bc8-f8accaa50fd3,This included the ability to restrict or ban t...,"[-0.011707778, 0.027524605, -0.010431205, -0.0..."
419,d45188f8-fb96-4b2b-9c17-ae4b624c7e2f,7f03a512-9d2b-48e1-9e99-e40c5ce66a20,The apprehensions about the role of speculatio...,"[0.0010791881, 0.0483737, -0.012705713, -0.054..."


### adding up data in elastic search

In [13]:
indexMapping = {
    "properties":{
        "parent_id":{
            "type":"text"
        },
        "child_id":{
            "type":"text"
        },
        "child_context":{
            "type":"text"
        },
        "vectors":{
            "type":"dense_vector",
            "dims":768,
            "index":True,
            "similarity":"l2_norm"
        }
    }
}

In [14]:
from elasticsearch import Elasticsearch

es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic","=PGMBBMeamIbyzgpOeTB"),
    ca_certs="C:/Users/VANSH KHANEJA/PROJECTS/superteams_projects/ELASTIC SEARCH RAG/elasticsearch-8.14.1/config/certs/http_ca.crt"
)
es.ping()

True

In [15]:
es.ping()

True

In [16]:
es.indices.create(index="finance",mappings= indexMapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'finance'})

In [17]:
record_list = df.to_dict("records")
record_list

[{'parent_id': '48f0d003-0870-40b3-90eb-b76476c5de8a',
  'child_id': '320c93b9-caed-438f-a0df-e32d69b11e2a',
  'child_context': 'INTRODUCTION TO THE INDIAN FINANCIAL SYSTEM AND MARKETS \nUNIT - II \nCopyright© 2021 All rights reserved. This book or any portion thereof shall not be reproduced or used in any manner \nwhatsoever without the express written permission of the AAFM INDIATM \n261 \n \nCHAPTER - 2 \n2 \n INTRODUCTION TO THE  \nINDIAN FINANCIAL SYSTEM  \nAND MARKETS  \n \nCONTENTS \nPAGE NO.',
  'vectors': array([-2.09135760e-04, -5.69760166e-02, -3.67263630e-02, -6.13701791e-02,
          1.75237923e-03,  2.74826828e-02,  5.45985252e-02,  1.89476535e-02,
          2.99802255e-02,  2.90676970e-02,  1.12640541e-02,  1.85266435e-02,
          2.26066969e-02,  7.90786147e-02,  1.67028699e-02,  6.03851071e-03,
          2.68893968e-02, -4.61107679e-03, -2.54053790e-02,  2.69206222e-02,
         -2.49871635e-04, -3.07394657e-02, -8.03698227e-03,  2.71994788e-02,
         -2.57103727

In [18]:
for record in record_list:
    try:
        es.index(index="finance",document=record)
    except Exception as e:
        print(e)

In [19]:
es.count(index="finance")

ObjectApiResponse({'count': 421, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

### fetching reults 

In [34]:
test_query = "What are the Features of Capital Markets ?"
def find_matching_parent_ids(input_query):
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('all-mpnet-base-v2')
    vector_of_query = model.encode(input_query)
    query = {
            "field":"vectors",
            "query_vector":vector_of_query,
            "k":25,
            "num_candidates":299,
        }
    results = es.knn_search(index="finance",
                            knn=query,
                            source=["child_context","parent_id"]
                            )
    fetched_ids = []


    for i in results["hits"]["hits"]:
        fetched_ids.append(i['_source']['parent_id'])
    return fetched_ids


fetched_parent_ids = find_matching_parent_ids(test_query)
fetched_parent_ids


  results = es.knn_search(index="finance",


['c4b6b697-2f7e-46b1-be33-d2b97ff21e5d',
 'c4b6b697-2f7e-46b1-be33-d2b97ff21e5d',
 '6ef54b73-679a-4e35-803e-e544a6cf77df',
 '8f9b8d73-1589-4781-b054-f9aa6bf07c74',
 '2187672f-8af9-40f3-867d-57d23c34467d',
 '6b1aeed2-d56c-41f3-b345-80b0156e7d2a',
 '6ef54b73-679a-4e35-803e-e544a6cf77df',
 '2187672f-8af9-40f3-867d-57d23c34467d',
 '8b3ee900-29f6-45c4-9021-52d982ab9d97',
 '541310de-dce3-4479-821b-5d86d3fa4c9b',
 '2187672f-8af9-40f3-867d-57d23c34467d',
 '9c6511de-9981-4bc0-b681-a185b3ec200f',
 '48f0d003-0870-40b3-90eb-b76476c5de8a',
 '6ef54b73-679a-4e35-803e-e544a6cf77df',
 '8f9b8d73-1589-4781-b054-f9aa6bf07c74',
 '6b1aeed2-d56c-41f3-b345-80b0156e7d2a',
 '2187672f-8af9-40f3-867d-57d23c34467d',
 '8f9b8d73-1589-4781-b054-f9aa6bf07c74',
 '48f0d003-0870-40b3-90eb-b76476c5de8a',
 '8f9b8d73-1589-4781-b054-f9aa6bf07c74',
 '6497c330-97a9-419e-b623-6498b1509440',
 '48f0d003-0870-40b3-90eb-b76476c5de8a',
 '067bcfe9-c7f1-4452-bd1c-b7a23285187c',
 '09a5f499-a6b7-48e7-944a-10ff9c61cfbe',
 '6ef54b73-679a-

### finding the most common chunks

In [35]:
def most_frequent_parent_ids(list_of_id):
    frequency_dict = {}
    threshold = 5
    for element in list_of_id:
        if element in frequency_dict:
            frequency_dict[element] += 1
        else:
            frequency_dict[element] = 1


    sorted_elements = sorted(frequency_dict.items(), key=lambda item: item[1], reverse=True)
    most_common_ids = []
    for i in range(0,threshold):
        most_common_ids.append(sorted_elements[i][0])
    return most_common_ids


most_common_ids=most_frequent_parent_ids(fetched_parent_ids)
most_common_ids  

['6ef54b73-679a-4e35-803e-e544a6cf77df',
 '8f9b8d73-1589-4781-b054-f9aa6bf07c74',
 '2187672f-8af9-40f3-867d-57d23c34467d',
 '48f0d003-0870-40b3-90eb-b76476c5de8a',
 'c4b6b697-2f7e-46b1-be33-d2b97ff21e5d']

### GETTING THEIR PARENT context

In [36]:
parent_context_list = []

for y in most_common_ids:
    for i in range(0,len(leaf_nodes)):
        if(y== leaf_nodes[i].parent_node.node_id):
            parent_context_list.append(nodes_by_id[leaf_nodes[i].parent_node.node_id].text)
            break
    

### performing reranking over it

In [37]:
from sentence_transformers import CrossEncoder
rankmodel = CrossEncoder("jinaai/jina-reranker-v1-tiny-en")

query = test_query
results = rankmodel.rank(query, parent_context_list, return_documents=True, top_k=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at jinaai/jina-reranker-v1-tiny-en and are newly initialized: ['bert.embeddings.position_embeddings.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.1.intermediate.dense.bias', 'bert.encoder.layer.1.intermediate.dense.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.dense.bias', 'bert.encoder.layer.1.output.dense.weight', 'bert.encoder.layer.2.intermediate.dense.bias', 'bert.encoder.layer.2.intermediate.dense.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.dense.bias', 'bert.en

In [38]:
reranked_list = []
for i in range(0,5):
    reranked_list.append(results[i]['corpus_id'])

### collecting the most similar data

In [39]:
context = ""

for i in reranked_list[0:3]:
    context+=parent_context_list[i]+"\n\n\n\n\n\n"


In [40]:
print(context)

In [41]:
from langchain_groq import ChatGroq

llm = ChatGroq(temperature=0, model_name="llama3-70b-8192",groq_api_key="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")

### prompting

In [42]:
from langchain import PromptTemplate
from langchain import LLMChain

prompt_template = PromptTemplate(
    template="These are few Context: {context} for this question Question: {question} base on this context genrate a relevant concise Answer from thi context:",
    input_variables=["context", "question"]
)

In [43]:
llm_chain = LLMChain(llm=llm, prompt=prompt_template)


In [44]:
def generate_answer(context, question):
    input_data = {
        "context": context,
        "question": question
    }
    answer = llm_chain(input_data)
    return answer


## output

In [45]:
from rich import print
print("Your Question:  \n"+test_query+"\n\n"+"Bot Reply:  \n"+generate_answer(context, test_query)['text'])

In [33]:

while True:
    test_query = input("\n\n\n\nEnter your query: ")
    if test_query == "":
      break
    fetched_parent_ids = find_matching_parent_ids(test_query)
    most_common_ids=most_frequent_parent_ids(fetched_parent_ids)
    parent_context_list = []
    for y in most_common_ids:
        for i in range(0,len(leaf_nodes)):
            if(y== leaf_nodes[i].parent_node.node_id):
                parent_context_list.append(nodes_by_id[leaf_nodes[i].parent_node.node_id].text)
                break
    
    query = test_query
    results = rankmodel.rank(query, parent_context_list, return_documents=True, top_k=5)

    reranked_list = []
    for i in range(0,5):
        reranked_list.append(results[i]['corpus_id'])

    context = ""

    for i in reranked_list[0:3]:
        context+=parent_context_list[i]+"\n\n"

    print("Your Question:  \n"+test_query+"\n\n"+"Bot Reply:  \n"+generate_answer(context, test_query)['text'])





Enter your query:  What are the Features of Capital Markets ?


  results = es.knn_search(index="finance",






Enter your query:  What are debt markets ?


  results = es.knn_search(index="finance",






Enter your query:  
