In [1]:
from langchain_community.document_loaders import PyPDFLoader
import getpass
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings

In [2]:
dataset = "dataset.pdf"

In [3]:
loader = PyPDFLoader(dataset)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [4]:
len(pages)

101

In [5]:
pages[0:2]

Document(metadata={'producer': 'macOS Version 10.15.7 (Build 19H2) Quartz PDFContext', 'creator': 'Word', 'creationdate': '2020-10-16T05:33:41+00:00', 'author': 'Ranjan Vohra', 'moddate': '2022-04-30T18:17:43+05:30', 'source': 'dataset.pdf', 'total_pages': 101, 'page': 0, 'page_label': '1'}, page_content='1  \n  9th November 1992  To Shri S.C. Gupta Joint Secretary Lok Sabha Secretariat Parliament House Annexe New Delhi 110 001.   Dear Shri Gupta,  Ref: Your letter no. 10/3(4)/JC(SBI)/92 dt. 06.10.92   I acknowledge receipt of your letter asking me to appear before the Joint Parliamentary Committee on 12th and 13th November, 1992 and I hereby confirm my appearance.   I request you to grant me an opportunity to make a slide presentation on the occasion. As per your requirement, I am forwarding you 40 copies containing details of my presentation which may be treated as my Memorandum. You are requested to circulate it to the Committee Members.  Thanking you,  Yours faithfully,  (Harshad S

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [7]:
text_split = RecursiveCharacterTextSplitter(
             chunk_size=1000,  
             chunk_overlap=200,  
             add_start_index=True, 
)

In [8]:
all_split = text_split.split_documents(pages)

In [20]:
new_splits = all_split[0:20]

In [21]:
len(new_splits)

20

In [11]:
### Vector Embedding

In [12]:
## We are going to use Gemini model for embedding

In [13]:
if not os.environ.get("Google_API_KEY"):
    os.environ["Google_API_KEY"] = getpass.getpass("Enter Gemini key: ")
    

Enter Gemini key:  ········


In [14]:
## embedding:
embedding = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")

In [88]:
for i in range(0,len(new_splits)):
    embedding.embed_query(new_splits[i].page_content)

    

In [89]:
len(vector_1) , len(vector_5)

(768, 768)

In [90]:
len(vector_2) , len(vector_9)

(768, 768)

In [91]:
#### Now i will store them in a database of faiss

In [92]:
## setup faiss

In [114]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [115]:
embedding_dim = len(embedding.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=embedding,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [116]:
ids = vector_store.add_documents(documents=new_splits)


In [125]:
## saving the embedding
vector_store.save_local("faiss_index_dir")


In [103]:
# suppose now i only have the faiss index directory to use

In [126]:
from langchain.vectorstores import FAISS

In [127]:
returned_vector = FAISS.load_local("faiss_index_dir", embedding , allow_dangerous_deserialization=True)

In [128]:
returned_vector

<langchain_community.vectorstores.faiss.FAISS at 0x26f07ae89d0>

In [117]:
embedding_dim

768

In [129]:
new_data1 = await returned_vector.asimilarity_search("""
                   What did Dr. Manmohan Singh say about the root causes of the Indian banking scam in the context of structural reforms?""")


In [131]:
results = vector_store.similarity_search_with_score("What did Dr. Manmohan Singh say about the root causes of the Indian banking scam in the context of structural reforms?")
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)

Score: 0.40228527784347534

page_content='14  
  GENESIS OF THE SO CALLED SCAM  DR. MANMOHAN SINGH HAS STATED IN AN ARTICLE ON “AFTER ONE YEAR OF STRUCTURAL REFORMS IN INDIAN ECONOMY” THAT  “INDIA HAS AN OVER REGULATED BUT UNDER GOVERNED BANKING SYSTEM. OVER THE YEARS, A GROWING PORTION OF BANK DEPOSITS CAME TO BE INVESTED IN LOW YIELDING GOVERNMENT SECURITIES. MOREOVER, 40% OF BANK CREDIT HAS TO BE EARMARKED FOR PRIORITY SECTORS WITH VARYING ELEMENTS OF CONCESSIONALITY BUILT INTO IT. THUS BANKS GOT INVOLVED IN UNCONVENTIONAL MEANS OF IMPROVING THEIR PROFITABILITY WHICH IN THE ABSENCE OF EFFECTIVE CONTROL MECHANISM AND COLLUSION BETWEEN SOME BANK OFFICIALS AND STOCK MARKET OPERATORS LED TO DIVERSION OF BANK FUNDS INTO STOCK MARKET.”' metadata={'producer': 'macOS Version 10.15.7 (Build 19H2) Quartz PDFContext', 'creator': 'Word', 'creationdate': '2020-10-16T05:33:41+00:00', 'author': 'Ranjan Vohra', 'moddate': '2022-04-30T18:17:43+05:30', 'source': 'dataset.pdf', 'total_pages': 101, 'pa

In [130]:
new_data1[0]

Document(id='3437032d-ee2c-4254-9109-86c74d01ccf8', metadata={'producer': 'macOS Version 10.15.7 (Build 19H2) Quartz PDFContext', 'creator': 'Word', 'creationdate': '2020-10-16T05:33:41+00:00', 'author': 'Ranjan Vohra', 'moddate': '2022-04-30T18:17:43+05:30', 'source': 'dataset.pdf', 'total_pages': 101, 'page': 13, 'page_label': '14', 'start_index': 0}, page_content='14  \n  GENESIS OF THE SO CALLED SCAM  DR. MANMOHAN SINGH HAS STATED IN AN ARTICLE ON “AFTER ONE YEAR OF STRUCTURAL REFORMS IN INDIAN ECONOMY” THAT  “INDIA HAS AN OVER REGULATED BUT UNDER GOVERNED BANKING SYSTEM. OVER THE YEARS, A GROWING PORTION OF BANK DEPOSITS CAME TO BE INVESTED IN LOW YIELDING GOVERNMENT SECURITIES. MOREOVER, 40% OF BANK CREDIT HAS TO BE EARMARKED FOR PRIORITY SECTORS WITH VARYING ELEMENTS OF CONCESSIONALITY BUILT INTO IT. THUS BANKS GOT INVOLVED IN UNCONVENTIONAL MEANS OF IMPROVING THEIR PROFITABILITY WHICH IN THE ABSENCE OF EFFECTIVE CONTROL MECHANISM AND COLLUSION BETWEEN SOME BANK OFFICIALS AND ST

In [119]:
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x26f07aead70>

In [120]:
ids

['c1bc7397-3b6c-446e-a03e-b29bd8d8dfc1',
 'db4aaece-d2e3-4c43-952f-e415d1f3fe16',
 '8878676d-182e-4603-8922-dee6b6a1da13',
 'f6a14ebd-ae9c-4fa4-9cf9-822326e2b5ef',
 'c36776ab-b695-43bf-98ac-b20858879ff6',
 '451b3c41-e8d8-4c9e-ae44-d79164a0f537',
 'a2c13597-0f9d-4059-8d83-15ca3102795a',
 'cf27519e-a5d1-47c1-9a58-a88032cd37f7',
 'c2d992a6-7bc8-4a7c-aaf2-8b673a728d2c',
 'e99c665e-2f7c-4d68-8347-88091673ff25',
 'd4448d6d-15b3-43b2-8fc6-1fb68261f1d4',
 '9cefc7d9-cdc4-4f0d-a7ea-df54d3730826',
 '4929d1af-5e63-4040-9084-659072b33252',
 'c4ee9400-c8f2-4641-b60a-015c8d6680c1',
 '32438d55-230a-40dc-9217-0d4e8911dde5',
 '0403d21f-c2bd-4684-b263-af745781f175',
 '45debeca-0e19-43a4-a6c1-3b2fa69958bb',
 '3437032d-ee2c-4254-9109-86c74d01ccf8',
 'ed36607d-d53f-41ad-a75a-4741ed04d65f',
 '181b6fae-75d0-4d32-994b-7402d70830a7']

In [121]:
##testing

In [122]:
new_data = vector_store.similarity_search("""I request you to grant me an opportunity to make a slide presentation on the occasion. 
                                         As per your requirement, I am forwarding you 40 copies containing details of my presentation 
                                         which may be treated as my Memorandum.
                                         You are requested to circulate it to the Committee Members"""
    
)

In [123]:
new_data1 = await vector_store.asimilarity_search("""
                   What did Dr. Manmohan Singh say about the root causes of the Indian banking scam in the context of structural reforms?""")


In [124]:
new_data1[0]

Document(id='3437032d-ee2c-4254-9109-86c74d01ccf8', metadata={'producer': 'macOS Version 10.15.7 (Build 19H2) Quartz PDFContext', 'creator': 'Word', 'creationdate': '2020-10-16T05:33:41+00:00', 'author': 'Ranjan Vohra', 'moddate': '2022-04-30T18:17:43+05:30', 'source': 'dataset.pdf', 'total_pages': 101, 'page': 13, 'page_label': '14', 'start_index': 0}, page_content='14  \n  GENESIS OF THE SO CALLED SCAM  DR. MANMOHAN SINGH HAS STATED IN AN ARTICLE ON “AFTER ONE YEAR OF STRUCTURAL REFORMS IN INDIAN ECONOMY” THAT  “INDIA HAS AN OVER REGULATED BUT UNDER GOVERNED BANKING SYSTEM. OVER THE YEARS, A GROWING PORTION OF BANK DEPOSITS CAME TO BE INVESTED IN LOW YIELDING GOVERNMENT SECURITIES. MOREOVER, 40% OF BANK CREDIT HAS TO BE EARMARKED FOR PRIORITY SECTORS WITH VARYING ELEMENTS OF CONCESSIONALITY BUILT INTO IT. THUS BANKS GOT INVOLVED IN UNCONVENTIONAL MEANS OF IMPROVING THEIR PROFITABILITY WHICH IN THE ABSENCE OF EFFECTIVE CONTROL MECHANISM AND COLLUSION BETWEEN SOME BANK OFFICIALS AND ST

In [132]:
### now using eclidian distance to find the similarity and most lower distance will have closer match

In [135]:
from langchain_community.vectorstores import FAISS
vector_store_new = FAISS.load_local("faiss_index_dir" , embedding , allow_dangerous_deserialization = True)

In [138]:
v_index = vector_store_new.index

In [140]:
stored_vectors = v_index.reconstruct_n(0, v_index.ntotal)


In [141]:
stored_vectors

array([[-0.00106366,  0.00715559, -0.06084427, ..., -0.00116317,
        -0.02768887,  0.01467393],
       [ 0.06324232,  0.02489971, -0.048758  , ...,  0.03703496,
        -0.01691704,  0.01630147],
       [ 0.04001369, -0.00107695, -0.06623501, ...,  0.02761392,
        -0.00343676,  0.0381248 ],
       ...,
       [ 0.05889084, -0.01527281, -0.02560736, ...,  0.03543722,
        -0.03361479, -0.01988224],
       [ 0.0445253 ,  0.00058609, -0.04038693, ...,  0.03051963,
        -0.02246741, -0.02482348],
       [ 0.06156801, -0.02290945, -0.06145952, ...,  0.03249632,
        -0.02252628,  0.01240308]], shape=(20, 768), dtype=float32)

In [144]:
# user input:
query_vector = embedding.embed_query("What did Dr. Manmohan Singh say about the banking scam?")


In [145]:
import numpy as np
def compute_Elidian_Distance(x,y):
    x1 = np.array(x)
    x2 = np.array(y)
    return np.sqrt(np.sum(x1-x2)**2)
    
    

In [148]:
distances = []
for stored_vector in stored_vectors:
    distances.append(compute_Elidian_Distance(query_vector , stored_vector))

In [155]:
distances

[np.float64(0.2817903586058037),
 np.float64(0.7043296681440552),
 np.float64(1.2788254320257693),
 np.float64(1.3679556986826356),
 np.float64(0.8972238131882477),
 np.float64(1.307401616788411),
 np.float64(0.5411418436706299),
 np.float64(1.3587910908354388),
 np.float64(1.0849988155823667),
 np.float64(0.5406018695684907),
 np.float64(0.953243935284263),
 np.float64(0.9254246906730259),
 np.float64(0.12450256116426317),
 np.float64(1.8969382656505331),
 np.float64(0.7736140946981322),
 np.float64(0.9036632409079175),
 np.float64(1.4403763246473318),
 np.float64(0.5739828581863549),
 np.float64(0.6035573079207097),
 np.float64(1.0577815964679758)]

In [166]:
top_5 = 5
top_index = np.argsort(distances)[:1]

In [167]:
top_index

array([12])

In [172]:
for idx in top_index:
    doc_id = vector_store_new.index_to_docstore_id[idx]
    document = vector_store_new.docstore._dict[doc_id]
    print(f"Distance: {distances[idx]}")
    print("Document Content:\n", document.page_content)


Distance: 0.12450256116426317
Document Content:
 9  
       REQUEST   I REQUEST JPC NOT TO DRAW SURMISES BASED ON THE VERSIONS OF RBI, JANAKIRAMAN COMMITTEE AND CBI. IT IS BROKERS WHO CAN GIVE THE REAL INSIGHT INTO WHAT HAPPENED IN THE MARKET PLACE AS THEY ONLY KNOW IT FIRST HAND. TO APPRAISE THE MEMBERS OF THIS AUGUST BODY COMPLETELY I HAVE TAKEN LIBERTY TO DRAW THE REAL AND STARK PICTURE OF THE AFFAIRS.
