In [1]:
pip install PyPDF2 transformers==4.46.0 faiss-cpu

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # Temporary fix
os.environ["FAISS_NO_OPENMP"] = "1"  # Prevent FAISS from using OpenMP

In [3]:
from PyPDF2 import PdfReader
from tqdm import tqdm

In [4]:
path = "../../volumes/pdfs/2023050195.pdf"

pdf_content = PdfReader(path)

In [5]:
def get_pdf_texts(pdf_path):
    pdf_text_dict = {}
    for page_no, page in enumerate(tqdm(pdf_content.pages), start=1):
        text = page.extract_text()
        pdf_text_dict[page_no] = text
    return pdf_text_dict

In [6]:
# pdf_texts = []
# for page in tqdm(pdf_content.pages):
#     text = page.extract_text()
#     pdf_texts.append(text)

In [7]:
pdf_text_dict = get_pdf_texts(path)

100%|██████████| 404/404 [00:22<00:00, 17.88it/s]


In [12]:
print(pdf_text_dict[401])

THE CONSTITUTION OF  INDIA 
(Appendix I)  
 368
1 2 3 4 5      6 
43. (i) Chhit Nalgram Sitalkuchi Patgarm 66 49.5 
 (ii) Chhit Nalgram 
(Fragment) Sitalkuchi Patgarm 66  
44. (i) Batrigachh Dinhata Kaliganj 81 577.37 
 (ii) Batrigachh 
(Fragment) Dinhata Kaliganj 81  
 (iii) Batrigachh 
(Fragment) Dinhata Phulbari 9  
45. (i) Karala Dinhata Phulbari 9 269.91 
 (ii) Karala (fragment) Dinhata Phulbari 9  
 (iii) Karala (fragment) Dinhata Phulbari 8  
46. (i) Sipprasad Mustati Dinhata Phulbari 8 373.2 
 (ii) Sipprasad Mustati 
(Fragment) Dinhata Phulbari 6  
47. (i) Dakshin 
Masaldanga Dinhata Bhurungamari 6 571.38 
 (ii) Dakshin 
Masaldanga 
(Fragment) Dinhata Bhurungamari 6  
 (iii) Dakshin 
Masaldanga 
(Fragment) Dinhata Bhurungamari 6  
 (iv) Dakshin 
Masaldanga 
(Fragment) Dinhata Bhurungamari 6  
 (v) Dakshin 
Masaldanga 
(Fragment) Dinhata Bhurungamari 6  
 (vi) Dakshin 
Masaldanga 
(Fragment) Dinhata Bhurungamari 6  


In [21]:
print(pdf_text_dict[200])

THE CONSTITUTION OF  INDIA 
(Part XII. —Finance, Property, Contracts and Suits) 
 169
Provided that any property which at the date when it would have so 
accrued to His Majesty or to the Ruler of an Indian State was in the possession 
or under the control of the Government of India or the Government of a State 
shall, according as the purposes for which it was then used or held were 
purposes of the Union or of a State, vest in the Union or in that State. 
Explanation.— In this article, the expressions “Ruler” and “Indian State” 
have the same meanings as in article 363. 
1[297. Things of value within territorial waters or continental shelf 
and resources of the exclusive economic zone to vest in the Union .—(1) All 
lands, minerals and other things of value underlying the ocean within the 
territorial waters, or the continental shelf, or the exclusive economic zone, of 
India shall vest in the Union and be held for the purposes of the Union. 
(2) All other resources of the exclusive e

In [30]:
print(pdf_text_dict[1])

 
 
 
 
 
 THE CONSTITUTION OF INDIA 
[As on       May , 2022] 
2022 
 


In [31]:
print(pdf_text_dict[404])

 371APPENDIX III 
1DECLRATION UNDER ARTICLE 370(3) OF THE CONSTITUTION 
C.O. 273 
In exercise of the powers conferred by clause (3) of article 370  read 
with clause (1)  of article 370 of the Constitution of India, the President, on the 
recommendation of Parliament, is pleased to declare that, as from the 6th 
August, 2019, all clauses of the said article 370 shall cease to be operative 
except  the following which shall read as under, namely :—  
“370. All provisions of this Constitution, as amended from time to 
time, without any modifications or exceptions, shall apply to the State of 
Jammu and Kashmir notwithstanding anything contrary contained in 
article 152 or article 308 or any other article of this Constitution or any 
other provision of the Constitution of Jammu and Kashmir or any law, 
document, judgement, ordinance, order, by-law, rule, regulation, 
notification, custom or usage having the force of law in the territory of 
India, or any other instrument, treaty or agreem

In [8]:
import numpy as np

In [9]:
from transformers import AutoModel
from numpy.linalg import norm

cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
model = AutoModel.from_pretrained('../../volumes/models/jina-embeddings-v2-base-en/', trust_remote_code=True) # trust_remote_code is needed to use the encode method
embeddings = model.encode(['How is the weather today?', 'What is the current weather like today?'])
print(cos_sim(embeddings[0], embeddings[1]))

0.9341315


In [15]:
len(embeddings[0])

768

In [16]:
def get_embeddings(text:list):
    embeddings = model.encode(text)
    normalized_embeddings = embeddings/norm(embeddings[0])
    return normalized_embeddings

In [29]:
def batch_embeddings(texts, batch_size=5):
    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        embeddings = get_embeddings(batch)
        all_embeddings.extend(embeddings.tolist())
    return np.array(all_embeddings)

In [28]:
# res = batch_embeddings(list(pdf_text_dict.values())[0:10], batch_size=5)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:18<00:00,  9.26s/it]


In [31]:
result_embeddings =  batch_embeddings(list(pdf_text_dict.values()), batch_size=5)

100%|██████████████████████████████████████████████████████████████████████████████████| 81/81 [16:37<00:00, 12.31s/it]


In [10]:
# import pickle

# with open("embeddings.pkl", "wb") as p:
#     pickle.dump(result_embeddings, p)

In [24]:
list(pdf_text_dict.values())[0:10]

[' \n \n \n \n \n THE CONSTITUTION OF INDIA \n[As on       May , 2022] \n2022 \n ',
 ' \n \nPREFACE \n \nThis is the  fifth  pocket size edition of the Constitution of \nIndia in the diglot form. In this edition, the text of the \nConstitution of India has been brought up-to-date by \nincorporating therein all the amendments up to the Constitution \n(One Hundred and Fifth Amendment) Act, 2021. The foot notes \nbelow the text indicate the Constitution Amendment Acts by \nwhich such amendments have been made.  \nThe Constitution (One Hundredth Amendment) Act, 2015 \ncontaining details of acquired and transferred territories \nbetween the Governments of India and Bangladesh has been \nprovided in APPENDIX I. \nThe Constitution (Application to Jammu and Kashmir) \nOrder, 2019 and the declaration under article 370(3) of the \nConstitution have been provided respectively in Appendix II and \nAppendix III for reference. \n \n \nNew Delhi;                                              Dr. Reeta

In [25]:
list(pdf_text_dict.values())[0:10][1]

' \n \nPREFACE \n \nThis is the  fifth  pocket size edition of the Constitution of \nIndia in the diglot form. In this edition, the text of the \nConstitution of India has been brought up-to-date by \nincorporating therein all the amendments up to the Constitution \n(One Hundred and Fifth Amendment) Act, 2021. The foot notes \nbelow the text indicate the Constitution Amendment Acts by \nwhich such amendments have been made.  \nThe Constitution (One Hundredth Amendment) Act, 2015 \ncontaining details of acquired and transferred territories \nbetween the Governments of India and Bangladesh has been \nprovided in APPENDIX I. \nThe Constitution (Application to Jammu and Kashmir) \nOrder, 2019 and the declaration under article 370(3) of the \nConstitution have been provided respectively in Appendix II and \nAppendix III for reference. \n \n \nNew Delhi;                                              Dr. Reeta Vasishta, \n                                 Secretary to the Government of India.  

In [26]:
list(pdf_text_dict.values())[0:10][3]

'THE CONSTITUTION OF INDIA   \n____________                                                                     \n                             \nCONTENTS \n__________ \n \n                                                                                           \nPREAMBLE \nPART I \nTHE UNION AND ITS TERRITORY \nARTICLES \n  1. Name and territory of the Union. \n  2. Admission or establishment of new States. \n[2A.         Sikkim to be associated with the Union. —Omitted.] \n  3. Formation of new States and alteration of areas, boundaries or \nnames of existing  States. \n  4. Laws made under articles 2 and 3 to provide for the amendment of \nthe First and the Fourth Schedules and supplemental, incidental \nand consequential matters. \nPART II \nCITIZENSHIP \n  5. Citizenship at the commencement of the Constitution. \n6. Rights of citizenship of certain persons who have migrated to \nIndia from Pakistan. \n  7. Rights of citizenship of certain migrants to Pakistan. \n8. Rights of citi

In [None]:
res

In [11]:
import pickle

with open("embeddings.pkl", "rb") as p:
    embeddings = pickle.load(p)

In [12]:
embeddings[0]

array([-2.57174131e-02, -4.15116362e-03,  1.00257555e-02,  7.95926899e-02,
       -5.00002429e-02, -3.94614115e-02,  1.33040652e-03,  2.53507891e-03,
        3.81039083e-02,  6.45715967e-02, -3.22419666e-02,  2.89166835e-03,
       -4.41535637e-02,  6.61504979e-04, -5.27535751e-02,  3.93252261e-02,
        2.10104175e-02,  7.21139321e-03, -1.50410482e-03,  1.66766183e-03,
       -2.18644273e-02,  1.02400398e-02, -3.55437882e-02, -2.15868000e-03,
        2.28184578e-03,  1.63813438e-02,  1.12895691e-03,  7.70135522e-02,
        3.45573463e-02,  2.44991146e-02, -2.00111717e-02,  6.72883913e-03,
       -2.55250596e-02, -5.63424500e-03, -7.87753426e-03, -3.85418697e-03,
       -2.48634145e-02, -7.07174744e-03,  3.84833887e-02,  3.21004912e-02,
       -2.16781627e-04, -2.28117988e-03,  1.99199300e-02,  4.85927574e-02,
       -4.08204794e-02,  2.73830462e-02, -1.26146292e-02, -7.49849342e-03,
       -4.45639668e-03,  1.89360857e-04, -2.24946439e-03, -3.31600965e-03,
       -1.60166472e-02, -

In [13]:
embeddings[1]

array([-1.43356454e-02, -2.16784477e-02,  3.21097896e-02,  2.72039808e-02,
       -4.73676957e-02, -1.18169822e-02, -2.75448826e-03, -2.73635313e-02,
        2.27017626e-02,  7.22942054e-02, -3.79111543e-02,  2.92207138e-03,
       -2.39777602e-02,  1.10094333e-02, -2.19638925e-02,  2.56774910e-02,
        1.94586087e-02, -1.32784937e-02, -4.69307834e-03, -2.03847066e-02,
       -3.48538011e-02, -2.24664509e-02, -6.02852693e-03,  9.74042434e-03,
        8.85831658e-03,  4.31385264e-02,  2.20383871e-02,  5.08100763e-02,
        5.48611768e-02,  3.38501818e-02, -2.81725992e-02,  3.53083224e-03,
        7.75846944e-04, -1.82817597e-02, -1.83023009e-02, -3.09302025e-02,
       -7.22549707e-02,  5.76679828e-03,  7.24002123e-02,  3.40825729e-02,
       -4.33760975e-03,  1.72757935e-02, -5.63281029e-03,  7.27394000e-02,
       -4.40556072e-02, -7.95468804e-04, -4.01170459e-03, -2.62629353e-02,
       -2.25911066e-02, -1.19620198e-02, -9.47790034e-03, -2.43393853e-02,
        1.08281998e-02, -

In [None]:
d = 128  # Dimension of vectors
nb = 1000  # Number of database vectors
np.random.seed(42)
data = np.random.random((nb, d)).astype('float32')

In [None]:
data

In [15]:
import faiss
d = 768
index = faiss.IndexFlatL2(d)

In [16]:
index.add(embeddings)

In [17]:
faiss.write_index(index, "../../volumes/indexes/law_corpus_index.bin")

In [19]:
list(pdf_text_dict.values())[0:10][0]

' \n \n \n \n \n THE CONSTITUTION OF INDIA \n[As on       May , 2022] \n2022 \n '

In [20]:
list(pdf_text_dict.values())[0:10][1]

' \n \nPREFACE \n \nThis is the  fifth  pocket size edition of the Constitution of \nIndia in the diglot form. In this edition, the text of the \nConstitution of India has been brought up-to-date by \nincorporating therein all the amendments up to the Constitution \n(One Hundred and Fifth Amendment) Act, 2021. The foot notes \nbelow the text indicate the Constitution Amendment Acts by \nwhich such amendments have been made.  \nThe Constitution (One Hundredth Amendment) Act, 2015 \ncontaining details of acquired and transferred territories \nbetween the Governments of India and Bangladesh has been \nprovided in APPENDIX I. \nThe Constitution (Application to Jammu and Kashmir) \nOrder, 2019 and the declaration under article 370(3) of the \nConstitution have been provided respectively in Appendix II and \nAppendix III for reference. \n \n \nNew Delhi;                                              Dr. Reeta Vasishta, \n                                 Secretary to the Government of India.  

In [22]:
list(pdf_text_dict.values())[0:10][2]

' \nLIST OF ABBREVIATIONS USED \n \n \nArt., arts.  ........................................................  for Article, articles. \nCl., cls.     ........................................................   ″   Clause, clauses. \nC.O.          ........................................................   ″   Constitution Order. \nIns.            ........................................................   ″    Inserted. \nP., pp.       ........................................................   ″    Page, pages. \nPt.             ........................................................   ″    Part. \nRep.          ........................................................   ″    Repealed. \nSs., ss.     ..........................................................   ″    Section, sections. \nSch.         .........................................................   ″    Schedule. \nSubs.         ........................................................   ″    Substituted. \nw.e.f.       ...........

In [23]:
list(pdf_text_dict.values())[0:10][6]

'Contents \n \n   ARTICLES (iv)\n  40. Organisation of village panchayats. \n  41. Right to work, to education and to public assistance in certain \ncases. \n  42. Provision for just and humane conditions of work and maternity \nrelief. \n  43. Living wage, etc., for workers. \n43A. Participation of workers in management of Industries. \n43B. Promotion of co-operative societies. \n  44. Uniform civil code for the citizens. \n  45. Provision for early childhood care and educat ion to children \nbelow the age of six years. \n  46. Promotion of educational and economic interests of Scheduled \nCastes, Scheduled Tribes and other weaker sections. \n  47. Duty of the State to raise the level of nutrition and the standard \nof living and to improve public health. \n  48. Organisation of agriculture and animal husbandry. \n48A. Protection and improvement of environment and safeguarding of \nforests and wild life. \n  49. Protection of monuments and places and objects of national \nimportance. 

In [23]:
query = np.random.random((5, d)).astype('float32')

In [24]:
query

array([[0.04977802, 0.48656103, 0.28263223, ..., 0.9249028 , 0.49694043,
        0.6121883 ],
       [0.02063165, 0.32649937, 0.5049617 , ..., 0.41199866, 0.43590757,
        0.20361727],
       [0.37153634, 0.62894505, 0.84184766, ..., 0.6600337 , 0.12406218,
        0.18649617],
       [0.00358555, 0.36700717, 0.8201504 , ..., 0.3835851 , 0.03757534,
        0.33080533],
       [0.41282707, 0.23166862, 0.9693633 , ..., 0.92313445, 0.02211179,
        0.21810389]], dtype=float32)

In [25]:
faiss.write_index(index, "faiss_test_index.bin")
# print("Index saved successfully!")

In [1]:
import faiss
index = faiss.read_index("faiss_test_index.bin")
print("Index loaded successfully!")
print("Number of vectors in the index:", index.ntotal)

Index loaded successfully!
Number of vectors in the index: 10


In [2]:
print("Index type:", type(index))
print("Is Trained:", index.is_trained)
print("Number of vectors:", index.ntotal)


Index type: <class 'faiss.swigfaiss_avx2.IndexFlatL2'>
Is Trained: True
Number of vectors: 10


In [3]:
import numpy as np
vector_10 = np.zeros(768, dtype='float32')
index.reconstruct(10, vector_10) 

array([-4.33520480e-34,  1.96945212e-39,  1.63441963e-34,  7.23149259e-29,
        0.00000000e+00,  0.00000000e+00,  3.44876660e-28,  7.18866112e-43,
        1.71810193e+19,  7.14495034e+31,  5.44333404e-30,  7.18866112e-43,
        2.80259693e-45,  0.00000000e+00, -4.22660315e-38,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  6.38779203e-30,  7.18866112e-43,
        0.00000000e+00,  0.00000000e+00,  3.44716688e-28,  7.18866112e-43,
        1.71810193e+19,  7.14495034e+31,  5.44333404e-30,  7.18866112e-43,
        2.80259693e-45,  0.00000000e+00,  4.20622715e-38,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  6.38779203e-30,  7.18866112e-43,
        0.00000000e+00,  0.00000000e+00,  1.85896145e+34,  7.77666437e+31,
        1.71810666e+19,  7.14495034e+31,  1.57661930e-19,  1.94412705e+31,
        1.14918035e-38,  0.00000000e+00,  9.28516057e-39,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  6.38779203e-30,  7.18866112e-43,
        0.00000000e+00,  

In [4]:
test_emb = index.reconstruct(10, vector_10) 

In [5]:
len(test_emb)

768

In [6]:
print("Index Type:", type(index))
print("Index Dimension:", index.d)
print("Number of Vectors in Index:", index.ntotal)
print("Query Shape:", test_emb.shape)
print("Is Index Trained:", index.is_trained)

Index Type: <class 'faiss.swigfaiss_avx2.IndexFlatL2'>
Index Dimension: 768
Number of Vectors in Index: 10
Query Shape: (768,)
Is Index Trained: True


In [18]:
query = "which pocket size is this edition?"
query_embeddings = model.encode([query])

In [19]:
query_embeddings

array([[-3.73132437e-01, -2.49501050e-01,  5.96238971e-01,
         1.60232633e-01, -7.16607630e-01, -3.35337073e-01,
         2.09526420e-01, -5.55256248e-01,  6.18032277e-01,
         8.19526017e-01, -6.39822304e-01,  1.57655537e-01,
        -2.45462924e-01, -1.64276659e-01, -2.11889967e-02,
         5.78294039e-01,  3.00301939e-01, -4.89581376e-02,
         5.43662719e-03, -2.26307601e-01, -6.92653283e-02,
        -2.94720709e-01, -5.32017469e-01, -3.34114343e-01,
         2.40843698e-01,  1.39474165e+00,  8.41520846e-01,
         9.00752783e-01,  3.70669335e-01,  3.69088054e-01,
        -2.56014347e-01, -3.89395177e-01, -4.00742084e-01,
        -4.74080443e-01,  5.93443394e-01, -1.70544237e-01,
        -7.75071025e-01,  6.44626856e-01,  7.97609329e-01,
         8.33190203e-01,  1.20621324e-02,  4.71046597e-01,
        -3.30124229e-01,  1.22842538e+00, -7.82999873e-01,
        -8.05311918e-01,  2.30400667e-01,  1.82415247e-01,
        -2.73626685e-01,  2.95862228e-01, -2.96776861e-0

In [10]:
from retriever import vector_db_retriever

Index loaded successfully!
Number of vectors in the index: 404


In [25]:
query = " explain Seventh Amendment Act"
query_embeddings = model.encode([query])

In [26]:
result = vector_db_retriever(query_embeddings, 15)

In [27]:
for idx in result[0][0]:
    print(pdf_text_dict[idx])
    print("==========================================================================")

THE CONSTITUTION OF  INDIA 
(Part VI.—The States) 101
218. Application of certain provisions relating to Supreme Court to 
High Courts .—The provisions of clauses (4) and (5) of article 124 shall apply 
in relation to a High Court as they apply in relation to the Supreme Court with 
the substitution of references to the High Court for references to the Supreme 
Court. 
219. Oath or affirmation by Judges of High Courts. —Every person 
appointed to be a Judge of a High Court 1*** shall, before he enters upon his 
office, make and subscribe before the Governor of the State, or some person 
appointed in that behalf by him, an oath or affirmation according to the form 
set out for the purpose in the Third Schedule. 
2[220. Restriction on practice after being a permanent Judge .—No 
person who, after the commencement of this Constitution, has held office as a 
permanent Judge of a High Court shall plead or act in any court or before any 
authority in India except the Supreme Court and the ot

In [None]:
index.search(query_embeddings, 5)

In [7]:
test_emb = test_emb.reshape(1, -1)  # Reshape to (1, dimension)


In [8]:
test_emb

array([[ 9.24338506e-41,  0.00000000e+00, -1.97645558e+34,
         1.11390840e-38, -6.25416305e-08,  7.09057023e-43,
        -2.18818045e-16,  7.09057023e-43,  8.30269340e-42,
         0.00000000e+00,             nan,             nan,
         8.57594660e-43,  0.00000000e+00,  1.35631564e-19,
         1.35631564e-19,  1.65943575e-07,  6.86198462e-07,
         1.30291750e-11,  1.45852466e-19,  6.33692210e-10,
         6.40969056e-10,  5.20015877e+22,  2.50378429e-12,
         4.02852196e-11,  1.69701607e-07,  4.32033157e-05,
         1.66864609e-07,  2.33013520e-09,  4.12557526e-08,
         1.71538787e-07,  4.03717308e-08,  1.17036854e-19,
         1.35631564e-19,  2.57977081e-06,  1.68751939e-07,
         6.51968313e-10,  6.30808572e-10,  1.35671579e-19,
         1.68018843e-04,  6.66505962e-10,  1.30285106e-11,
         1.45859084e-19,  2.53488541e-09,  4.22524463e-05,
         5.34819389e+22,  2.58910536e-12,  4.16379153e-11,
         6.56483280e-07,  4.14877057e-08,  4.19508019e-0

In [9]:
print("Index Type:", type(index))
print("Index Dimension:", index.d)
print("Number of Vectors in Index:", index.ntotal)
print("Query Shape:", test_emb.shape)
print("Is Index Trained:", index.is_trained)


Index Type: <class 'faiss.swigfaiss_avx2.IndexFlatL2'>
Index Dimension: 768
Number of Vectors in Index: 10
Query Shape: (1, 768)
Is Index Trained: True


In [None]:
import faiss
import numpy as np

# Set the embedding dimension
dimension = 768  
num_vectors = 10  

# Create random embeddings
np.random.seed(42)
embeddings = np.random.random((num_vectors, dimension)).astype('float32')

# Create a FAISS index
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Create a random query vector
query_vector = np.random.random((1, dimension)).astype('float32')

# Search in the index
distances, indices = index.search(query_vector, 5)

# Print results
print("Search successful!")
print("Nearest Neighbors:", indices)
print("Distances:", distances)


In [None]:
index.search(test_emb, 5)

In [102]:
import torch

torch.nn.functional.cosine_similarity(torch.tensor(temp_1[0]).unsqueeze(0), torch.tensor(temp_1[1]).unsqueeze(0))

tensor([0.8617])

In [103]:
import torch

torch.nn.functional.cosine_similarity(torch.tensor(temp_2[0]).unsqueeze(0), torch.tensor(temp_2[1]).unsqueeze(0))

tensor([0.8617])

In [77]:
import numpy as np

np.linalg.norm(embeddings[0])

13.994422