In [1]:
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding

import os
import sys
import glob
import pandas as pd
import numpy as np
import pynndescent as nn

# Add the current directory to the path
sys.path.append(os.getcwd())
import preprocess

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
glob.glob(r"C:\Users\Steven\Desktop\*.pdf", )

[]

In [3]:
# define paths
main = r"C:\Users\Steven\Documents\Python\super-search"
data = f"{main}/data/tests"
test_file = f"{data}/32286.pdf"

## To-Do:

### Database management
-~~ We need a system to handle vector database as well as allowing for fast retrieval of the files corresponding to each vector~~
    - ~~Each file needs a file_id~~
        - ~~Links to the filepath~~
    - ~~Each chunk needs a chunk_id~~
        - ~~Links to the chunk text~~
            - ~~Importantly, link to the original text, not the processed text used for embedding.~~
                - ~~**I think the solution is to do the sentence chunking on the original text, before processing**~~
    - ~~file_id + chunk_id should uniquely identify a vector in the database~~
- ~~After taking a user query and encoding it, perform similarity search in the database
    to identify a row, then link to the row's filepath and text~~
- ~~Then print a hyperlink to the file, and print the original text~~
- **Problem**: how to return images, given that we first caption them with an LLM and then encode the caption?
    - Also, should we treat each image description as a chunk, or subchunk the images for more accuracy?

### Lexical search
- Use bm25s package to create ngram indices
- Incorporate lexical search into the queries to improve accuracy on exact matches to key phrases.

### Misc.
- Parallelized PDF processing
- Timing everything to understand where the bottlenecks are
- Save performance statistics to report in the app
    - How many PDF pages have been read? 
- ~~Switch to PyMuPDF~~ (It's much faster!!!)
- ~~Save page number to the index~~
    - ~~This actually doesn't seem possible because chunking works by combining all text to a single line.~~
- ~~Allow to read text files, including code (.py, .R, .do, .sql)~~
    - ~~pymupdf can do this~~

### Incorporating Images
- Use PyMuPDF to extract images from each page of the PDF.
- Goal is to pass each image to a multi-modal LLM for summary, which is then fed into the cleaned text.
    - Possibly LLaVA for describing the images.
    - Since captioning the images would take a super long time, this should be an optional step.
        - Ideally would be done after first parsing the text, but then you might have to regenerate the whole vector base
- **Why not just use an image encoder directly (if available)? Is it bad to mix embeddings from different encoders? probably**
    - Instead could encode images separately from the text, and have a separate search function for them.

### GUI
- The following should be customizable inputs:
    - Chunk token size ("larger is faster but less accurate") (default: 256)
    - Chunk overlap ("larger gives more context per chunk") (default: chunk_size / 4)
    - Choice of sentence transformer: provide a few options based on speed/accuracy tradeoff.
        - Fastest: static-retrieval-mrl-en-v1
        - Medium: bge-m3
        - Slowest: gte-large-en-v1.5
        - (these are subject to change)
    - Index database save location
    - Similarity matrix (default: cosine)

In [12]:
# Current encoding model implementation: static-retrieval-mrl-en-v1
# https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1
# Model defaults to 1024 dense dimensions, but can be truncated to save space/time

truncated_dimensions = 1024

model = SentenceTransformer(
    "sentence-transformers/static-retrieval-mrl-en-v1"
    , device="cpu"
    , truncate_dim=truncated_dimensions
    )


In [10]:
## TESTING
# Importing a lot of PDFs to see how long this takes
papers_repo = r"C:\Users\Steven\Documents\Python\Data\NBER papers"

files = os.listdir(papers_repo)
files.sort(reverse=True)

full_dict = {
    'raw_chunk': []
    , 'processed_chunk': []
    , 'file_path': []
}

counter=1

for paper in files[0:100]:
    f = f"{papers_repo}/{paper}"
    iter_dict = preprocess.prepare_PDF(f)
    full_dict['raw_chunk'].extend(iter_dict['raw_chunk'])
    full_dict['processed_chunk'].extend(iter_dict['processed_chunk'])
    full_dict['file_path'].extend(iter_dict['file_path'])
    if counter%5==0:
        print(f"Finished file {counter}.")
    counter+=1

#df = pd.DataFrame.from_dict(full_dict)
#df

# Currently takes around 1 second per file (with tokenization chunking)
# Takes ~ 0.7 seconds with approximate chunking
# After switching to PyMuPDF, 0.1 seconds per file, but 3 cmsOpenProfileFromMem errors

Finished file 5.
Finished file 10.
Finished file 15.
Finished file 20.
Finished file 25.
Finished file 30.
Finished file 35.
Finished file 40.
Finished file 45.
Finished file 50.
Finished file 55.
Finished file 60.
Finished file 65.
Finished file 70.
Finished file 75.
MuPDF error: format error: cmsOpenProfileFromMem failed

Finished file 80.
Finished file 85.
Finished file 90.
Finished file 95.
MuPDF error: format error: cmsOpenProfileFromMem failed

Finished file 100.


In [13]:
vecs = model.encode(full_dict['processed_chunk'])
# This returns a np array of shape (n, d), where n is 
#     number of chunks and d is embedding dimensions.

full_dict['vector'] = [i for i in np.unstack(vecs)]
# Add the embeddings to our dataframe in a single variable,
#     so each cell contains the d-dimensional np vector.

full_dict
# takes 3-4 seconds

{'raw_chunk': ['NBER WORKING PAPER SERIES\nRETIREMENT AND THE EVOLUTION OF PENSION STRUCTURE\nLeora Friedberg\nAnthony Webb\nWorking Paper 9999\nhttp://www.nber.org/papers/w9999\nNATIONAL BUREAU OF ECONOMIC RESEARCH\n1050 Massachusetts Avenue\nCambridge, MA 02138\nSeptember 2003\nWe would like to thank Scott J. Adams, Hugo Benítez Silva, Courtney Coile, Vince Crawford, Daniel\nDulitzky, Marjorie Flavin, Alan Gustman, Ted Groves, Jon Gruber, Jim Poterba, and participants of several\nseminars for very helpful comments.  We are grateful to Vince Crawford, Cathy Liebowitz, and Bob Peticolas\nfor enormous help with obtaining and/or explaining the HRS pension data.The views expressed herein are\nthose of the authors and are not necessarily those of the National Bureau of Economic Research. \n©2003 by Leora Friedberg and Anthony Webb.  All rights reserved. Short sections of text, not to exceed two\nparagraphs, may be quoted without explicit permission provided that full credit, including © no

In [15]:
full_dict.__sizeof__()

168

In [33]:
# testing querying the index

query = 'Madison and Jefferson reaction in January and February 1792' # reference to paper 9943

# Encode the query
query_vec = model.encode(query)

# Search for nearest neighbors in the df



In [13]:
vecs.shape

(19619, 1024)

In [16]:
index = nn.NNDescent(vecs)
index.prepare()

In [34]:
index.query(query_vec.reshape(1,-1), k=3)
# 10829, 10830, 10851

(array([[10829, 10830, 10851]], dtype=int32),
 array([[133.33989, 139.48766, 140.08379]], dtype=float32))

In [35]:
for i in [10829, 10830, 10851]:
    print(df['raw_chunk'][i])
    print(df['file_path'][i])

strikingly similar to Hamilton’s earlier report that Jefferson and
 -3-
1  Annals of Congress, 1 (January 8, 1790), p. 969.
2  Annals of Congress, 1 (January 15, 1790), p. 1095.
Madison opposed.  
Given the report’s importance in the history of U.S. economic policy, this paper explores
the reception and immediate legislative impact of the report.  After briefly reviewing the
contents and proposals in the December 1791 report, the paper turns to Madison’s and
Jefferson’s reaction to it in January and February 1792.  In February and March 1792, Congress
debated bounties for the cod fisheries and additional revenue proposals involving tariffs, both of
which related to Hamilton’s report.  Finally, the paper examines the turn of manufacturing
interests away from the Federalists as the Jeffersonian Republican policy of reciprocity offered
the hope of
C:\Users\Steven\Documents\Python\Data\NBER papers/9943.pdf
1791 report, the paper turns to Madison’s and
Jefferson’s reaction to it in January 

In [8]:
os.getcwd()

'c:\\Users\\Steven\\Documents\\Python\\super-search\\Python Code'

In [9]:
preprocess.prepare_PDF(r"..\test file.txt")


AssertionError: This is not a PDF file. Use a different function.