In [66]:
# import packages
import random 
import re
import requests
import numpy as np

from pypdf import PdfReader
from time import perf_counter
from chonkie import SemanticChunker
from model2vec import StaticModel
from vicinity import Vicinity
from numpy import linalg
from sklearn.decomposition import PCA

ModuleNotFoundError: No module named 'sklearn'

In [7]:
# define paths
main = r"C:/Users\Steven\Documents/Python\Semantic Indexing Tool"
data = r"C:/Users\Steven\Documents/Python\Data/NBER papers"
test_file = f"{data}/32362.pdf"

indexes = f'{main}/indexes'

In [36]:
# Define models
model = StaticModel.from_pretrained("minishlab/potion-base-8M")

# Basic initialization with default parameters
chunker = SemanticChunker(
    embedding_model="minishlab/potion-base-8M",  # Default model
    threshold='auto',                            # Similarity threshold (0-1) or (1-100) or "auto"
    chunk_size=512,                              # Maximum tokens per chunk
    min_sentences=1,                             # Initial sentences per chunk
    similarity_window=3,                         # Number of sentences to compare for similarity
)

### Future goals:
#### PDF extraction:
- identify elements from the paper to exclude, such as the references pages
- identify the abstract to take advantage of its summary function.
    - idea: what if we finetune an LLM to input a paper and output the abstract? Could be neat.

In [18]:
# Stop Words
stop_words = [
      'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves'
    , 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours'
    , 'yourself', 'yourselves', 'he', 'him', 'his', 'himself'
    , 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself'
    , 'they', 'them', 'their', 'theirs', 'themselves', 'what'
    , 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were'
    , 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did'
    , 'doing', 'a', 'an', 'the', 'and', 'because'
    , 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about'
    , 'from', 'so', 'very', 'should', "should've"
    ]

# Preprocessing function
def preprocess(text):
    text = text.replace('\n', ' ')
    text = text.replace('\r', ' ')
    for word in stop_words:
        d = word.title()
        text = text.replace(f' {word} ', ' ')
        text = text.replace(f' {d} ', ' ')
    text = text.replace('  ', ' ')
    text = text.replace('  ', ' ')
    return(text)

In [21]:
# Function to convert PDF to chunkable text
def prepare_PDF(in_path):
    reader = PdfReader(in_path)

    # combine all pages into one list
    paper = []
    for page in reader.pages:
        # extract text from page
        page_text = page.extract_text()

        # append to paper
        paper.append(page_text)

    # convert list into string
    paper_one_string = ' '.join(paper)
    paper_one_string = preprocess(paper_one_string)

    return(paper_one_string)

In [45]:
# Chunk and embed function
def vectorize(prepared_pdf):
    chunked = chunker.chunk(prepared_pdf)
    chunk_texts = [chunk.text for chunk in chunked]
    chunk_embeddings = model.encode(chunk_texts)
    return(chunk_embeddings, chunk_texts)

In [46]:
paper = prepare_PDF(test_file)
test = vectorize(paper)

In [54]:
linalg.eig(test[0])

LinAlgError: Last 2 dimensions of the array must be square

In [65]:
m=test[0]
n=m.T@m
linalg.eig(n).eigenvectors.shape

(256, 256)

In [48]:
# Create a Vicinity instance
vicinity = Vicinity.from_vectors_and_items(vectors=test[0], items=test[1])

In [49]:
vicinity.query()

<vicinity.vicinity.Vicinity at 0x2585e876540>

In [14]:
paper[0]

'NBER WORKING PAPER SERIES EDUCATION AND ADULT COGNITION IN A LOW-INCOME SETTING: DIFFERENCES AMONG ADULT SIBLINGS Yuan S. Zhang Elizabeth Frankenberg Duncan Thomas Working Paper 32362 http://www.nber.org/papers/w32362 NATIONAL BUREAU OF ECONOMIC RESEARCH 1050 Massachusetts Avenue Cambridge, MA 02138 April 2024 Financial support from the National Institute on Aging (K99/R00 AG070274 [Zhang]) and Eunice Kennedy Shriver National Institute of Child Health and Human Development (P2C HD050924 [Frankenberg] and T32 HD091058 [Zhang]) is gratefully acknowledged The views expressed herein are those of the authors and do not necessarily reflect the views of the National Bureau of Economic Research. NBER working papers are circulated for discussion and comment purposes. They have not been peer-reviewed or been subject to the review by the NBER Board of Directors that accompanies official NBER publications. © 2024 by Yuan S. Zhang, Elizabeth Frankenberg, and Duncan Thomas. All rights reserved. Sho

In [19]:
len(model.tokenize([paper[0]])[0])

256