In [19]:
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding

import os
import sys
import glob
import pandas as pd
import numpy as np
import pynndescent as nn
from tqdm import tqdm

# Add the current directory to the path
sys.path.append(os.getcwd())
from preprocess import prepare_PDF, prepare_text

In [15]:
# This recursively searches every subdirectory and file
filepath = r"C:\Users\Steven\Documents\Python"

glob.glob(f"{filepath}/**/*", recursive=True) 

['C:\\Users\\Steven\\Documents\\Python\\(2022-01-17) - Cooking.py',
 'C:\\Users\\Steven\\Documents\\Python\\(2022-01-17) - Mouse Movements.ipynb',
 'C:\\Users\\Steven\\Documents\\Python\\20221030 - YouTube Data.ipynb',
 'C:\\Users\\Steven\\Documents\\Python\\Archive',
 'C:\\Users\\Steven\\Documents\\Python\\Chess',
 'C:\\Users\\Steven\\Documents\\Python\\Collatz_Art',
 'C:\\Users\\Steven\\Documents\\Python\\Data',
 'C:\\Users\\Steven\\Documents\\Python\\Financial Sentiment Analysis',
 'C:\\Users\\Steven\\Documents\\Python\\Finding Ideal Climate.ipynb',
 'C:\\Users\\Steven\\Documents\\Python\\Fitbit Data',
 'C:\\Users\\Steven\\Documents\\Python\\Game Simulations',
 'C:\\Users\\Steven\\Documents\\Python\\Image Detection',
 'C:\\Users\\Steven\\Documents\\Python\\Image Manipulation',
 'C:\\Users\\Steven\\Documents\\Python\\Image Stuff.ipynb',
 'C:\\Users\\Steven\\Documents\\Python\\Interdec 2022',
 'C:\\Users\\Steven\\Documents\\Python\\Other Projects',
 'C:\\Users\\Steven\\Documents\\Pyth

In [25]:
def prepare_filelist(filepath, allowed_text_types = ['.txt', '.r', '.do', '.py', '.sas', '.sql', '.vba']):
    files = glob.glob(f"{filepath}/**/*", recursive=True)
    
    pdfs = [file for file in files if file.lower().endswith('.pdf')]
    
    t = []
    for i in allowed_text_types:
        t.append([file for file in files if file.lower().endswith(i)])
    
    texts = [ts for tss in t for ts in tss] # flatten the list of lists
    
    return {'pdfs': pdfs, 'texts': texts}

In [29]:
def prepare_directory(file_path, log_file = "chunking_log.txt", allowed_text_types = ['.txt', '.r', '.do', '.py', '.sas', '.sql', '.vba']):
    files = prepare_filelist(file_path, allowed_text_types)

    full_dict = {
        'raw_chunk': []
        , 'processed_chunk': []
        , 'file_path': []
    }
    
    # Process PDFs
    if len(files['pdfs']) > 0:
        for file in tqdm(files['pdfs'], desc = "Chunking PDFs"):
            iter_dict = prepare_PDF(file)
            full_dict['raw_chunk'].extend(iter_dict['raw_chunk'])
            full_dict['processed_chunk'].extend(iter_dict['processed_chunk'])
            full_dict['file_path'].extend(iter_dict['file_path'])
    
    # Process text files
    if len(files['texts']) > 0:
        for file in tqdm(files['texts'], desc = "Chunking Text Files"):
            iter_dict = prepare_text(file)
            full_dict['raw_chunk'].extend(iter_dict['raw_chunk'])
            full_dict['processed_chunk'].extend(iter_dict['processed_chunk'])
            full_dict['file_path'].extend(iter_dict['file_path'])

    assert len(full_dict['raw_chunk']) > 0, "Found no files to analyze."

    return(full_dict)

In [30]:
test = prepare_directory(r"C:\Users\Steven\Downloads")
test

Chunking PDFs:   0%|          | 0/36 [00:00<?, ?it/s]

Chunking PDFs: 100%|██████████| 36/36 [00:06<00:00,  5.83it/s]
Chunking Text Files: 100%|██████████| 38/38 [00:00<00:00, 105.46it/s]


{'raw_chunk': ['9/22/1998\nD.O.B.:\nM\nSEX:\n2431 : STEVEN VANOMMEREN\nAll Star Pediatrics, LLC  702 Gordon Drive, Exton, PA 193411253\n6103631330\nVaccination Record - Confidential Information\nPatient:\nReturn Address:\nRobert C. Duncheskie, MD\n702 Gordon Drive\nExton, PA 193411253\n12/27/2024\nDate printed:\nAge on date printed: 26 yrs. 3 mos.\nForecast: .........  Recommended Range......   Earliest\n..........................................................................................\n..........\nHPV ................ Today      - 09/21/2013\nFlu - Seasonal ..... Today      - \nMeningococcal B .... Today      - \nCOVID19 ............ Today      - \nPneumococcal ....... 09/22/2063 -    09/22/2000\nTd/Tdap Booster .... 02/16/2025 - 03/15/2025   02/16/2020\n*  Varicella Documented disease(05/09/2007).\n-\nVaccination\nNo active medication allergies or reactions; No documented food/non-medication allergies\nAllergies...\n02/16/2015\nTdap\n01/19/2010\nTdap\n01/19/2010\nMCV4\n10/14/

In [None]:
# Current encoding model implementation: static-retrieval-mrl-en-v1
# https://huggingface.co/sentence-transformers/static-retrieval-mrl-en-v1
# Model defaults to 1024 dense dimensions, but can be truncated to save space/time
truncated_dimensions = 1024
model = SentenceTransformer(
    "sentence-transformers/static-retrieval-mrl-en-v1"
    , device="cpu"
    , truncate_dim=truncated_dimensions
    )

## TESTING
# Importing a lot of PDFs to see how long this takes
papers_repo = r"C:\Users\Steven\Documents\Python\Data\NBER papers"

files = os.listdir(papers_repo)
files.sort(reverse=True)

full_dict = {
    'raw_chunk': []
    , 'processed_chunk': []
    , 'file_path': []
}

counter=1

for paper in files[0:100]:
    f = f"{papers_repo}/{paper}"
    iter_dict = prepare_PDF(f)
    full_dict['raw_chunk'].extend(iter_dict['raw_chunk'])
    full_dict['processed_chunk'].extend(iter_dict['processed_chunk'])
    full_dict['file_path'].extend(iter_dict['file_path'])
    if counter%5==0:
        print(f"Finished file {counter}.")
    counter+=1

df = pd.DataFrame.from_dict(full_dict)
df

# Currently takes around 1 second per file (with tokenization chunking)
# Takes ~ 0.7 seconds with approximate chunking
# After switching to PyMuPDF, 0.1 seconds per file, but 3 cmsOpenProfileFromMem errors

vecs = model.encode(df['processed_chunk'])
# This returns a np array of shape (n, d), where n is 
#     number of chunks and d is embedding dimensions.

df['vector'] = [i for i in np.unstack(vecs)]
# Add the embeddings to our dataframe in a single variable,
#     so each cell contains the d-dimensional np vector.

df
# takes 3-4 seconds