In [50]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from langchain_community.embeddings import OllamaEmbeddings
import pandas as pd
import time
import os

In [32]:
directory = 'pdfs/'
pdfs = []
for filename in os.listdir(directory):
    if filename.endswith('.pdf'):
        filepath = os.path.join(directory, filename)
        pdfs.append(filepath)

In [34]:
loader = PyMuPDFLoader('pdfs/1-s2.0-S0022437522001335-main.pdf')
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=80)
documents = loader.load()
documents[0].metadata

{'source': 'pdfs/1-s2.0-S0022437522001335-main.pdf',
 'file_path': 'pdfs/1-s2.0-S0022437522001335-main.pdf',
 'page': 0,
 'total_pages': 6,
 'format': 'PDF 1.7',
 'title': 'A pilot study evaluating the effectiveness of preventing railway suicides by mid-track fencing, which restrict easy access to high-speed train tracks',
 'author': 'Johan Fredin-Knutzén',
 'subject': 'Journal of Safety Research, 83 (2022) 232-237. doi:10.1016/j.jsr.2022.08.019',
 'keywords': '',
 'creator': 'Elsevier',
 'producer': 'Acrobat Distiller 8.0.0 (Windows)',
 'creationDate': "D:20221130162908+05'30'",
 'modDate': "D:20221130163005+05'30'",
 'trapped': ''}

In [39]:
start_time = time.time()
stats = []
for pdf in pdfs:
    loader = PyMuPDFLoader(pdf)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=80)
    documents = loader.load()
    chunks = text_splitter.split_documents(documents)
    word_count = 0
    for chunk in chunks:
        word_count += len(chunk.page_content.split(' '))
    obj = {
        'file': pdf,
        'title': documents[0].metadata['title'],
        'words': word_count,
        'chunks': len(chunks),
        'pages': len(documents),
        'author': documents[0].metadata['author'],
        'subject': documents[0].metadata['subject'],
        'keywords': documents[0].metadata['keywords'],
        'producer': documents[0].metadata['producer'],
        'creator': documents[0].metadata['creator'],
        'created': documents[0].metadata['creationDate'],
    }
    stats.append(obj)
end_time = time.time()
print(f'took {(end_time-start_time)} to chunk {len(stats)} pdfs')

took 1.8941690921783447 to chunk 52 pdfs


In [40]:
stats = pd.DataFrame(stats)
stats.head()

Unnamed: 0,file,title,words,chunks,pages,author,subject,keywords,producer,creator,created
0,pdfs/fpsyt-09-00116.pdf,Decision-Making in Suicidal Behavior: The Prot...,8405,66,9,Gergö Hadlaczky,Background: Loss aversion is a central and wel...,"loss aversion, decision-making, suicide, attem...",Adobe PDF Library 10.0.1,Adobe InDesign CS6 (Windows),D:20180331094507+05'30'
1,pdfs/1-s2.0-S0022437523001639-main.pdf,A pilot study evaluating the preventive effect...,6206,47,7,Johan Fredin-Knutzén,"Journal of Safety Research, 88 (2024) 78-84. d...","Suicide,Accident,Railway,Station,Platform-end ...",Acrobat Distiller 8.1.0 (Windows),Elsevier,D:20240312122711Z
2,pdfs/149.pdf,,5226,45,7,,,,Apogee Create Series3 v1.0,3B2 Total Publishing System 7.51n/W,D:20051222144342
3,pdfs/10.1177_14034948211000836.pdf,,4006,31,6,,,,Adobe PDF Library 9.9,Adobe InDesign CS5.5 (7.5),D:20220526144906+05'30'
4,pdfs/ten.tea.2009.0426.pdf,untitled,7240,61,10,,,,Acrobat Distiller 6.0 (Windows),3B2 Total Publishing System 7.51o/W,D:20100125114559+05'30'


### 516 pages in total
### 2746 chunks in total
### 317485 words in total
### 34M in pdf data
print(f'{sum(stats["pages"])} pages in total')
print(f'{sum(stats["chunks"])} chunks in total')
print(f'{sum(stats["words"])} words in total')
print('34M in pdf data')

In [60]:
models = [
    ('gte-base:latest', '117 MB'),
    ('command-r:v0.1', '20 GB'),
    ('qwen2:0.5b', '352 MB'),
    ('phi3:mini-128k', '2.2 GB'),
    ('mxbai-embed-large:latest', '669 MB'),
    ('all-minilm:latest', '45 MB'),
    ('nomic-embed-text:latest', '274 MB'),
    ('gemma:2b-instruct-v1.1-q2_K', '1.2 GB'),
    ('tinyllama:latest', '637 MB'),
    ('llama3:latest', '4.7 GB')
]

In [68]:
N_TEST_CHUNKS = 10
loader = PyMuPDFLoader('pdfs/1-s2.0-S0022437522001335-main.pdf')
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=80)
test_doc = loader.load()
test_chunks = text_splitter.split_documents(test_doc)[:N_TEST_CHUNKS]

embed_stats = []
start_time = time.time()
for model in models:
    print(f'Begin: {model}')
    model_start = time.time()
    embedding = OllamaEmbeddings(model=model[0])
    embedding.embed_documents(test_chunks)

    model_end = time.time()
    embed_stats.append({
        'model': model[0],
        'size': model[1],
        'time': f'{model_end-model_start}s'
    })

end_time = time.time()
print(f'Total Embed Processing took {end_time-start_time} seconds at {N_TEST_CHUNKS} per model.')
embed_stats = pd.DataFrame(embed_stats)

Begin: ('gte-base:latest', '117 MB')
Begin: ('command-r:v0.1', '20 GB')
Begin: ('qwen2:0.5b', '352 MB')
Begin: ('phi3:mini-128k', '2.2 GB')
Begin: ('mxbai-embed-large:latest', '669 MB')
Begin: ('all-minilm:latest', '45 MB')
Begin: ('nomic-embed-text:latest', '274 MB')
Begin: ('gemma:2b-instruct-v1.1-q2_K', '1.2 GB')
Begin: ('tinyllama:latest', '637 MB')
Begin: ('llama3:latest', '4.7 GB')
Total Embed Processing took 1201.8721401691437 seconds at 10 per model.


In [76]:
embed_stats['n_chunks'] = N_TEST_CHUNKS
embed_stats['chunk_size'] = 1000
embed_stats['overlap'] = 80
embed_stats['time_per_chunk'] = pd.to_numeric(embed_stats['time'].str.extract('([0-9.]+)')[0]) / embed_stats['n_chunks']
embed_stats['estimate_all_chunks'] = embed_stats['time_per_chunk'] * 2746
print(f'Total time all {sum(embed_stats["estimate_all_chunks"])} seconds')
print(f'Total time all {sum(embed_stats["estimate_all_chunks"])/60} minutes')
print(f'Total time all {sum(embed_stats["estimate_all_chunks"])/60/60} hours')

Total time all 330027.4991134167 seconds
Total time all 5500.458318556945 minutes
Total time all 91.67430530928242 hours


In [78]:
(330027.4991134167 - 306812.633707) / 60

386.9144234402784

In [80]:
embed_stats['hours_per_model'] = embed_stats['estimate_all_chunks'] / 60 / 60
embed_stats

Unnamed: 0,model,size,time,n_chunks,chunk_size,overlap,time_per_chunk,estimate_all_chunks,hours_per_model
0,gte-base:latest,117 MB,1.2513840198516846s,10,1000,80,0.125138,343.630052,0.095453
1,command-r:v0.1,20 GB,1117.3074789047241s,10,1000,80,111.730748,306812.633707,85.225732
2,qwen2:0.5b,352 MB,4.989994049072266s,10,1000,80,0.498999,1370.252366,0.380626
3,phi3:mini-128k,2.2 GB,22.486275672912598s,10,1000,80,2.248628,6174.7313,1.715203
4,mxbai-embed-large:latest,669 MB,2.841114044189453s,10,1000,80,0.284111,780.169917,0.216714
5,all-minilm:latest,45 MB,0.7188971042633057s,10,1000,80,0.07189,197.409145,0.054836
6,nomic-embed-text:latest,274 MB,1.6609559059143066s,10,1000,80,0.166096,456.098492,0.126694
7,gemma:2b-instruct-v1.1-q2_K,1.2 GB,11.922788858413696s,10,1000,80,1.192279,3273.997821,0.909444
8,tinyllama:latest,637 MB,6.755856990814209s,10,1000,80,0.675586,1855.15833,0.515322
9,llama3:latest,4.7 GB,31.9133939743042s,10,1000,80,3.191339,8763.417985,2.434283
