In [50]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from langchain_community.embeddings import OllamaEmbeddings
import pandas as pd
import time
import os

In [32]:
directory = 'pdfs/'
pdfs = []
for filename in os.listdir(directory):
    if filename.endswith('.pdf'):
        filepath = os.path.join(directory, filename)
        pdfs.append(filepath)

In [34]:
loader = PyMuPDFLoader('pdfs/1-s2.0-S0022437522001335-main.pdf')
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=80)
documents = loader.load()
documents[0].metadata

{'source': 'pdfs/1-s2.0-S0022437522001335-main.pdf',
 'file_path': 'pdfs/1-s2.0-S0022437522001335-main.pdf',
 'page': 0,
 'total_pages': 6,
 'format': 'PDF 1.7',
 'title': 'A pilot study evaluating the effectiveness of preventing railway suicides by mid-track fencing, which restrict easy access to high-speed train tracks',
 'author': 'Johan Fredin-Knutzén',
 'subject': 'Journal of Safety Research, 83 (2022) 232-237. doi:10.1016/j.jsr.2022.08.019',
 'keywords': '',
 'creator': 'Elsevier',
 'producer': 'Acrobat Distiller 8.0.0 (Windows)',
 'creationDate': "D:20221130162908+05'30'",
 'modDate': "D:20221130163005+05'30'",
 'trapped': ''}

In [39]:
start_time = time.time()
stats = []
for pdf in pdfs:
    loader = PyMuPDFLoader(pdf)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=80)
    documents = loader.load()
    chunks = text_splitter.split_documents(documents)
    word_count = 0
    for chunk in chunks:
        word_count += len(chunk.page_content.split(' '))
    obj = {
        'file': pdf,
        'title': documents[0].metadata['title'],
        'words': word_count,
        'chunks': len(chunks),
        'pages': len(documents),
        'author': documents[0].metadata['author'],
        'subject': documents[0].metadata['subject'],
        'keywords': documents[0].metadata['keywords'],
        'producer': documents[0].metadata['producer'],
        'creator': documents[0].metadata['creator'],
        'created': documents[0].metadata['creationDate'],
    }
    stats.append(obj)
end_time = time.time()
print(f'took {(end_time-start_time)} to chunk {len(stats)} pdfs')

took 1.8941690921783447 to chunk 52 pdfs


In [40]:
stats = pd.DataFrame(stats)
stats.head()

Unnamed: 0,file,title,words,chunks,pages,author,subject,keywords,producer,creator,created
0,pdfs/fpsyt-09-00116.pdf,Decision-Making in Suicidal Behavior: The Prot...,8405,66,9,Gergö Hadlaczky,Background: Loss aversion is a central and wel...,"loss aversion, decision-making, suicide, attem...",Adobe PDF Library 10.0.1,Adobe InDesign CS6 (Windows),D:20180331094507+05'30'
1,pdfs/1-s2.0-S0022437523001639-main.pdf,A pilot study evaluating the preventive effect...,6206,47,7,Johan Fredin-Knutzén,"Journal of Safety Research, 88 (2024) 78-84. d...","Suicide,Accident,Railway,Station,Platform-end ...",Acrobat Distiller 8.1.0 (Windows),Elsevier,D:20240312122711Z
2,pdfs/149.pdf,,5226,45,7,,,,Apogee Create Series3 v1.0,3B2 Total Publishing System 7.51n/W,D:20051222144342
3,pdfs/10.1177_14034948211000836.pdf,,4006,31,6,,,,Adobe PDF Library 9.9,Adobe InDesign CS5.5 (7.5),D:20220526144906+05'30'
4,pdfs/ten.tea.2009.0426.pdf,untitled,7240,61,10,,,,Acrobat Distiller 6.0 (Windows),3B2 Total Publishing System 7.51o/W,D:20100125114559+05'30'


### 516 pages in total
### 2746 chunks in total
### 317485 words in total
### 34M in pdf data
print(f'{sum(stats["pages"])} pages in total')
print(f'{sum(stats["chunks"])} chunks in total')
print(f'{sum(stats["words"])} words in total')
print('34M in pdf data')

In [45]:
models = [
    'flan-t5-xl:latest',
    'gte-base:latest',
    'command-r:v0.1',
    'qwen2:0.5b',
    'phi3:mini-128k',
    'mxbai-embed-large:latest',
    'all-minilm:latest',
    'nomic-embed-text:latest',
    'gemma:2b-instruct-v1.1-q2_K',
    'tinyllama:latest',
    'llama3:latest'
]

In [49]:
N_TEST_CHUNKS = 10
loader = PyMuPDFLoader('pdfs/1-s2.0-S0022437522001335-main.pdf')
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=80)
test_doc = documents = loader.load()
test_chunks = text_splitter.split_documents(test_doc)[:N_TEST_CHUNKS]

embed_stats = []
start_time = time.time()
for model in models:
    embedding = OllamaEmbeddings(model='mxbai-embed-large:latest')
    temp = embedding.embed_query('This is a quick text')

end_time = time.time()
print(f'Total Processing took {end_time-start_time} seconds at {N_TEST_CHUNKS} per model.')

15

In [51]:
embedding = OllamaEmbeddings(model='mxbai-embed-large:latest') # optionally self host ollama and use http://localhost:11434
temp = embedding.embed_query('This is a quick text')
len(temp)

1024