In [None]:
import json
import sys

# Make the project's helper module importable before importing from it.
# NOTE(review): absolute, machine-specific path — the notebook will not run on
# another machine unless this location is made configurable.
sys.path.append('C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review')
import utils

# Same approach for the configs directory, which provides build_kg_config.
# NOTE(review): same portability concern as above.
sys.path.append('C:/Users/tjker/Desktop/Research/Projects/lit_review/configs')
from build_kg_config import config
# Knowledge-graph handle used by the kg.query(...) calls in the cells below.
# presumably a graph-database connection built from config — verify in utils.load_kg.
kg = utils.load_kg(config)

Now that all of this data is organized, we load each PDF, split its text into overlapping chunks, and create a `Chunk` node in the knowledge graph for each chunk.

In [4]:
import pymupdf4llm
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Global constants
VECTOR_INDEX_NAME = 'paper_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

# Splitter for the markdown extracted from each PDF. Sizes are measured in
# characters because length_function is the built-in len; consecutive chunks
# share up to 250 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2500,
    chunk_overlap=250,
    length_function=len,
    is_separator_regex=False,
)

# Load the list of paper records that the ingest loop iterates over.
papers_json_path = f"{config['data']['data_path']}/paper_node_to_pdf_with_url.json"
with open(papers_json_path, 'r') as papers_fh:
    updated_data = json.load(papers_fh)

def paper_data_from_file(paper, text_splitter):
    """Convert one paper's PDF to markdown and split it into chunk records.

    Args:
        paper: Mapping providing 'pdf_path' (handed to
            pymupdf4llm.to_markdown) and 'paperId'.
        text_splitter: Object whose split_text(str) returns the chunk strings.

    Returns:
        A list with one dict per chunk, in split order, holding the keys
        'text', 'paperId' and 'chunkId' ('<chunk index>_<paperId>').
    """
    markdown = pymupdf4llm.to_markdown(paper['pdf_path'])
    pieces = text_splitter.split_text(markdown)
    records = [
        {
            'text': piece,
            'paperId': paper['paperId'],
            'chunkId': f"{index}_{paper['paperId']}",
        }
        for index, piece in enumerate(pieces)
    ]
    print(f'\tSplit into {len(pieces)} chunks')
    return records

# Uniqueness constraint on Chunk.chunkId; IF NOT EXISTS makes re-running this
# cell harmless.
unique_chunk_constraint_query = """
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
"""
kg.query(unique_chunk_constraint_query)

# MERGE keyed on chunkId: paperId and text are written only when the node is
# first created (ON CREATE), so an existing chunk is matched and left as-is.
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.paperId = $chunkParam.paperId, 
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

# Ingest every paper: chunk its PDF, then issue one MERGE per chunk.
# node_count is a running total of MERGE queries issued across all papers.
node_count = 0
for paper in updated_data:
    for chunk_record in paper_data_from_file(paper, text_splitter):
        merge_params = {'chunkParam': chunk_record}
        kg.query(merge_chunk_node_query, params=merge_params)
        node_count += 1
    print(f"Created {node_count} nodes")

Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/data/paper_pdfs/ca743e75ce090bbf686307e41bd8747661768fbe.pdf...
	Split into 27 chunks
Created 27 nodes
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/data/paper_pdfs/e342165a614588878ad0f4bc9bacf3905df34d08.pdf...
	Split into 74 chunks
Created 101 nodes
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/data/paper_pdfs/4f1502111d35aa6651dfaedfeb1184b3c3dd2fcb.pdf...
	Split into 26 chunks
Created 127 nodes
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/data/paper_pdfs/3333fa6dc9d39cad3d5cd87da9ae39e5a6aefe27.pdf...
	Split into 28 chunks
Created 155 nodes
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/data/paper_pdfs/36a5328c337697b96c9e6a9a04df0c924aa421f7.pdf...
	Split into 20 chunks
Created 175 nodes
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/data/paper_pdfs/0bbd619ad6dfb69114735d6d8ca166c20301188b.pdf...
	Split into 26 chunks
Created 201 nodes

In [5]:
# Create the vector index over chunk embeddings (no-op when it already exists).
# The index name, node label and embedding property come from the VECTOR_*
# constants declared with the text splitter, so this DDL cannot drift from them.
# NOTE(review): VECTOR_DIMENSIONS must equal the length of the vectors stored in
# the embedding property (the embedding model's output size) — confirm against
# the model named in config['model']['model_id'].
VECTOR_DIMENSIONS = 2048
VECTOR_SIMILARITY_FUNCTION = 'cosine'

# Doubled braces are literal braces in the f-string.
kg.query(f"""
         CREATE VECTOR INDEX `{VECTOR_INDEX_NAME}` IF NOT EXISTS
          FOR (c:{VECTOR_NODE_LABEL}) ON (c.{VECTOR_EMBEDDING_PROPERTY}) 
          OPTIONS {{ indexConfig: {{
            `vector.dimensions`: {VECTOR_DIMENSIONS},
            `vector.similarity_function`: '{VECTOR_SIMILARITY_FUNCTION}'    
         }}}}
""")

[]

In [6]:
# Fetch the element id and text of every Chunk node; the embedding loop
# iterates over these records.
fetch_chunks_query = """
    MATCH (c:Chunk) 
    RETURN elementId(c) AS chunk_id, c.text AS text
    """
result = kg.query(fetch_chunks_query)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.auto import tqdm

# Tokenizer and model are both loaded from the single model id in the config.
embedding_model_id = config['model']['model_id']
tokenizer = AutoTokenizer.from_pretrained(embedding_model_id)
model = AutoModel.from_pretrained(embedding_model_id)

def compute_embedding(text):
    """Embed ``text`` as the mean of the model's final-layer token states.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to embed.

    Returns:
        A flat list of floats (the token-averaged last hidden state).
    """
    # truncation=True clips the input to the tokenizer's configured maximum
    # length, so an over-long chunk is shortened instead of being fed to the
    # model at a length it may not support.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    # Average over dim 1, then drop dim 0.
    # assumes last_hidden_state is (1, seq_len, hidden) for a single input string — TODO confirm
    return outputs.last_hidden_state.mean(dim=1).squeeze(0).tolist()


# Writes the computed vector onto the node identified by its element id.
set_embedding_query = """
            MATCH (c:Chunk) 
            WHERE elementId(c) = $chunk_id
            SET c.textEmbedding = $embedding
            RETURN elementId(c) AS chunk_id, c.textEmbedding AS embedding
            """

# Embed each fetched chunk and store the result; one query per chunk.
for record in tqdm(result):
    chunk_id = record["chunk_id"]
    text = record["text"]

    if not text:
        # Chunks with missing or empty text are skipped.
        continue

    embedding = compute_embedding(text)
    kg.query(
        set_embedding_query,
        params={"chunk_id": chunk_id, "embedding": embedding},
    )

  0%|          | 0/714 [00:00<?, ?it/s]