In [1]:
from dotenv import load_dotenv
import os
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.auto import tqdm

from langchain_neo4j import Neo4jGraph

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the project's .env file holding the Neo4j connection settings.
ENV_FILE = 'C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review/.env'

# load_dotenv() does not raise when the given path is missing, so on another
# machine the hard-coded path would leave the settings below unloaded without
# any error. Fall back to python-dotenv's default lookup (the nearest .env
# found by searching upward from the working directory) in that case.
if os.path.isfile(ENV_FILE):
    load_dotenv(ENV_FILE, override=True)
else:
    load_dotenv(override=True)

# Each value is None when the corresponding variable is not set.
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USERNAME = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
NEO4J_DATABASE = os.getenv("NEO4J_DATABASE")

In [3]:
# Connect to the Neo4j instance described by the settings loaded from .env.
# NEO4J_DATABASE was read but never used; pass it explicitly so the target
# database is visible here rather than left to Neo4jGraph's own fallback.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

# Sanity check: count every node currently stored in the graph.
cypher = """
MATCH (n)
RETURN count(n) AS numberOfNodes
"""

result = kg.query(cypher)
result

[{'numberOfNodes': 878}]

In [4]:
# Remove the existing vector index so it can be recreated further down.
# IF EXISTS keeps this cell re-runnable: a plain DROP INDEX raises when the
# index is absent (for example on a fresh database or a second run).
kg.query("""
    DROP INDEX abstract_embeddings IF EXISTS
  """
)

[]

# Prepping text for RAG

In [5]:
# Disabled alternative embedding backend (sentence-transformers MiniLM).
# from sentence_transformers import SentenceTransformer

# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Hugging Face checkpoint id to load. The commented lines are other ids that
# can be swapped in; only the last assignment is active.
# model_id = "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf"
# model_id = "meta-llama/Llama-3.1-8B-Instruct"
# model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_id = "meta-llama/Llama-3.2-1B"

# Tokenizer and model are loaded from the same checkpoint; compute_embedding
# (defined right after) runs text through this pair.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

def compute_embedding(text):
    """Return one embedding vector for ``text`` as a plain list of floats.

    The text is tokenized with the notebook-level ``tokenizer`` and passed
    through ``model`` with gradients disabled. The model's
    ``last_hidden_state`` is averaged over dim 1 and the leading dimension is
    squeezed away before converting to a list.
    """
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze(0).tolist()


In [6]:
# Fetch the element id and abstract of every Paper node; the rows are kept in
# `result` for the embedding loop that follows.
paper_abstracts_query = """
    MATCH (p:Paper) 
    RETURN elementId(p) AS node_id, p.abstract AS abstract
    """
result = kg.query(paper_abstracts_query)

In [7]:
# Embed each paper's abstract and store the vector on its node as
# `abstractEmbedding`. Papers with a missing or empty abstract are skipped.
for record in tqdm(result):
    node_id = record["node_id"]
    abstract = record["abstract"]

    if not abstract:
        continue

    embedding = compute_embedding(abstract)
    # The write query no longer ends in RETURN: the returned rows were never
    # read, so sending every abstract back only added traffic per node.
    kg.query("""
        MATCH (p:Paper) WHERE elementId(p) = $node_id
        SET p.abstractEmbedding = $embedding
        """, params={"node_id": node_id, "embedding": embedding}
    )

100%|██████████| 771/771 [16:14<00:00,  1.26s/it]


In [8]:
# Create the cosine-similarity vector index over Paper.abstractEmbedding.
# The dimension is taken from the loaded model's config instead of the literal
# 2048, so the index keeps matching the vectors produced by compute_embedding
# (a mean over last_hidden_state) when model_id is switched to another
# checkpoint.
kg.query("""
  CREATE VECTOR INDEX abstract_embeddings IF NOT EXISTS
  FOR (p:Paper) ON (p.abstractEmbedding) 
  OPTIONS { indexConfig: {
    `vector.dimensions`: toInteger($dimensions),
    `vector.similarity_function`: 'cosine'
  }}""",
  params={"dimensions": model.config.hidden_size}
)

[]

In [53]:
# List the vector indexes defined in the database (name, state, population).
show_vector_indexes_query = """
  SHOW VECTOR INDEXES
  """
kg.query(show_vector_indexes_query)

[{'id': 2,
  'name': 'abstract_embeddings',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Paper'],
  'properties': ['abstractEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2025, 1, 20, 20, 18, 58, 444000000, tzinfo=<UTC>),
  'readCount': 61}]

In [12]:
# Free-text query to match against the stored abstract embeddings.
question = "Transformer architecture"
question_embedding = compute_embedding(question)

# Nearest-neighbour lookup in the abstract_embeddings index; returns title,
# abstract and similarity score for the five closest papers.
nearest_papers_query = """
    CALL db.index.vector.queryNodes(
        'abstract_embeddings', 
        $top_k, 
        $question_embedding
        ) YIELD node AS paper, score
    RETURN paper.title, paper.abstract, score
    """
search_params = {
    "top_k": 5,
    "question_embedding": question_embedding,
}
kg.query(nearest_papers_query, params=search_params)

[{'paper.title': 'LightFace: A Hybrid Deep Face Recognition Framework',
  'paper.abstract': 'Face recognition constitutes a relatively a popular area which has emerged from the rulers of the social media to top universities in the world. Those frontiers and rule makers recently designed deep learning based custom face recognition models. A modern face recognition pipeline consists of four common stages: detecting, alignment, representation and verification. However, face recognition studies mainly mention the representation stage of a pipeline. In this paper, first of all a review face recognition has been done and then the description of the developed lightweight hybrid high performance face recognition framework has been made. Its hybrid feature enables to switch face recognition models among state-of-the-art ones.',
  'score': 0.8669426441192627},
 {'paper.title': 'GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium',
  'paper.abstract': 'When it comes 

# Create Fulltext index for title searching

In [None]:
# Create the full-text index that the title lookup in the next cell queries.
# The statement used to be commented out, so the lookup failed on any database
# where the index had not been created by hand. IF NOT EXISTS makes the cell
# safe to run on every pass.
kg.query("""
    CREATE FULLTEXT INDEX paperTitleIndex IF NOT EXISTS FOR (p:Paper) ON EACH [p.title]
    """
)

In [None]:

# Look up the best full-text match for a paper title and return its paperId.
# NOTE(review): the title is passed to the full-text index as a query string;
# confirm how titles containing query-syntax characters should be handled.
title = 'Latent Space Editing in Transformer-Based Flow Matching'
title_lookup_query = """
    CALL db.index.fulltext.queryNodes('paperTitleIndex', $title)     
    YIELD node, score
    RETURN node.paperId, score
    LIMIT 1
    """
kg.query(title_lookup_query, params={'title': title})

[{'node.paperId': 'ca743e75ce090bbf686307e41bd8747661768fbe',
  'score': 12.453763008117676}]

# Loading PDFs and adding to KG

In [None]:
import requests

# Papers to download. A paper's position in this list becomes its file index
# (documents/paper_<index>.pdf), so entries must not be reordered or removed.
pdf_urls = [
    'https://arxiv.org/pdf/2312.10825',
    'https://arxiv.org/pdf/2211.13227',
    'https://arxiv.org/pdf/2312.04410',
    "https://arxiv.org/pdf/2210.05559",
    'https://arxiv.org/pdf/2312.07330',
    'https://www.mdpi.com/1999-4893/17/3/125',
    "https://arxiv.org/pdf/2412.05984",
    "https://arxiv.org/pdf/2210.06462",
    # ScienceDirect article (pii S2468502X24000019). The previous entry was a
    # pre-signed S3 link (X-Amz-Expires=300) that embedded a session token,
    # was split across two lines inside the string literal, and expires
    # minutes after it is issued. This is the stable article page, not a
    # direct PDF link; the processing loop already skips index 8.
    "https://www.sciencedirect.com/science/article/pii/S2468502X24000019",
    "https://arxiv.org/pdf/2302.05543",
    "https://arxiv.org/pdf/2210.10960",
    "https://arxiv.org/pdf/2307.12868",
    # Same URL as index 3; kept so the indices of later papers do not shift.
    "https://arxiv.org/pdf/2210.05559",
    "https://arxiv.org/pdf/2208.01626",
    "https://arxiv.org/pdf/2112.05744",
    "https://arxiv.org/pdf/2401.18085",
    "https://arxiv.org/pdf/2104.00820",
    "https://arxiv.org/pdf/2402.17723",
    "https://arxiv.org/pdf/2303.11073",
    "https://arxiv.org/pdf/2212.08698",
    "https://arxiv.org/pdf/2205.12952",
    "https://arxiv.org/pdf/2004.05571",
    "https://arxiv.org/pdf/2204.11824v1",
    "https://arxiv.org/pdf/2111.15640",
    "https://arxiv.org/pdf/2211.12572",
    "https://arxiv.org/pdf/2106.05744"
]

# Download every URL to documents/paper_<index>.pdf, where <index> is the
# URL's position in pdf_urls. A failed download is reported and skipped, so
# the file numbering of the remaining papers is unaffected.
os.makedirs("documents", exist_ok=True)  # open() below fails if the folder is missing

for i, pdf_url in enumerate(pdf_urls):
    try:
        # Without a timeout a stalled server would block the loop indefinitely.
        response = requests.get(pdf_url, timeout=60)
    except requests.RequestException as error:
        # Connection errors and timeouts used to abort the whole loop.
        print(f"Failed to download paper_{i}.pdf: {error}")
        continue

    if response.status_code == 200:
        with open(f"documents/paper_{i}.pdf", 'wb') as file:
            file.write(response.content)
        print(f"PDF saved as paper_{i}.pdf")
    else:
        print(f"Failed to download PDF. Status code: {response.status_code}")


In [46]:
import pymupdf4llm
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

# Splitter used to cut each paper's markdown into chunks of up to 3000
# characters; only the first chunk is searched for the abstract.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 3000,
    chunk_overlap  = 300,
    length_function = len,
    is_separator_regex = False,
)

# Folder holding the downloaded paper_<index>.pdf files.
DOCUMENTS_DIR = "C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review/documents"
# Indices that are deliberately not processed.
# NOTE(review): presumably the MDPI and ScienceDirect downloads are not usable
# PDFs - confirm.
SKIPPED_PAPERS = {5, 8}

# Case-insensitive match of an "abstract" heading written as markdown bold
# (**) or a #-heading; captures the text up to the next bold/# heading or the
# end of the input. Defined once because it does not change per paper.
abstract_pattern = r"(?i)(?:^|\n)\s*(?:\*\*|#+)\s*abstract\s*(?:\*\*|#+)?\s*\n+([\s\S]*?)(?=\n\s*(?:\*\*|#+)\s*\w+|\Z)"

# Discover the downloaded papers from the folder itself. The former range(25)
# stopped at paper_24 although 26 URLs (paper_0 .. paper_25) are downloaded,
# and it raised on any index whose file was missing.
paper_ids = []
for file_name in os.listdir(DOCUMENTS_DIR):
    numbered = re.fullmatch(r"paper_(\d+)\.pdf", file_name)
    if numbered:
        paper_ids.append(int(numbered.group(1)))

for i in sorted(paper_ids):
    if i in SKIPPED_PAPERS:
        continue
    md_text = pymupdf4llm.to_markdown(f"{DOCUMENTS_DIR}/paper_{i}.pdf")

    split_text = text_splitter.split_text(md_text)

    # split_text is empty when no text could be extracted from the PDF.
    match = re.search(abstract_pattern, split_text[0]) if split_text else None
    if match is None:
        # Previously the literal "Abstract not found" was embedded and searched
        # as if it were the abstract, which produced meaningless scores.
        print(f"Skipping paper_{i}.pdf: abstract not found")
        continue

    # Collapse line breaks and underscores into spaces, then drop "- "
    # sequences (hyphenation left over from line wrapping).
    abstract = re.sub(r"[\n_]+", " ", match.group(1)).strip()
    abstract = abstract.replace("- ", "")
    paper_embedding = compute_embedding(abstract)

    # Three nearest stored abstracts for this paper's abstract.
    results = kg.query("""
        CALL db.index.vector.queryNodes(
            'abstract_embeddings', 
            $top_k, 
            $paper_embedding
            ) YIELD node AS paper, score
        RETURN paper.title, paper.abstract, score
        """, 
        params={"top_k":3,
                "paper_embedding": paper_embedding
                })

    scores = [x['score'] for x in results]
    print(scores)

Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review/documents/paper_0.pdf...
[0.9997963905334473, 0.9809634685516357, 0.9802992343902588]
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review/documents/paper_1.pdf...
[0.9934802055358887, 0.9789113998413086, 0.9761033058166504]
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review/documents/paper_2.pdf...
[0.9837110042572021, 0.9725377559661865, 0.9714546203613281]
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review/documents/paper_3.pdf...
[0.9985926151275635, 0.9832584857940674, 0.982003927230835]
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review/documents/paper_4.pdf...
[0.9932358264923096, 0.9775490760803223, 0.9749095439910889]
Processing C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review/documents/paper_6.pdf...
[0.9983997344970703, 0.9783437252044678, 0.9763391017913818]
Processing C:/Users/tjker/Desktop/Researc

In [None]:
# Global constants
# NOTE(review): a later cell assigns VECTOR_INDEX_NAME = 'form_10k_chunks';
# whichever cell runs last decides the value - confirm which one is intended.
VECTOR_INDEX_NAME = 'paper_chunks'  # name of the vector index over chunk nodes
VECTOR_NODE_LABEL = 'Chunk'  # label of the nodes that carry chunk text
VECTOR_SOURCE_PROPERTY = 'text'  # node property holding the raw chunk text
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'  # node property holding the embedding

# Constructing a KG from text documents
**TODO:** this is the next item on my list.
See the course at https://learn.deeplearning.ai/courses/knowledge-graphs-rag for help.

In [None]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI


# Warning control
import warnings
# Adds an "ignore" filter with no category or module restriction, so every
# warning raised after this cell runs is hidden, deprecation notices included.
warnings.filterwarnings("ignore")

In [None]:
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Note the code below is unique to this course environment, and not a
# standard part of Neo4j's integration with OpenAI. Remove if running
# in your own environment.
# OPENAI_BASE_URL may be unset outside that environment. Concatenating onto
# the None returned by os.getenv() raised TypeError, so fall back to the
# public OpenAI API base URL.
OPENAI_ENDPOINT = (os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1') + '/embeddings'

# Global constants
# NOTE(review): VECTOR_INDEX_NAME is also set to 'paper_chunks' in an earlier
# cell; this assignment replaces it - confirm which name is intended.
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [None]:
import pymupdf4llm

# Text splitter: chunks of up to 2000 characters (length measured with len)
# with 200 characters of overlap between consecutive chunks.
# NOTE(review): this rebinds text_splitter, which an earlier cell creates with
# chunk_size 3000 / chunk_overlap 300; later cells see whichever ran last.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

def create_paper_chunks(file):
    """Convert one PDF to markdown and split it into chunk records.

    The function previously built the markdown text and then fell off the
    end, returning None. It now finishes the job with the notebook-level
    ``text_splitter``.

    Args:
        file: Path of the PDF to read with pymupdf4llm.

    Returns:
        A list of dicts in document order, one per chunk, with the keys
        'text' (chunk text), 'chunkSeqId' (0-based position), 'chunkId'
        ('<file name without extension>-chunk<NNNN>') and 'source' (the
        path that was passed in).
    """
    chunks_with_metadata = []
    md_text = pymupdf4llm.to_markdown(file)

    # File name without directory or extension, used to build unique chunk ids.
    paper_id = os.path.splitext(os.path.basename(file))[0]

    for chunk_seq_id, chunk in enumerate(text_splitter.split_text(md_text)):
        chunks_with_metadata.append({
            'text': chunk,
            'chunkSeqId': chunk_seq_id,
            'chunkId': f'{paper_id}-chunk{chunk_seq_id:04d}',
            'source': file,
        })
    return chunks_with_metadata

In [None]:
def split_form10k_data_from_file(file, splitter=None, max_chunks=20):
    """Split the text items of one Form 10-K JSON file into chunk records.

    Args:
        file: Path to a JSON file with the keys 'item1', 'item1a', 'item7',
            'item7a', 'names', 'cik', 'cusip6' and 'source'.
        splitter: Object exposing ``split_text(str)`` that returns a list of
            strings. Defaults to the notebook-level ``text_splitter``.
        max_chunks: Maximum number of chunks kept per item (default 20).

    Returns:
        A list of dicts, one per kept chunk, carrying the chunk text, its
        item name and sequence number, the form id derived from the file
        name, a 'chunkId' of the form '<formId>-<item>-chunk<NNNN>', and the
        file-level 'names', 'cik', 'cusip6' and 'source' values.
    """
    if splitter is None:
        splitter = text_splitter

    # Read inside a context manager so the file handle is closed right away;
    # json.load(open(file)) left it open.
    with open(file) as handle:
        file_as_object = json.load(handle)

    # Form id = file name without directory or extension. It does not change
    # per chunk, so compute it once. os.path also accepts bare file names,
    # where the former file.rindex('/') raised ValueError.
    form_id = os.path.splitext(os.path.basename(file))[0]

    chunks_with_metadata = []  # accumulates the chunk records
    for item in ['item1', 'item1a', 'item7', 'item7a']:  # keys pulled from the json
        print(f'Processing {item} from {file}')
        item_text_chunks = splitter.split_text(file_as_object[item])
        kept_chunks = item_text_chunks[:max_chunks]  # only the first max_chunks chunks
        for chunk_seq_id, chunk in enumerate(kept_chunks):
            chunks_with_metadata.append({
                'text': chunk,
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                'formId': f'{form_id}',  # pulled from the filename
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                'names': file_as_object['names'],
                'cik': file_as_object['cik'],
                'cusip6': file_as_object['cusip6'],
                'source': file_as_object['source'],
            })
        print(f'\tSplit into {len(kept_chunks)} chunks')
    return chunks_with_metadata

In [None]:
# Cypher that merges one :Chunk node keyed by chunkId from the $chunkParam map.
# The remaining properties are written only under ON CREATE SET, i.e. when the
# MERGE creates the node; a node that already exists is returned unchanged.
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.names = $chunkParam.names,
        mergedChunk.formId = $chunkParam.formId, 
        mergedChunk.cik = $chunkParam.cik, 
        mergedChunk.cusip6 = $chunkParam.cusip6, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.f10kItem = $chunkParam.f10kItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

In [None]:
# Enforce that no two :Chunk nodes share a chunkId; a no-op when the
# constraint is already present.
unique_chunk_constraint = """
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
"""
kg.query(unique_chunk_constraint)

In [None]:
# Merge one :Chunk node per chunk record and report how many were processed.
# NOTE(review): first_file_chunks is not assigned in any visible cell -
# confirm where it is produced before running this cell on a fresh kernel.
node_count = 0
for chunk in first_file_chunks:
    chunk_id = chunk['chunkId']
    print(f"Creating `:Chunk` node for chunk ID {chunk_id}")
    kg.query(merge_chunk_node_query, params=dict(chunkParam=chunk))
    node_count = node_count + 1
print(f"Created {node_count} nodes")

In [None]:
# Cosine-similarity vector index over Chunk.textEmbedding with 1536 dimensions.
create_chunk_index_query = """
         CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
"""
kg.query(create_chunk_index_query)

In [None]:
# List every index in the database, not only the vector indexes.
kg.query("SHOW INDEXES")

In [None]:
# For every Chunk that has no textEmbedding yet, have the database compute one
# with genai.vector.encode (OpenAI provider) and store it on the node.
encode_chunks_query = """
    MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
    WITH chunk, genai.vector.encode(
      chunk.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
    """
openai_params = {
    "openAiApiKey": OPENAI_API_KEY,
    "openAiEndpoint": OPENAI_ENDPOINT,
}
kg.query(encode_chunks_query, params=openai_params)

In [None]:
def neo4j_vector_search(question, top_k=10):
  """Search for similar nodes using the Neo4j vector index.

  The question is embedded inside the database with genai.vector.encode
  (OpenAI provider) and compared against the index named by
  VECTOR_INDEX_NAME.

  Args:
    question: Text to search for.
    top_k: Number of nearest nodes to request. Defaults to 10, the value
      that used to be hard-coded.

  Returns:
    The rows returned by kg.query, each with 'score' and 'text'.
  """
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text
  """
  similar = kg.query(vector_search_query, 
                     params={
                      'question': question, 
                      'openAiApiKey':OPENAI_API_KEY,
                      'openAiEndpoint': OPENAI_ENDPOINT,
                      'index_name':VECTOR_INDEX_NAME, 
                      'top_k': top_k})
  return similar

In [None]:
# Example vector search; the returned rows are kept in search_results and are
# not displayed by this cell.
search_results = neo4j_vector_search(
    'In a single sentence, tell me about Netapp.'
)

In [None]:
# Wrap the existing chunk nodes and their vector index as a LangChain vector
# store, then expose it as a retriever for the QA chain.
openai_embeddings = OpenAIEmbeddings()
vector_store_settings = {
    "url": NEO4J_URI,
    "username": NEO4J_USERNAME,
    "password": NEO4J_PASSWORD,
    "index_name": VECTOR_INDEX_NAME,
    "node_label": VECTOR_NODE_LABEL,
    "text_node_properties": [VECTOR_SOURCE_PROPERTY],
    "embedding_node_property": VECTOR_EMBEDDING_PROPERTY,
}
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=openai_embeddings,
    **vector_store_settings,
)

retriever = neo4j_vector_store.as_retriever()

In [None]:
# Question-answering chain over the retriever, using the "stuff" chain type
# and a chat model created with temperature 0.
chat_model = ChatOpenAI(temperature=0)
chain = RetrievalQAWithSourcesChain.from_chain_type(
    chat_model,
    chain_type="stuff",
    retriever=retriever,
)

In [None]:
def prettychain(question: str) -> None:
    """Ask the QA chain a question and print the answer wrapped at 60 columns.

    Args:
        question: Question passed to the chain under the "question" key.

    Returns:
        None. The answer is printed rather than returned; the previous
        ``-> str`` annotation did not match that.
    """
    response = chain({"question": question},
        return_only_outputs=True,)
    print(textwrap.fill(response['answer'], 60))

In [None]:
# Example question for the QA chain; prettychain prints the wrapped answer.
prettychain("""
    Tell me about Apple. 
    Limit your answer to a single sentence.
    If you are unsure about the answer, say you don't know.
""")