In [1]:
import json
import sys
from tqdm.auto import tqdm

sys.path.append('C:/Users/tjker/Desktop/Research/Projects/lit_review/lit_review')
import utils
import rag_utils as rag
import kg_builder as kgb

sys.path.append('C:/Users/tjker/Desktop/Research/Projects/lit_review/configs')
from create_chunks_config import config
kg = utils.load_kg(config)

In [2]:
kg.query("""
    MATCH (c:Chunk) 
    DETACH DELETE c
    """
)

kg.query(""" 
DROP INDEX paper_chunks IF EXISTS
"""
)

[]

Now that we have all of this nice data organized we are going to load in each pdf and chunk it up and then create a Chunk node for each chunk

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

with open(f"{config['data']['data_path']}/{config['data']['paper_chunk_output_name']}", 'r') as file:
    updated_data = json.load(file)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = config['chunks']['chunk_size'],
    chunk_overlap  = config['chunks']['chunk_overlap'],
    length_function = len,
    is_separator_regex = False,
)

kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")

merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.paperId = $chunkParam.paperId, 
        mergedChunk.source = $chunkParam.paperId,
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

node_count = 0
for paper in tqdm(updated_data):
    chunks = kgb.paper_data_from_file(paper, text_splitter)
    for chunk in chunks:
        kg.query(merge_chunk_node_query, 
                params={
                    'chunkParam': chunk
                })
        node_count += 1
    # break
print(f"Created {node_count} nodes")

  0%|          | 0/27 [00:00<?, ?it/s]

Created 4118 nodes


In [4]:
kg.query("""
  CREATE VECTOR INDEX `paper_chunks` IF NOT EXISTS
  FOR (c:Chunk) ON (c.textEmbedding) 
  OPTIONS { indexConfig: {
    `vector.dimensions`: $dimension,
    `vector.similarity_function`: $similarity    
  }}""", params={'dimension': config['embedding']['size'], 'similarity': config['embedding']['similarity']}
)

[]

In [5]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer(config['embedding']['model_id'])

all_chunk_nodes = kg.query("""
    MATCH (c:Chunk) 
    RETURN elementId(c) AS chunk_id, c.text AS text
    """
)

for record in tqdm(all_chunk_nodes):
    chunk_id = record["chunk_id"]
    text = record["text"]
    
    if text:
        # embedding = rag.compute_embedding(text, tokenizer, model)
        embedding = model.encode(text)
        kg.query("""
            MATCH (c:Chunk) 
            WHERE elementId(c) = $chunk_id
            SET c.textEmbedding = $embedding
            RETURN elementId(c) AS chunk_id, c.textEmbedding AS embedding
            """, params={"chunk_id":chunk_id, "embedding":embedding}
        )

  0%|          | 0/3999 [00:00<?, ?it/s]

In [45]:
from langchain_neo4j import Neo4jVector
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain

from dotenv import load_dotenv
import textwrap

load_dotenv('C:/Users/tjker/Desktop/Research/Projects/lit_review/.env', override=True)

llm = HuggingFaceEndpoint(model=config['model']['model_id'])

chunk_vector = Neo4jVector.from_existing_index(
    HuggingFaceEmbeddings(model_name=config['embedding']['model_id']),
    graph=kg,
    index_name='paper_chunks',
    embedding_node_property='textEmbedding',
    text_node_property='text',
)

retriever_20 = chunk_vector.as_retriever(search_kwargs={"k": 20})
retriever_0 = chunk_vector.as_retriever()
chunk_retriever = RetrievalQA.from_llm(llm=llm, retriever=retriever_20)

In [35]:
question = "What are some ways that you can leverage the structure of a latent space to influence generation?"
response = chunk_retriever.invoke({"query": question})

print(textwrap.fill(response['result'], 60))

 You can leverage the structure of a latent space to
influence generation in various ways, such as manipulating
the latent variables, using pre-trained visual encoders to
modulate the capacity, injecting noise, or using
crossattention modules to capture semantic relationships.


In [36]:
# question = "How can we uncover the underlying structure of the latent space in GANs and Diffusion Models?"
# result = chunk_vector.similarity_search(question, k=3)
# for doc in result:
#     print(doc.metadata["chunkId"], "-", doc.page_content)

In [46]:
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever_0
)

In [47]:
question = "What are some ways that the structure of a latent space can be leveraged to influence the generation process?"
response = chain.invoke({"question": question},
        return_only_outputs=True,)

# print(textwrap.fill(response['result'], 60))
response

{'answer': ' The structure of a latent space can be leveraged to influence the generation process through its evolution.\n',
 'sources': 'ca743e75ce090bbf686307e41bd8747661768fbe, d7074976c2609568902a6b6ca45f6c71d9cb66bf'}

In [31]:
import textwrap
def prettychain(question: str) -> str:
    """Pretty print the chain's response to a question"""
    response = chain.invoke({"question": question},
        return_only_outputs=True,)
    print(textwrap.fill(response['answer'], 60))
    
prettychain('who wrote Self-Guided Diffusion Models?')

 Vincent Tao Hu, David W Zhang, Yuki M Asano, Gertjan J
Burghouts, and Cees G M Snoek.


In [48]:
chain.invoke({"question": 'who wrote Self-Guided Diffusion Models?'})

{'question': 'who wrote Self-Guided Diffusion Models?',
 'answer': ' Vincent Tao Hu, David W Zhang, Yuki M Asano, Gertjan J Burghouts, and Cees G M Snoek.\n',
 'sources': ''}

In [49]:
chain.invoke({"question": 'who wrote DiGress: Discrete Denoising diffusion for graph generation?'})

{'question': 'who wrote DiGress: Discrete Denoising diffusion for graph generation?',
 'answer': ' Denoising diffusion probabilistic models were written by Jonathan Ho, Ajay Jain, and Pieter Abbeel.\n',
 'sources': '33f3f31f871070f19b0c3e967a24e322bfc178f2, 33, 280, 22, 281, 23, and 282.'}