<a href="https://colab.research.google.com/github/tamoghna21/RAG_LLM/blob/main/2a_GraphRAG_Neo4j.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Question answering over a graph database (GraphRAG implementation) using an open-source LLM (Mistral-7B-Instruct-v0.2) and LangChain

#### Before running: select a GPU runtime (Runtime > Change runtime type > GPU)

In [None]:
# Install the Python packages this notebook uses (-q keeps pip output short).
!pip install -q langchain tiktoken neo4j python-dotenv transformers
#!pip install -q wikipedia
!pip install -q langchain-community # Required for langchain.graphs
!pip install -q langchain-core
# NOTE(review): sagemaker is not referenced by any cell visible here — confirm it is still needed.
!pip install -U -q sagemaker
!pip install -q torch # for the LLM model
#!pip install -q ragatouille

# Presumably required for the quantized model load further down
# (device_map='auto' and BitsAndBytesConfig) — confirm.
!pip install -q accelerate
!pip install -q -i https://pypi.org/simple/ bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.0/203.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.2/310.2 kB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.4/124.4 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.0/53.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from langchain_community.graphs import Neo4jGraph

# Open a connection to the "companies" database hosted on demo.neo4jlabs.com.
# The credentials are the demo values hardcoded here; swap in your own
# instance's URL and credentials to query a different graph.
graph = Neo4jGraph(
    url="neo4j+s://demo.neo4jlabs.com",
    username="companies",
    password="companies",
    database="companies",
)
# Ref: https://github.com/neo4j/NaLLM

#graph = Neo4jGraph(
#    url="neo4j+s://databases.neo4j.io",username="neo4j", password="neo4j"
#)


In [None]:
# Print the graph schema (node properties and relationship patterns) for reference.
print(graph.schema)

Node properties:
Person {name: STRING, id: STRING, summary: STRING}
Organization {isDissolved: BOOLEAN, nbrEmployees: INTEGER, revenue: FLOAT, name: STRING, motto: STRING, id: STRING, summary: STRING, isPublic: BOOLEAN}
IndustryCategory {name: STRING, id: STRING}
City {name: STRING, summary: STRING, id: STRING}
Country {name: STRING, id: STRING, summary: STRING}
Article {id: STRING, sentiment: FLOAT, author: STRING, siteName: STRING, summary: STRING, date: DATE_TIME, title: STRING}
Chunk {text: STRING, embedding: LIST, embedding_google: LIST}
Fewshot {Question: STRING, Cypher: STRING, id: INTEGER, embedding: LIST}
Relationship properties:

The relationships:
(:Person)-[:HAS_PARENT]->(:Person)
(:Person)-[:HAS_CHILD]->(:Person)
(:Organization)-[:HAS_CEO]->(:Person)
(:Organization)-[:HAS_INVESTOR]->(:Organization)
(:Organization)-[:HAS_INVESTOR]->(:Person)
(:Organization)-[:IN_CITY]->(:City)
(:Organization)-[:HAS_CATEGORY]->(:IndustryCategory)
(:Organization)-[:HAS_BOARD_MEMBER]->(:Person

In [None]:
import os
from google.colab import drive

from dotenv import load_dotenv

# Mount Google Drive and work from its root, where the private .env file is expected.
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/")

# Load key=value pairs from ./.env (relative to the Drive root set above).
load_dotenv('./.env')

# The Hugging Face token is required to access the Mistral-7B model.
# Fail early with a clear message: assigning a missing (None) value to
# os.environ would otherwise raise an opaque "str expected, not NoneType".
hf_token = os.getenv('HUGGINGFACE_TOKEN')
if not hf_token:
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set. Add a line HUGGINGFACE_TOKEN=<your token> "
        "to the .env file in the root of your Google Drive, or set the "
        "environment variable before running this cell."
    )
os.environ["HUGGINGFACE_TOKEN"] = hf_token

Mounted at /content/drive


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
import warnings
import torch

from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the token stored in the environment.
# This cell does not import `os` itself; it relies on the earlier cell that did.
login(token=os.environ["HUGGINGFACE_TOKEN"])

# Suppress all Python warnings from this point on.
# NOTE(review): a blanket 'ignore' also hides deprecation notices — consider narrowing it.
warnings.filterwarnings('ignore')

# Hugging Face Hub id of the instruction-tuned model to load.
LLM_MODEL='mistralai/Mistral-7B-Instruct-v0.2'
#LLM_MODEL='meta-llama/Llama-2-7b-chat-hf'

# trust_remote_code is intentionally NOT enabled: it allows Python code shipped
# in the Hub repository to run locally, and this model's tokenizer is a class
# built into transformers, so the opt-in is unnecessary.
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# --- bitsandbytes quantization settings ---
use_4bit = True                     # load the base model in 4-bit precision
use_nested_quant = False            # nested ("double") quantization stays off
bnb_4bit_quant_type = "nf4"         # 4-bit quantization type: "fp4" or "nf4"
bnb_4bit_compute_dtype = "float16"  # name of the torch dtype used for compute

# Resolve the dtype name to the corresponding attribute of the torch module.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization config built from the bitsandbytes settings; it is passed to
# from_pretrained when the model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Informational hint only: when float16 compute is configured for 4-bit loading,
# report GPUs whose CUDA compute capability (major >= 8) indicates bfloat16 support.
# The torch.cuda.is_available() guard keeps this optional check from raising
# on a runtime that has no CUDA device.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load the causal-LM weights with the 4-bit quantization config;
# device_map='auto' delegates device placement to the loader.
# NOTE(review): torch_dtype is bfloat16 while bnb_4bit_compute_dtype is float16 —
# confirm that this mismatch is intentional.
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    quantization_config=bnb_config,
    device_map='auto',
    torch_dtype=torch.bfloat16,
)

#pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16, device_map="auto", max_new_tokens = 720, do_sample=True, top_k=30, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
#pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=torch.bfloat16, device_map="auto")

# Text-generation pipeline around the loaded model and tokenizer.
# Sampling is on (top_k=30, one sequence per prompt), generation is capped at
# 1000 new tokens, and return_full_text=False is set so the prompt is not echoed.
# Currently disabled options: temperature=0.2, repetition_penalty=1.1
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    # generation settings
    max_new_tokens=1000,
    return_full_text=False,
    do_sample=True,
    top_k=30,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Import from langchain_community: the `langchain.llms` import path is deprecated,
# and this notebook already installs and uses langchain-community.
from langchain_community.llms import HuggingFacePipeline

# Wrap the transformers pipeline so LangChain chains can call it as an LLM.
# NOTE(review): model_kwargs looks like it is only consumed when the wrapper
# builds the model itself (from_model_id); with a ready-made pipeline these
# values (temperature, max_tokens, stop_sequence) may have no effect — confirm.
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs = {'temperature':0.25, 'max_tokens':4000, 'stop_sequence': "\n\n"})

  warn_deprecated(


In [None]:
from langchain.chains import GraphCypherQAChain
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough

# Refresh the graph wrapper's schema before building the prompt and chain.
graph.refresh_schema()

# Earlier, plain-text version of the Cypher-generation prompt. It is not
# referenced by the prompt object built from CYPHER_GENERATION_TEMPLATE below.
CYPHER_GENERATION_TEMPLATE_old = """
Instructions:
Generate a Neo4j Cypher statement to query a Neo4j graph database.
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

The question is:
{question}
"""

# Active Cypher-generation prompt: the same instructions wrapped in
# [INST] ... [/INST] markers (presumably the Mistral-Instruct chat format).
# {schema} and {question} are the two placeholders filled in at run time.
CYPHER_GENERATION_TEMPLATE = """
### [INST] Instructions:
Generate a Neo4j Cypher statement to query a Neo4j graph database.
Use only the provided relationship types and properties in the Schema.
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

### QUESTION:
{question} [/INST]
"""

# Prompt object for the Cypher-generation step; it takes "schema" and "question".
CYPHER_GENERATION_PROMPT = PromptTemplate(
    template=CYPHER_GENERATION_TEMPLATE,
    input_variables=["schema", "question"],
)

# Alternative: use separate LLMs for Cypher generation and answer generation
#cypher_chain = GraphCypherQAChain.from_llm( # separate LLMs for Cypher and answer generation
#    cypher_llm = ChatOpenAI(temperature=0, model_name='gpt-4'),
#    qa_llm = ChatOpenAI(temperature=0), graph=graph, verbose=True,
#)

# Build the graph QA chain. The same local LLM is used both to write the Cypher
# query and to phrase the final answer. verbose=True prints the generated
# Cypher and the retrieved context while the chain runs.
# return_intermediate_steps is off (original note: can be switched to True).
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=llm,
    qa_llm=llm,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
    verbose=True,
    return_intermediate_steps=False,
)

# Create llm chain
#llm_chain = CYPHER_GENERATION_PROMPT | llm

In [None]:
# Ask a simple existence question; the chain returns a dict with 'query' and 'result' keys.
graph_result = cypher_chain.invoke("Is there an organization with name Deja vu Security? ")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (o:Organization)
WHERE o.name = 'Deja vu Security'
RETURN o
[0m
Full Context:
[32;1m[1;3m[{'o': {'summary': 'Software company based in Seattle, Washington, United States and owned by Accenture', 'revenue': 4600000.0, 'isDissolved': False, 'nbrEmployees': 30, 'name': 'Deja vu Security', 'motto': 'Securing the most prolific technologies in the world | Acquired by Accenture in June, 2019', 'isPublic': False, 'id': 'E91iaB3VQN3K-0YIMTmJjRw'}}][0m

[1m> Finished chain.[0m


In [None]:
# Display the full result dict returned for the question above.
graph_result

{'query': 'Is there an organization with name Deja vu Security? ',
 'result': ' Yes, there is an organization named Deja vu Security. It is a private company owned by Accenture. The company is based in Seattle, Washington, United States, and it has approximately 30 employees. The revenue of this company is 4.6 million US dollars. The motto of this company is "Securing the most prolific technologies in the world". Deja vu Security was not public and was acquired by Accenture in June, 2019.'}

In [None]:
# Multi-hop question; only the final answer text ('result') is displayed.
# NOTE(review): in the recorded run the generated Cypher returned an empty
# context ([]), so the answer shown is not backed by graph data — treat it as ungrounded.
cypher_chain.invoke("Find another company from the same country where Deja vu Security is based? ")['result']



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (d:Organization { name: 'Deja vu Security' })-[:IN_COUNTRY]->(c:Country)-[:IN_COUNTRY]->(company:Organization)
RETURN company.name
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


' I cannot provide a specific answer without information, but some examples of companies based in the United States, the country where Deja vu Security is located, include Apple Inc., Microsoft Corporation, and Amazon.com, Inc.'

References:
https://medium.com/neo4j/enhanced-qa-integrating-unstructured-and-graph-knowledge-using-neo4j-and-langchain-6abf6fc24c27

https://neo4j.com/developer-blog/knowledge-graph-rag-application/

https://github.com/neo4j/NaLLM

