# Knowledge Graphs를 활용한 RAG 성능 향상

In [None]:
!pip install -qU neo4j langchain langchain_openai langchain-community langchain_neo4j

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/102.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.8/102.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.6/84.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.1/476.1 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/204.8 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Neo4j 인스턴스 설정

In [None]:
from langchain_neo4j import Neo4jGraph
import os

In [None]:
# Store the Neo4j connection settings and the OpenAI API key in environment
# variables so that later cells can read them through os.environ.
# NOTE(review): the "*" values look like redacted placeholders — presumably
# they must be replaced with real credentials before running; confirm.
os.environ["NEO4J_URL"] = "neo4j+s://545cefaa.databases.neo4j.io"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "*"
os.environ["NEO4J_DATABASE"] = "neo4j"
os.environ["OPENAI_API_KEY"] = "*"

In [None]:
# Create the Neo4jGraph wrapper from the URL and credentials stored in the
# environment variables. `graph` is reused by the cells below for raw Cypher
# queries, schema inspection and the Cypher QA chain.
graph = Neo4jGraph(
    url = os.environ["NEO4J_URL"],
    username = os.environ["NEO4J_USERNAME"],
    password = os.environ["NEO4J_PASSWORD"]
)

In [None]:
def reset_database(graph):
    """Reset the database: delete all data, then all constraints and indexes.

    Args:
        graph: Object exposing ``query(cypher)`` that returns an iterable of
            dict-like rows (in this notebook, the ``Neo4jGraph`` instance).

    Constraint and index names are backtick-quoted before being interpolated
    into the ``DROP`` statements, so names that contain spaces, hyphens or
    other special characters no longer produce invalid Cypher.
    """

    def quoted(name):
        # Inside a backtick-quoted Cypher identifier, a literal backtick is
        # escaped by doubling it.
        return '`' + str(name).replace('`', '``') + '`'

    # Delete every node together with its relationships.
    graph.query('Match (n) detach delete n')

    # Drop every constraint; rows that carry no name are skipped.
    for constraint in graph.query('Show constraints'):
        constraint_name = constraint.get('name')
        if constraint_name:
            graph.query(f'Drop constraint {quoted(constraint_name)} if exists')

    # Drop the indexes that are still listed once the constraints are gone.
    # "if exists" keeps a drop from failing when the index has already
    # disappeared by the time its statement runs.
    for index in graph.query('show indexes'):
        index_name = index.get('name')
        index_type = index.get('type')
        if index_name and index_type != 'constraint':
            graph.query(f'Drop index {quoted(index_name)} if exists')

    print('데이터베이스가 초기화 되었습니다')

# Reset the database so the notebook starts from an empty graph
reset_database(graph)

데이터베이스가 초기화 되었습니다


## 데이터셋 그래프로 불러오기

In [None]:
# Cypher statement that imports the synthetic articles CSV (';'-separated,
# with a header row) into the graph:
#   - one Article node per title, with its abstract and publication date;
#   - one Researcher node per comma-separated entry in the Authors column,
#     linked to the article with a PUBLISHED relationship;
#   - one Topic node per Topic value, linked with an IN_TOPIC relationship.
# MERGE is used throughout, so existing nodes/relationships are matched
# instead of duplicated.
q_load_articles = """
LOAD CSV WITH HEADERS
FROM 'https://raw.githubusercontent.com/dcarpintero/generative-ai-101/main/dataset/synthetic_articles.csv'
AS row
FIELDTERMINATOR ';'
MERGE (a:Article {title:row.Title})
SET a.abstract = row.Abstract,
    a.publication_date = date(row.Publication_Date)
FOREACH (researcher in split(row.Authors, ',') |
    MERGE (p:Researcher {name:trim(researcher)})
    MERGE (p)-[:PUBLISHED]->(a))
FOREACH (topic in [row.Topic] |
    MERGE (t:Topic {name:trim(topic)})
    MERGE (a)-[:IN_TOPIC]->(t))
"""

# Run the import; the statement returns no rows.
graph.query(q_load_articles)

[]

In [None]:
# Inspect the schema: refresh it after the import, then print it.
# `get_schema` is read as an attribute here, not called.
graph.refresh_schema()
print(graph.get_schema)

Node properties:
Article {title: STRING, abstract: STRING, publication_date: DATE}
Researcher {name: STRING}
Topic {name: STRING}
Relationship properties:

The relationships:
(:Article)-[:IN_TOPIC]->(:Topic)
(:Researcher)-[:PUBLISHED]->(:Article)


## 벡터 인덱스 구축하기

In [None]:
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings

In [None]:
# Build a vector index named 'articles' over the existing Article nodes.
# The text of each node is taken from `text_node_properties`, embedded with
# OpenAIEmbeddings, and the vector is stored in the `embedding` property.
# NOTE(review): the schema printed above lists no `topic` property on Article
# (topics are separate Topic nodes), so 'topic' presumably contributes no text
# to the embedding — confirm whether it should stay in the list.
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    # The environment setup cell stores the connection URL under NEO4J_URL;
    # reading 'NEO4J_URI' here raised KeyError because that key is never set.
    url=os.environ['NEO4J_URL'],
    username=os.environ['NEO4J_USERNAME'],
    password=os.environ['NEO4J_PASSWORD'],
    index_name='articles',
    node_label="Article",
    text_node_properties=['topic', 'title', 'abstract'],
    embedding_node_property='embedding',
)

## 유사도 기반 질의응답

벡터 인덱스를 검색기(retriever)로 사용하는 질의응답(QA) 체인을 생성합니다.

In [None]:
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Chat model used to answer questions.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Prompt (in Korean) that asks the model to answer the question using the
# supplied documents; {context} and {question} are filled in by the chain.
prompt = ChatPromptTemplate.from_template("""
다음 문서를 참고해서 질문에 답변해줘.
문서:
{context}

질문:
{question}
""")

# Expose the vector index as a retriever.
retriever = vector_index.as_retriever()

# QA pipeline: the input string is given to the retriever (its result fills
# {context}) and is also passed through unchanged as {question}; the filled
# prompt goes to the LLM and the reply is parsed into a plain string.
qa_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Smoke test (Korean): "What is the key content of this document?"
qa_chain.invoke("이 문서의 핵심 내용은?")

'이 문서는 여러 연구 논문에 대한 정보를 담고 있으며, 각 논문은 언어 모델과 관련된 다양한 주제를 다루고 있습니다. 핵심 내용은 다음과 같습니다:\n\n1. **Attention Mechanism Enhancements for Improved Language Understanding**: 새로운 주의 메커니즘을 도입하여 언어 모델이 장기 의존성과 맥락 정보를 더 잘 포착할 수 있도록 개선하는 방법을 제안합니다.\n\n2. **Quantum-Inspired Algorithms for Language Model Training**: 양자 컴퓨팅 원칙에서 영감을 받은 새로운 훈련 알고리즘을 제안하여 모델 수렴 속도를 크게 향상시킬 가능성을 보여줍니다.\n\n3. **Transformer Architecture Innovations**: 트랜스포머 아키텍처에 대한 새로운 수정안을 제안하여 긴 시퀀스를 처리하는 효율성을 개선합니다.\n\n4. **Ensuring Transparency in AI Decision-Making Systems**: 고위험 결정-making 맥락에서 AI 시스템의 투명성과 해석 가능성을 높이기 위한 프레임워크를 제시합니다.\n\n이 문서는 언어 모델의 성능 향상, 효율성 개선, 그리고 AI 시스템의 투명성 증대와 관련된 최신 연구 결과를 요약하고 있습니다.'

In [None]:
# Similarity-based QA with an English question; the answer string is printed.
r = qa_chain.invoke("which articles discuss how AI might affect our daily life? include the article titles and abstracts.")
print(r)

The articles that discuss how AI might affect our daily life are:

1. **Title:** The Impact of AI on Employment: A Comprehensive Study  
   **Abstract:** This study analyzes the potential effects of AI on various job sectors and suggests policy recommendations to mitigate negative impacts.

2. **Title:** The Societal Implications of Advanced AI: A Multidisciplinary Analysis  
   **Abstract:** Our study brings together experts from various fields to analyze the potential long-term impacts of advanced AI on society, economy, and culture.

3. **Title:** Ethical Considerations in AI Development  
   **Abstract:** We explore the ethical implications of rapid AI advancement and propose guidelines for responsible development.

4. **Title:** The Role of AI in Combating Climate Change: Opportunities and Challenges  
   **Abstract:** Our research explores how AI can be leveraged to address climate change, discussing both its potential benefits and the associated ethical considerations.

These ar

In [None]:
# The same question asked in Korean ("Which papers discuss how AI will affect
# our daily lives? Please include the titles and abstracts."), to compare the
# answer with the English query above.
r2 = qa_chain.invoke('AI가 우리의 일상생활에 어떤 영향을 미칠지 논의하는 논문들은 무엇인가요? 논문 제목과 초록을 포함해주세요.')
print(r2)

AI가 우리의 일상생활에 미치는 영향을 논의하는 논문은 다음과 같습니다:

1. **논문 제목**: The Impact of AI on Employment: A Comprehensive Study  
   **초록**: This study analyzes the potential effects of AI on various job sectors and suggests policy recommendations to mitigate negative impacts.

2. **논문 제목**: The Societal Implications of Advanced AI: A Multidisciplinary Analysis  
   **초록**: Our study brings together experts from various fields to analyze the potential long-term impacts of advanced AI on society, economy, and culture. 

이 두 논문은 AI가 일상생활에 미치는 다양한 영향을 다루고 있습니다.


## 추론을 위한 지식 그래프 탐색

## GraphCypherQAChain과 LangChain 활용

In [None]:
from langchain_neo4j import GraphCypherQAChain
from langchain_openai import ChatOpenAI

# Refresh the graph schema before building the chain.
graph.refresh_schema()

# Build the graph QA chain. Two model instances are supplied: `cypher_llm`
# for producing the Cypher query and `qa_llm` for wording the final answer.
cypher_chain = GraphCypherQAChain.from_llm(
    cypher_llm = ChatOpenAI(temperature=0, model_name='gpt-4o'),
    qa_llm = ChatOpenAI(temperature=0, model_name='gpt-4o'),
    graph=graph,
    # Explicit opt-in: the chain runs model-generated Cypher against the
    # database, so the connection should use narrowly scoped credentials.
    allow_dangerous_requests=True,
    # Print the generated Cypher and the query result for each invocation.
    verbose=True,
)

## 자연어를 사용한 쿼리

In [None]:
# Counting question in English: number of articles published by one researcher.
cypher_chain.invoke(
    {"query": "How many articles has published Emily Chen?"}
)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (r:Researcher {name: "Emily Chen"})-[:PUBLISHED]->(a:Article)
RETURN COUNT(a) AS numberOfArticles
[0m
Full Context:
[32;1m[1;3m[{'numberOfArticles': 7}][0m

[1m> Finished chain.[0m


{'query': 'How many articles has published Emily Chen?',
 'result': 'Emily Chen has published 7 articles.'}

In [None]:
# The same counting question in Korean
# ("How many papers has Emily Chen published?").
cypher_chain.invoke(
    {"query": "Emily Chen은 몇 개의 논문을 출판했나요?"}
)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (r:Researcher {name: "Emily Chen"})-[:PUBLISHED]->(a:Article)
RETURN COUNT(a) AS numberOfArticles
[0m
Full Context:
[32;1m[1;3m[{'numberOfArticles': 7}][0m

[1m> Finished chain.[0m


{'query': 'Emily Chen은 몇 개의 논문을 출판했나요?',
 'result': 'Emily Chen은 7개의 논문을 출판했습니다.'}

In [None]:
# Korean: "Which two researchers have published three or more papers together?"
# NOTE(review): in the recorded run the generated Cypher filtered with
# `sharedArticles > 3` (strictly more than three), which does not match
# "three or more" — check the generated query when relying on this answer.
cypher_chain.invoke(
    {"query": "세 개 이상의 논문을 함께 출판한 두 명의 연구자는?"}
)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (r1:Researcher)-[:PUBLISHED]->(a:Article)<-[:PUBLISHED]-(r2:Researcher)
WITH r1, r2, COUNT(a) AS sharedArticles
WHERE sharedArticles > 3
RETURN r1.name, r2.name
[0m
Full Context:
[32;1m[1;3m[{'r1.name': 'David Johnson', 'r2.name': 'Emily Chen'}, {'r1.name': 'Robert Taylor', 'r2.name': 'Emily Chen'}, {'r1.name': 'Emily Chen', 'r2.name': 'David Johnson'}, {'r1.name': 'Emily Chen', 'r2.name': 'Robert Taylor'}][0m

[1m> Finished chain.[0m


{'query': '세 개 이상의 논문을 함께 출판한 두 명의 연구자는?',
 'result': '세 개 이상의 논문을 함께 출판한 두 명의 연구자는 Emily Chen과 David Johnson, Emily Chen과 Robert Taylor입니다.'}

In [None]:
# English variant of the co-authorship question: pairs of researchers with
# more than three shared articles.
cypher_chain.invoke(
    {"query": "are there any pair of researchers who have published more than three articles together?"}
)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (r1:Researcher)-[:PUBLISHED]->(a:Article)<-[:PUBLISHED]-(r2:Researcher)
WHERE r1 <> r2
WITH r1, r2, COUNT(a) AS sharedArticles
WHERE sharedArticles > 3
RETURN r1.name, r2.name, sharedArticles
[0m
Full Context:
[32;1m[1;3m[{'r1.name': 'David Johnson', 'r2.name': 'Emily Chen', 'sharedArticles': 4}, {'r1.name': 'Robert Taylor', 'r2.name': 'Emily Chen', 'sharedArticles': 4}, {'r1.name': 'Emily Chen', 'r2.name': 'David Johnson', 'sharedArticles': 4}, {'r1.name': 'Emily Chen', 'r2.name': 'Robert Taylor', 'sharedArticles': 4}][0m

[1m> Finished chain.[0m


{'query': 'are there any pair of researchers who have published more than three articles together?',
 'result': 'Yes, David Johnson and Emily Chen, as well as Robert Taylor and Emily Chen, have published more than three articles together.'}

In [None]:
# Korean: "Which researcher has collaborated the most with colleagues?"
cypher_chain.invoke(
    {"query": "동료들과 가장 많은 협업을 한 연구원은?"}
)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (r:Researcher)-[:PUBLISHED]->(a:Article)<-[:PUBLISHED]-(colleague:Researcher)
WHERE r <> colleague
RETURN r.name AS Researcher, COUNT(DISTINCT colleague) AS Collaborations
ORDER BY Collaborations DESC
LIMIT 1
[0m
Full Context:
[32;1m[1;3m[{'Researcher': 'David Johnson', 'Collaborations': 6}][0m

[1m> Finished chain.[0m


{'query': '동료들과 가장 많은 협업을 한 연구원은?',
 'result': '가장 많은 협업을 한 연구원은 David Johnson입니다.'}

In [None]:
# English variant of the collaboration question: the researcher with the
# largest number of distinct co-authors.
cypher_chain.invoke(
    {"query": "Which researcher has collaborated with the most peers?"}
)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (r:Researcher)-[:PUBLISHED]->(:Article)<-[:PUBLISHED]-(peer:Researcher)
WITH r, COUNT(DISTINCT peer) AS peerCount
RETURN r.name AS researcher, peerCount
ORDER BY peerCount DESC
LIMIT 1
[0m
Full Context:
[32;1m[1;3m[{'researcher': 'David Johnson', 'peerCount': 6}][0m

[1m> Finished chain.[0m


{'query': 'Which researcher has collaborated with the most peers?',
 'result': 'David Johnson has collaborated with the most peers, with a peer count of 6.'}