# 검색 증강 생성(RAG)

In [16]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.5.0-py3-none-any.whl.metadata (849 bytes)
Collecting gensim<5.0.0,>=4.3.0 (from node2vec)
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0.0,>=1.24.0 (from node2vec)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.14.0,>=1.7.0 (from gensim<5.0.0,>=4.3.0->node2vec)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Downloading node2vec-0.5.0-py3-none-any.whl (7.2 kB)
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## 그래프 기반 지식 표현

In [1]:
from typing import Dict, List, Tuple

class KnowledgeGraph:
    def __init__(self):
        self.nodes: Dict[str, Dict] = {}
        self.edges: Dict[str, List[Tuple[str, str]]] = {}

    def add_node(self, node_id: str, properties: Dict):
        self.nodes[node_id] = properties

    def add_edge(self, source: str, target: str, relation: str):
        if source not in self.edges:
            self.edges[source] = []
        self.edges[source].append((target, relation))

    def get_neighbors(
        self, node_id: str) -> List[Tuple[str, str]
    ]:
        return self.edges.get(node_id, [])


In [2]:
# 사용 예
kg = KnowledgeGraph()
# kg.add_node("Paris", {"type": "City", "country": "France"})
# kg.add_node("France", {"type": "Country", "continent": "Europe"})
# kg.add_edge("Paris", "France", "capital_of")

kg.add_node("파리", {"유형": "도시", "국가": "프랑스"})
kg.add_node("프랑스", {"유형": "국가", "대륙": "유럽"})
kg.add_edge("파리", "프랑스", "capital_of")

print(kg.get_neighbors("Paris"))

[]


## 그래프 RAG 아키텍처 설계

In [3]:
import networkx as nx
from sentence_transformers import SentenceTransformer
import torch

class GraphRAG:
    def __init__(self, kg: KnowledgeGraph, model_name: str):
        self.kg = kg
        self.model = SentenceTransformer(model_name)
        self.graph = self.build_networkx_graph()
        self.node_embeddings = self.compute_node_embeddings()

    def build_networkx_graph(self):
        G = nx.DiGraph()
        for node_id, properties in self.kg.nodes.items():
            G.add_node(node_id, **properties) # Unpack the properties dictionary
        for source, edges in self.kg.edges.items():
            for target, relation in edges:
                G.add_edge(source, target, relation=relation)
        return G

    def compute_node_embeddings(self):
        embeddings = {}
        for node_id, properties in self.kg.nodes.items():
            text = f"{node_id} {' '.join(properties.values())}"
            embedding = self.model.encode(text)
            embeddings[node_id] = embedding
        return embeddings

    def retrieve(self, query: str, k: int = 5) -> List[str]:
        query_embedding = self.model.encode(query)
        similarities = {
            node_id: torch.cosine_similarity(
                torch.tensor(query_embedding),
                torch.tensor(emb), dim=0
            )
            for node_id, emb in self.node_embeddings.items()}
        return sorted(
            similarities, key=similarities.get, reverse=True
        )[:k]


In [4]:
graph_rag = GraphRAG(kg, "all-MiniLM-L6-v2")
retrieved_nodes = graph_rag.retrieve("프랑스의 수도는 어디인가?")
print("검색된 노드:", retrieved_nodes)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


검색된 노드: ['파리', '프랑스']


## 그래프 임베딩

In [11]:
from node2vec import Node2Vec

class AdvancedGraphRAG(GraphRAG):
    def __init__(self, kg: KnowledgeGraph, model_name: str):
        super().__init__(kg, model_name)
        self.node2vec_embeddings = self.compute_node2vec_embeddings()

    def compute_node2vec_embeddings(self):
        # Change dimensions to match the text embedding dimension
        node2vec = Node2Vec(
            self.graph, dimensions=384, walk_length=30, num_walks=200, workers=4
        )
        model = node2vec.fit(window=10, min_count=1)
        return {node: model.wv[node]
            for node in self.graph.nodes()
        }

    def retrieve(self, query: str, k: int = 5) -> List[str]:
        query_embedding = self.model.encode(query)
        combined_similarities = {}

        for node_id in self.graph.nodes():
            text_sim = torch.cosine_similarity(
                torch.tensor(query_embedding),
                torch.tensor(self.node_embeddings[node_id]),
                dim=0
            )
            # Now dimensions should match after changing node2vec dimensions
            graph_sim = torch.cosine_similarity(
                torch.tensor(query_embedding),
                torch.tensor(self.node2vec_embeddings[node_id]),
                dim=0
            )
            combined_similarities[node_id] = 0.5 * text_sim + 0.5 * graph_sim

        return sorted(
            combined_similarities, key=combined_similarities.get, reverse=True
        )[:k]

In [12]:
# 사용 예
advanced_graph_rag = AdvancedGraphRAG(kg, "all-MiniLM-L6-v2")
retrieved_nodes = advanced_graph_rag.retrieve("프랑스의 수도는 어디인가?")
print("검색된 노드:", retrieved_nodes)


Computing transition probabilities:   0%|          | 0/2 [00:00<?, ?it/s]

검색된 노드: ['파리', '프랑스']


## 쿼리 확장

In [14]:
import random

class QueryExpansionGraphRAG(AdvancedGraphRAG):
    def expand_query(self, query: str, num_expansions: int = 2) -> List[str]:
        initial_nodes = super().retrieve(query, k=3)
        expanded_queries = [query]
        for node in initial_nodes:
            neighbors = list(self.graph.neighbors(node))
            if neighbors:
                random_neighbor = random.choice(neighbors)
                expanded_query = (
                    f"{query}"
                    f"{self.graph.nodes[random_neighbor].get('type', '')}"
                    f"{random_neighbor}"
                )
                expanded_queries.append(expanded_query)
                if len(expanded_queries) >= num_expansions + 1:
                    break
        return expanded_queries

    def retrieve(self, query: str, k: int = 5) -> List[str]:
        expanded_queries = self.expand_query(query)
        all_retrieved = []
        for q in expanded_queries:
            all_retrieved.extend(super().retrieve(q, k))
        return list(dict.fromkeys(all_retrieved))[:k]


In [15]:
# 사용 예
query_expansion_rag = QueryExpansionGraphRAG(kg, "all-MiniLM-L6-v2")
retrieved_nodes = query_expansion_rag.retrieve("프랑스의 수도는 어디인가?")
print("검색된 노드:", retrieved_nodes)


Computing transition probabilities:   0%|          | 0/2 [00:00<?, ?it/s]

검색된 노드: ['파리', '프랑스']


In [44]:
from transformers import AutoModelForCausalLM, AutoTokenizer

class GenerativeGraphRAG(QueryExpansionGraphRAG):
    def __init__(self, kg: KnowledgeGraph, retriever_model: str, generator_model: str):
        super().__init__(kg, retriever_model)
        self.generator = AutoModelForCausalLM.from_pretrained(generator_model)
        self.generator_tokenizer = AutoTokenizer.from_pretrained(generator_model)

    def generate_response(self, query: str, max_length: int = 300) -> str: # Increased max_length
        retrieved_nodes = self.retrieve(query)
        context = self.build_graph_context(retrieved_nodes)
        # Ensure prompt is in Korean
        prompt = f"그래프 문맥:\n{context}\n\n질문: {query}\n답변:"
        inputs = self.generator_tokenizer(prompt, return_tensors="pt")
        # Explicitly pass input_ids and attention_mask
        outputs = self.generator.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"] if "attention_mask" in inputs else None,
            max_length=max_length
        )
        return self.generator_tokenizer.decode(outputs[0], skip_special_tokens=True)

    def build_graph_context(self, nodes: List[str]) -> str:
        context = []
        for node in nodes:
            context.append(f"노드: {node}")
            context.append(f"속성: {self.graph.nodes[node]}")
            for neighbor, edge_data in self.graph[node].items():
                context.append(
                    f"  '{neighbor}'와 '{edge_data['relation']}' 관계로 연결됨"
                )
        return "\n".join(context)

In [45]:
# 사용 예
generative_graph_rag = GenerativeGraphRAG(kg, "all-MiniLM-L6-v2", "gpt2-medium")
response = generative_graph_rag.generate_response("프랑스의 수도는 어디인가?")
print("생성된 응답:", response)


Computing transition probabilities:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


생성된 응답: 그래프 문맥:
노드: User 12345
속성: {'type': 'User', 'interests': '아이폰'}
노드: 맥북
속성: {'type': '노트북', 'brand': '애플'}
노드: 아이폰
속성: {'type': '스마트폰', 'brand': '애플'}
  '맥북'와 '같은 브랜드' 관계로 연결됨

질문: 프랑스의 수도는 어디인가?
답변: 아이폰
속성: {'type': '스마트폰', 'brand': '애�


## 그래프 RAG 애플리케이션과 사용 사례

In [46]:
class RecommendationGraphRAG(GenerativeGraphRAG):
    def get_recommendations(self, user_id: str, num_recommendations: int = 5) -> List[str]:
        user_node = self.retrieve(f"User {user_id}", k=1)[0]
        user_interests = self.graph.nodes[user_node].get('interests', [])

        potential_recommendations = set()
        for interest in user_interests:
            related_items = self.retrieve(interest, k=3)
            potential_recommendations.update(related_items)

        recommendations = list(
            potential_recommendations - set(user_interests)
        )[:num_recommendations]
        return recommendations

    def explain_recommendation(self, user_id: str, item_id: str) -> str:
        query = f"사용자 {user_id}가 {item_id}에 관심을 갖는 이유는?"
        return self.generate_response(query)


In [47]:
# 사용 예
kg = KnowledgeGraph()
kg.add_node("User 12345", {"type": "User", "interests": "아이폰"})
kg.add_node("아이폰", {"type": "스마트폰", "brand": "애플"})
kg.add_node("맥북", {"type": "노트북", "brand": "애플"})
kg.add_edge("아이폰", "맥북", "같은 브랜드")

recommendation_rag = RecommendationGraphRAG(
    kg, "all-MiniLM-L6-v2", "gpt2-medium"
)
user_id = "12345"
recommendations = recommendation_rag.get_recommendations(user_id)
print(f"사용자 {user_id}를 위한 추천:", recommendations)

for item in recommendations[:2]:
    explanation = recommendation_rag.explain_recommendation(user_id, item)
    print(f"{item} 추천에 대한 설명:", explanation)


Computing transition probabilities:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


사용자 12345를 위한 추천: ['아이폰', 'User 12345', '맥북']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


아이폰 추천에 대한 설명: 그래프 문맥:
노드: User 12345
속성: {'type': 'User', 'interests': '아이폰'}
노드: 맥북
속성: {'type': '노트북', 'brand': '애플'}
노드: 아이폰
속성: {'type': '스마트폰', 'brand': '애플'}
  '맥북'와 '같은 브랜드' 관계로 연결됨

질문: 사용자 12345가 아이폰에 관심을 갖는 이유는?
답변: 아이폰에 관심을 갖�
User 12345 추천에 대한 설명: 그래프 문맥:
노드: User 12345
속성: {'type': 'User', 'interests': '아이폰'}
노드: 맥북
속성: {'type': '노트북', 'brand': '애플'}
노드: 아이폰
속성: {'type': '스마트폰', 'brand': '애플'}
  '맥북'와 '같은 브랜드' 관계로 연결됨

질문: 사용자 12345가 User 12345에 관심을 갖는 이유는?
답변: 아이폰 아이폰 아이폰 아이�


## 부분 그래프 샘플링

In [54]:
import networkx as nx

class ScalableGraphRAG(GenerativeGraphRAG):
    def __init__(
        self, kg: KnowledgeGraph, retriever_model: str,
        generator_model: str, max_subgraph_size: int = 1000
    ):
        super().__init__(kg, retriever_model, generator_model)
        self.max_subgraph_size = max_subgraph_size

    def retrieve(self, query: str, k: int = 5) -> List[str]:
        initial_nodes = super().retrieve(query, k=k)
        subgraph = self.sample_subgraph(initial_nodes)
        return self.rank_nodes_in_subgraph(subgraph, query)[:k]

    def sample_subgraph(self, seed_nodes: List[str]) -> nx.Graph:
        subgraph = nx.Graph()
        frontier = set(seed_nodes)
        while len(subgraph) < self.max_subgraph_size and frontier:
            node = frontier.pop()
            if node not in subgraph:
                subgraph.add_node(node, **self.graph.nodes[node])
                neighbors = list(self.graph.neighbors(node))
                for neighbor in neighbors:
                    if len(subgraph) < self.max_subgraph_size:
                        subgraph.add_edge(
                            node, neighbor,
                            **self.graph[node][neighbor]
                        )
                        frontier.add(neighbor)
                    else:
                        break
        return subgraph

    def rank_nodes_in_subgraph(self, subgraph: nx.Graph, query: str) -> List[str]:
        query_embedding = self.model.encode(query)
        node_scores = {}
        for node in subgraph.nodes():
            node_embedding = self.node_embeddings[node]
            score = torch.cosine_similarity(
                torch.tensor(query_embedding),
                torch.tensor(node_embedding), dim=0
            )
            node_scores[node] = score
        return sorted(node_scores, key=node_scores.get, reverse=True)


In [55]:
# 사용 예
kg = KnowledgeGraph()
kg.add_node("파리", {"유형": "도시", "국가": "프랑스"})
kg.add_node("프랑스", {"유형": "국가", "대륙": "유럽"})
kg.add_edge("파리", "프랑스", "capital_of")

scalable_graph_rag = ScalableGraphRAG(kg, "all-MiniLM-L6-v2", "gpt2-medium")
retrieved_nodes = scalable_graph_rag.retrieve("프랑스의 수도는 어디인가?")
print("검색된 노드:", retrieved_nodes)


Computing transition probabilities:   0%|          | 0/2 [00:00<?, ?it/s]

검색된 노드: ['파리', '프랑스']
