In [7]:
import networkx as nx
import pandas as pd
import numpy as np
from typing import List, Dict, Any
import community.community_louvain as community_louvain
import openai
import json
from getpass import getpass

class GraphRAG:
    """GraphRAG-style question answering over hotel, review and user tables.

    The pipeline is a chain of stages, each consuming the previous one's output:

    1. ``element_instances_extraction`` - rows -> typed instance dicts
    2. ``determine_relationships``      - instances -> typed edges
    3. ``element_summaries``            - edges -> LLM edge summaries
    4. ``graph_communities``            - summaries -> Louvain communities
    5. ``community_summaries``          - communities -> LLM community summaries
    6. ``community_answers``            - community summaries + query -> partial answers
    7. ``global_answer``                - partial answers + query -> final answer

    Every LLM stage is best-effort: a failing call is reported with ``print``
    and skipped, it never raises to the caller.
    """

    def __init__(self,
                 reviews_df: pd.DataFrame,
                 hotels_df: pd.DataFrame,
                 users_df: pd.DataFrame,
                 openai_api_key: str,
                 openai_base_url: str = "https://api.openai.com/",
                 model: str = None,
    ):
        """
        Initialize GraphRAG following the paper's workflow.

        Args:
            reviews_df: Review rows; must provide the columns read by
                ``element_instances_extraction`` (``review_id``, ``hotel_id``,
                ``user_id``, ``review_rating``, ``review_text_full``,
                ``stay_duration``, ``stay_type``, ``room_view``).
            hotels_df: Hotel rows (``hotel_id``, ``name_hotel``, ``location``,
                ``descriptions``, ``address``, ``country``).
            users_df: User rows (``user_id``, ``user_country``).
            openai_api_key: API key installed on the ``openai`` module.
            openai_base_url: Base URL installed on the ``openai`` module.
            model: Chat model name used for every LLM call. When omitted, the
                legacy module-level ``MODEL`` constant is used instead.
        """
        # These are module-wide settings of the `openai` package, so every
        # GraphRAG instance in the process shares the most recently set values.
        openai.api_key = openai_api_key
        # NOTE(review): the default base URL has no "/v1" path segment, unlike
        # the Groq URL used by the script below - confirm it works against the
        # official OpenAI endpoint before relying on the default.
        openai.base_url = openai_base_url
        self.reviews_df = reviews_df
        self.hotels_df = hotels_df
        self.users_df = users_df
        self.model = model

    def _resolve_model(self) -> str:
        """Return the chat model name, preferring the constructor argument."""
        configured = getattr(self, "model", None)
        if configured:
            return configured
        try:
            # Backward compatibility: older callers define MODEL at module level
            # instead of passing `model=` to the constructor.
            return MODEL
        except NameError:
            raise RuntimeError(
                "No chat model configured: pass `model=` to GraphRAG(...) "
                "or define a module-level MODEL constant."
            ) from None

    def _chat(self, system_prompt: str, user_content: str) -> str:
        """Send one system+user exchange to the chat API and return the reply text."""
        response = openai.chat.completions.create(
            model=self._resolve_model(),
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
        )
        return response.choices[0].message.content

    def element_instances_extraction(self) -> List[Dict[str, Any]]:
        """
        Step 1: Extract Element Instances
        Convert the raw hotel, review and user rows into structured instances.

        Returns:
            List[Dict[str, Any]]: One dict per row, tagged with ``'type'``
            (``'hotel'``, ``'review'`` or ``'user'``) and ``'id'``. Hotels come
            first, then reviews, then users, each in DataFrame row order.
        """
        instances = [
            {
                'type': 'hotel',
                'id': hotel['hotel_id'],
                'name': hotel['name_hotel'],
                'location': hotel['location'],
                'description': hotel['descriptions'],
                'address': hotel['address'],
                'country': hotel['country'],
            }
            for _, hotel in self.hotels_df.iterrows()
        ]

        instances.extend(
            {
                'type': 'review',
                'id': review['review_id'],
                'hotel_id': review['hotel_id'],
                'user_id': review['user_id'],
                'rating': review['review_rating'],
                'text': review['review_text_full'],
                'stay_duration': review['stay_duration'],
                'stay_type': review['stay_type'],
                'room_view': review['room_view'],
            }
            for _, review in self.reviews_df.iterrows()
        )

        instances.extend(
            {
                'type': 'user',
                'id': user['user_id'],
                'country': user['user_country'],
            }
            for _, user in self.users_df.iterrows()
        )

        return instances

    def determine_relationships(self, instances: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Step 2: Determine relationships between entities (user, review, hotel).

        Args:
            instances (List[Dict[str, Any]]): Instance dicts as produced by
                ``element_instances_extraction``; each needs ``'type'`` and
                ``'id'``, and reviews may carry ``'user_id'`` / ``'hotel_id'``.

        Returns:
            List[Dict[str, Any]]: Edge dicts with ``source_type``, ``source_id``,
            ``target_type``, ``target_id`` and ``relationship``. All ``writes``
            edges come first, then ``associated_with``, then ``interacts_with``.
            A review only produces an edge to a user/hotel that is itself
            present in ``instances``.
        """
        # Per-type lookup tables; later duplicates of an id replace earlier ones.
        users = {item['id']: item for item in instances if item['type'] == 'user'}
        hotels = {item['id']: item for item in instances if item['type'] == 'hotel'}
        reviews = {item['id']: item for item in instances if item['type'] == 'review'}

        def is_known(entity_id: Any, lookup: Dict[Any, Any]) -> bool:
            # Compare against None explicitly: a plain truthiness test would
            # wrongly discard legitimate falsy ids such as 0.
            return entity_id is not None and entity_id in lookup

        def edge(source_type: str, source_id: Any,
                 target_type: str, target_id: Any, relationship: str) -> Dict[str, Any]:
            return {
                'source_type': source_type,
                'source_id': source_id,
                'target_type': target_type,
                'target_id': target_id,
                'relationship': relationship,
            }

        # User -> Review
        writes = [
            edge('user', review.get('user_id'), 'review', review_id, 'writes')
            for review_id, review in reviews.items()
            if is_known(review.get('user_id'), users)
        ]

        # Review -> Hotel
        associated = [
            edge('review', review_id, 'hotel', review.get('hotel_id'), 'associated_with')
            for review_id, review in reviews.items()
            if is_known(review.get('hotel_id'), hotels)
        ]

        # User -> Hotel (indirect, via the review); emitted once per review.
        interacts = [
            edge('user', review.get('user_id'), 'hotel', review.get('hotel_id'), 'interacts_with')
            for review in reviews.values()
            if is_known(review.get('user_id'), users)
            and is_known(review.get('hotel_id'), hotels)
        ]

        return writes + associated + interacts

    def element_summaries(self, relationships: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Step 3: Generate Element Summaries
        Use the LLM to create a concise summary for each relationship.

        Args:
            relationships (List[Dict[str, Any]]): Edge dicts as produced by
                ``determine_relationships``.

        Returns:
            List[Dict[str, Any]]: One dict per successfully summarised edge with
            ``source_id``, ``target_id``, ``relationship`` and ``summary``.
            Edges whose LLM call fails are reported and left out.
        """
        system_prompt = """
                            You are a data summarization assistant. Your task is to create structured, concise, and human-readable summaries of relationships between entities.
                            The output must use the format:
                            Relationships:
                            <source_type> <source_id> -> <relationship> -> <target_type> <target_id>
                            """
        summaries = []

        for relationship in relationships:
            try:
                summary = self._chat(system_prompt, json.dumps(relationship))
                summaries.append({
                    'source_id': relationship['source_id'],
                    'target_id': relationship['target_id'],
                    'relationship': relationship['relationship'],
                    'summary': summary
                })

            except Exception as e:
                # Best-effort: one failed call must not abort the whole batch.
                print(f"Error generating summary for relationship: {e}")

        return summaries

    def graph_communities(self, summaries: List[Dict[str, Any]]) -> Dict[Any, List[Any]]:
        """
        Step 4: Build a graph from the element summaries and detect communities using the Louvain algorithm.

        Args:
            summaries (List[Dict[str, Any]]): Dicts carrying ``source_id``,
                ``target_id`` and optionally ``relationship`` (defaults to
                ``'related_to'``), as produced by ``element_summaries``.

        Returns:
            Dict[Any, List[Any]]: Community label (as assigned by the Louvain
            partition) -> list of node ids in that community. An empty dict is
            returned when community detection fails.
        """
        G = nx.Graph()

        # NOTE(review): nodes are keyed by bare id, without the entity type, so
        # a user and a hotel sharing the same id would collapse into one node.
        for summary in summaries:
            source_id = summary.get('source_id')
            target_id = summary.get('target_id')
            # Explicit None test so that falsy-but-valid ids (e.g. 0) keep their edge.
            if source_id is None or target_id is None:
                continue
            G.add_edge(
                source_id,
                target_id,
                relationship=summary.get('relationship', 'related_to'),
            )

        try:
            partition = community_louvain.best_partition(G)

            communities = {}
            for node, community_id in partition.items():
                communities.setdefault(community_id, []).append(node)

            return communities

        except ImportError:
            print("Louvain community detection library not installed. Install it via `pip install python-louvain`.")
            return {}
        except Exception as e:
            print(f"Error during community detection: {e}")
            return {}

    def community_summaries(self, communities: Dict[Any, List[Any]]) -> Dict[Any, Dict[str, Any]]:
        """
        Step 5: Generate Community Summaries

        Args:
            communities (Dict[Any, List[Any]]): Community label -> list of node
                ids, as produced by ``graph_communities``.

        Returns:
            Dict[Any, Dict[str, Any]]: Community label -> dict with:
            - 'instances': the node ids of the community, as passed in.
            - 'summary': the LLM-generated summary of the community.
            Communities whose LLM call fails are reported and left out.
        """
        system_prompt = """
                            Analyze the following community of instances.
                            Create a comprehensive summary that captures:
                            - Key characteristics
                            - Common themes
                            - Unique attributes of the community
                            """
        community_summaries = {}

        for community_id, community_instances in communities.items():
            try:
                summary = self._chat(system_prompt, json.dumps(community_instances))
                community_summaries[community_id] = {
                    'instances': community_instances,
                    'summary': summary
                }

            except Exception as e:
                print(f"Error generating community summary for {community_id}: {e}")

        return community_summaries

    def community_answers(self,
                           community_summaries: Dict[Any, Dict[str, Any]],
                           query: str) -> List[Dict[str, Any]]:
        """
        Step 6: Generate Community-level Answers

        Args:
            community_summaries (Dict[Any, Dict[str, Any]]): Community label ->
                ``{'instances': ..., 'summary': ...}``, as produced by
                ``community_summaries``.
            query (str): The user's question.

        Returns:
            List[Dict[str, Any]]: One ``{'community_id': ..., 'answer': ...}``
            dict per community whose LLM call succeeded.
        """
        system_prompt = f"""
                            Given the query: "{query}"
                            Analyze the following community summary and generate a relevant answer.
                            """
        community_answers = []

        for community_id, community_data in community_summaries.items():
            try:
                answer = self._chat(system_prompt, json.dumps(community_data))
                community_answers.append({
                    'community_id': community_id,
                    'answer': answer
                })

            except Exception as e:
                print(f"Error generating community answer for {community_id}: {e}")

        return community_answers

    def global_answer(self,
                      community_answers: List[Dict[str, Any]],
                      query: str) -> str:
        """
        Step 7: Generate Global Answer
        Synthesize community-level answers into a comprehensive response.

        Args:
            community_answers (List[Dict[str, Any]]): Dicts with an ``'answer'``
                key, as produced by ``community_answers``.
            query (str): The global query to be answered based on
                community-level information.

        Returns:
            str: The final synthesized answer, or a fixed fallback message when
            the answer could not be generated.
        """
        system_prompt = """
                        Synthesize the following intermediate answers into a single, comprehensive response to the global query.
                        The final response must:
                        - Be concise yet thorough.
                        - Address the query directly.
                        - Integrate all relevant information from the intermediate answers.
                        - Eliminate redundancy and ensure logical coherence.
                        """
        try:
            intermediate_answers = [answer['answer'] for answer in community_answers]
            return self._chat(
                system_prompt,
                f"Query: {query}\nIntermediate Answers: {intermediate_answers}",
            )

        except Exception as e:
            print(f"Error generating global answer: {e}")
            return "Unable to generate a global answer at this time."


if __name__ == "__main__":
    # --- LLM endpoint configuration -------------------------------------
    # MODEL stays a module-level name on purpose: the GraphRAG methods look
    # it up when they call the chat completion API.
    openai_base_url = "https://api.groq.com/openai/v1/"
    MODEL = "llama-3.1-70b-versatile"
    openai_api_key = getpass("Enter your OpenAI API key: ")

    # --- Input data: only the first 10 rows of each warehouse table -----
    hotels = pd.read_parquet("../scraper/booking/output/warehouse/dim_hotel.parquet")[:10]
    reviews = pd.read_parquet("../scraper/booking/output/warehouse/fact_review.parquet")[:10]
    users = pd.read_parquet("../scraper/booking/output/warehouse/dim_user.parquet")[:10]

    query = "What are the best hotels for families in DaLat?"

    graph_rag = GraphRAG(
        reviews_df=reviews,
        hotels_df=hotels,
        users_df=users,
        openai_api_key=openai_api_key,
        openai_base_url=openai_base_url,
    )

    # --- Indexing: instances -> relationships -> summaries -> communities
    instances = graph_rag.element_instances_extraction()
    relationships = graph_rag.determine_relationships(instances)
    summaries = graph_rag.element_summaries(relationships)
    print(f"summaries:{summaries}")

    communities = graph_rag.graph_communities(summaries)
    community_summaries = graph_rag.community_summaries(communities)
    print(f"community_summaries:{community_summaries}")

    # --- Querying: per-community answers, then one synthesized answer ---
    community_answers = graph_rag.community_answers(community_summaries, query)
    print(f"community_answers:{community_answers}")

    global_answer = graph_rag.global_answer(community_answers, query)
    print(global_answer)

summaries:[{'source_id': '70bc55b31ea555f16b4b2cee1d5cf901', 'target_id': '18147552e5edfbf872e83e79b5cba6ed', 'relationship': 'writes', 'summary': 'Relationships:\nuser 70bc55b31ea555f16b4b2cee1d5cf901 -> writes -> review 18147552e5edfbf872e83e79b5cba6ed'}, {'source_id': '527047c66c6af54086db833d12e1127d', 'target_id': '9cc8c8848fe6076d34bdd91f9e5cc704', 'relationship': 'writes', 'summary': 'Relationships:\nuser 527047c66c6af54086db833d12e1127d -> writes -> review 9cc8c8848fe6076d34bdd91f9e5cc704'}, {'source_id': '0b5a7b3db9977926290b4f518d3643a6', 'target_id': '168bf7b378ebd25a4ad7582a9fbcaffd', 'relationship': 'writes', 'summary': 'Relationships:\nuser 0b5a7b3db9977926290b4f518d3643a6 -> writes -> review 168bf7b378ebd25a4ad7582a9fbcaffd'}, {'source_id': '97ae00172b8f4fab84922c3d73be1430', 'target_id': 'c1d262e9150aa7ffbd28a1c33a8814d7', 'relationship': 'writes', 'summary': 'Relationships:\nuser 97ae00172b8f4fab84922c3d73be1430 -> writes -> review c1d262e9150aa7ffbd28a1c33a8814d7'}, {

In [4]:
import warnings
warnings.filterwarnings("ignore")
import os
from typing import Any, Dict, List

import pandas as pd
from dotenv import load_dotenv
from neo4j import GraphDatabase


class HotelRecommender:
    """Hotel recommender backed by a Neo4j graph.

    ``load_data`` writes hotels, amenities, users and reviews into the graph;
    ``recommend_hotels`` ranks hotels by average review rating and review
    count, optionally filtered by amenities, stay type and stay duration.

    The instance owns a Neo4j driver. Release it with ``close()`` or by using
    the instance as a context manager.
    """

    def __init__(self, uri: str, username: str, password: str):
        """Open a Neo4j driver for ``uri`` using basic ``(username, password)`` auth."""
        self.driver = GraphDatabase.driver(uri, auth=(username, password))

    def __enter__(self) -> "HotelRecommender":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def _create_constraints(self) -> None:
        """Create the uniqueness constraints the loader relies on, if missing."""
        constraints = [
            "CREATE CONSTRAINT IF NOT EXISTS FOR (h:Hotel) REQUIRE h.hotel_id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (u:User) REQUIRE u.user_id IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (a:Amenity) REQUIRE a.name IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (d:StayDuration) REQUIRE d.duration IS UNIQUE",
            "CREATE CONSTRAINT IF NOT EXISTS FOR (t:StayType) REQUIRE t.type IS UNIQUE",
        ]
        with self.driver.session() as session:
            for constraint in constraints:
                session.run(constraint)

    def _create_static_nodes(self, session) -> None:
        """Merge the fixed Amenity, StayDuration and StayType nodes."""
        amenities = [
            "Air Conditioning",
            "TV",
            "Balcony",
            "Food Service",
            "Parking",
            "Vehicle Hire",
        ]
        session.run(
            "UNWIND $amenities AS name MERGE (a:Amenity {name: name})",
            amenities=amenities,
        )

        durations = ["Short", "Medium", "Long"]
        session.run(
            "UNWIND $durations AS duration MERGE (d:StayDuration {duration: duration})",
            durations=durations,
        )

        types = ["Business", "Leisure", "Family", "Solo"]
        session.run(
            "UNWIND $types AS type MERGE (t:StayType {type: type})",
            types=types,
        )

    @staticmethod
    def _run_batched(
        session,
        query: str,
        param_name: str,
        frame: pd.DataFrame,
        batch_size: int,
    ) -> None:
        """Run ``query`` once per ``batch_size`` rows, passing them as ``$param_name``."""
        for start in range(0, len(frame), batch_size):
            records = frame.iloc[start : start + batch_size].to_dict("records")
            session.run(query, **{param_name: records})

    def load_data(
        self,
        hotels_df: pd.DataFrame,
        reviews_df: pd.DataFrame,
        batch_size: int = 1000,
    ) -> None:
        """Load hotels and reviews into the graph.

        Args:
            hotels_df: Hotel rows. The load query reads ``hotel_id``,
                ``name_hotel``, ``descriptions``, ``url_hotel``, ``address``
                and the ``has_*`` amenity flags.
            reviews_df: Review rows. Rows missing ``stay_duration`` or
                ``stay_type`` are skipped. Reviews whose ``hotel_id`` matches
                no loaded hotel are ignored by the query's ``MATCH``.
            batch_size: Number of rows sent to the database per query.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # Fail before touching the database: a zero step would make range()
        # raise mid-load, and a negative one would silently load nothing.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        self._create_constraints()

        reviews_df = reviews_df.dropna(subset=["stay_duration", "stay_type"])

        # Hotels plus a HAS_AMENITY edge for every amenity flag that is true.
        hotels_query = """
        UNWIND $hotels AS hotel
        MERGE (h:Hotel {hotel_id: hotel.hotel_id})
        SET h.name = hotel.name_hotel,
            h.description = hotel.descriptions,
            h.url = hotel.url_hotel,
            h.address = hotel.address
        WITH h, hotel
        UNWIND [
            {name: 'Air Conditioning', has: hotel.has_air_conditioning},
            {name: 'TV', has: hotel.has_tv},
            {name: 'Balcony', has: hotel.has_balcony},
            {name: 'Food Service', has: hotel.has_food_serving},
            {name: 'Parking', has: hotel.has_parking},
            {name: 'Vehicle Hire', has: hotel.has_hire_vehicle}
        ] AS amenity
        MATCH (a:Amenity {name: amenity.name})
        WHERE amenity.has = true
        MERGE (h)-[:HAS_AMENITY]->(a)
        """

        # Each review becomes its own node labelled REVIEWED, attached to the
        # user and to its stay duration / stay type nodes.
        # NOTE(review): the review node is linked to the user but not to the
        # hotel it is about; see the matching note in the recommendation query.
        reviews_query = """
        UNWIND $reviews AS review
        MATCH (h:Hotel {hotel_id: review.hotel_id})
        MERGE (u:User {user_id: review.user_id})
        SET u.username = review.username
        MERGE (d:StayDuration {duration: review.stay_duration})
        MERGE (t:StayType {type: review.stay_type})
        CREATE (rel:REVIEWED {
            rating: review.review_rating,
            title: review.review_title,
            text: review.review_text_full,
            room_view: review.room_view
        })<-[:REVIEWED]-(u)-[:REVIEWED]->(h)
        CREATE (rel)-[:HAS_DURATION]->(d)
        CREATE (rel)-[:HAS_TYPE]->(t)
        """

        with self.driver.session() as session:
            self._create_static_nodes(session)
            self._run_batched(session, hotels_query, "hotels", hotels_df, batch_size)
            self._run_batched(session, reviews_query, "reviews", reviews_df, batch_size)

    @staticmethod
    def _build_recommendation_query(
        amenities: List[str] = None,
        stay_type: str = None,
        stay_duration: str = None,
    ) -> tuple:
        """Build the recommendation Cypher text and its filter parameters.

        Returns:
            A ``(query, params)`` pair. ``params`` holds one ``amenity_<i>``
            entry per requested amenity plus ``stay_type`` / ``stay_duration``
            when those filters are active. The query additionally expects
            ``$min_rating`` and ``$limit``, which the caller supplies.
        """
        params: Dict[str, Any] = {}

        # One MATCH per amenity so that a hotel must offer all of them. Values
        # travel as query parameters, never as query text, so quotes or Cypher
        # fragments inside an amenity name cannot alter the query.
        matches = ["MATCH (h:Hotel)"]
        for index, amenity in enumerate(amenities or []):
            key = f"amenity_{index}"
            matches.append(f"MATCH (h)-[:HAS_AMENITY]->(:Amenity {{name: ${key}}})")
            params[key] = amenity

        # Collect the review filters first and join them afterwards, so the
        # clause is valid whichever subset of filters is active.
        review_filters = []
        if stay_type:
            review_filters.append(
                "EXISTS ((rel)-[:HAS_TYPE]->(:StayType {type: $stay_type}))"
            )
            params["stay_type"] = stay_type
        if stay_duration:
            review_filters.append(
                "EXISTS ((rel)-[:HAS_DURATION]->(:StayDuration {duration: $stay_duration}))"
            )
            params["stay_duration"] = stay_duration

        match_clause = " ".join(matches)
        where_clause = (
            "WHERE " + " AND ".join(review_filters) if review_filters else ""
        )

        # NOTE(review): `rel` is reached through the reviewing user only, so a
        # user who reviewed several hotels contributes all of their reviews to
        # each of those hotels. Fixing this needs a review->hotel relationship
        # in the stored graph, which would change the data model.
        query = f"""
        {match_clause}
        OPTIONAL MATCH (h)<-[:REVIEWED]-(:User)-[:REVIEWED]->(rel:REVIEWED)
        {where_clause}
        WITH h, avg(rel.rating) as avg_rating, count(rel) as review_count
        WHERE avg_rating IS NULL OR avg_rating >= $min_rating
        RETURN h.hotel_id as hotel_id,
            h.name as name,
            h.description as description,
            h.address as address,
            coalesce(avg_rating, 0) as avg_rating,
            review_count,
            coalesce(avg_rating * 0.7 + log(review_count + 1) * 0.3, 0) as score
        ORDER BY score DESC
        LIMIT $limit
        """
        return query, params

    def recommend_hotels(
        self,
        amenities: List[str] = None,
        stay_type: str = None,
        stay_duration: str = None,
        min_rating: float = 7.0,
        top_k: int = 5,
    ) -> List[Dict[str, Any]]:
        """Return the ``top_k`` best-scoring hotels matching the filters.

        Args:
            amenities: Amenity names the hotel must all have; no amenity
                filter when empty or ``None``.
            stay_type: Only count reviews of this stay type, when given.
            stay_duration: Only count reviews of this stay duration, when given.
            min_rating: Minimum average rating. Hotels without any counted
                review are kept regardless and get a score of 0.
            top_k: Maximum number of hotels returned.

        Returns:
            One dict per hotel with ``hotel_id``, ``name``, ``description``,
            ``address``, ``avg_rating``, ``review_count`` and ``score``,
            ordered by descending score.
        """
        query, params = self._build_recommendation_query(
            amenities, stay_type, stay_duration
        )
        params["min_rating"] = min_rating
        params["limit"] = top_k

        with self.driver.session() as session:
            results = session.run(query, params)
            return [dict(record) for record in results]


# Read the NEO4J_* connection settings from a local .env file, if present.
load_dotenv()

recommender = HotelRecommender(
    os.getenv("NEO4J_URI"), os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD")
)

# Close the driver even when the query fails, so the connection is not leaked.
try:
    results = recommender.recommend_hotels(
        amenities=["Air Conditioning", "TV"], stay_type="Leisure", stay_duration="Short"
    )
    for result in results:
        print(result)
finally:
    recommender.close()



{'hotel_id': '1c29ed343d736184468c0f500b31a147', 'name': 'Golf Valley Hotel', 'description': 'Tọa lạc tại thành phố Đà Lạt, cách CLB chơi golf Dalat Palace 1,9 km, Golf Valley Hotel cung cấp chỗ nghỉ với nhà hàng, chỗ đỗ xe riêng miễn phí, trung tâm thể dục và quầy bar. Cách Quảng trường Lâm Viên 1,8 km và Hồ Xuân Hương 2,1 km, khách sạn có sảnh khách chung và sân hiên. Chỗ nghỉ cung cấp dịch vụ lễ tân 24 giờ, dịch vụ phòng và dịch vụ thu đổi ngoại tệ cho khách. Khách sạn cung cấp phòng nghỉ gắn máy điều hòa có bố trí bàn làm việc, ấm đun nước, tủ lạnh, két an toàn, TV màn hình phẳng, ban công và phòng tắm riêng với vòi sen. Các phòng tại Golf Valley Hotel đều được trang bị ga trải giường và khăn tắm. Chỗ nghỉ phục vụ bữa sáng kiểu lục địa. Du khách có thể chơi phi tiêu và thuê xe hơi tại Golf Valley Hotel. Khách sạn nằm cách Vườn hoa Đà Lạt 2,2 km và Công viên Yersin Đà Lạt 2,3 km. Sân bay gần nhất là sân bay Liên Khương, cách Golf Valley Hotel 30 km.', 'address': '94 Bui Thi Xuan, Wa