In [3]:
import ast
import json
import logging
import os
import re
from abc import ABC
from abc import abstractmethod
from typing import Any, Dict
from typing import Dict
from typing import List
from typing import Optional
from typing import Union

import openai
import pandas as pd
from llama_index.llms.openai import OpenAI
from neo4j import GraphDatabase
from neo4j import Session
from neo4j.exceptions import Neo4jError

# Configure logging: request INFO level on the root logger and create the
# named logger that the RecommendationPipeline methods write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("RecommendationPipeline")

In [4]:

class BaseHotelRecommender(ABC):
    """
    Abstract base for hotel recommenders backed by a Neo4j database.

    The base class owns the Neo4j driver. Instances can be used as context
    managers so the driver is closed even when an error occurs::

        with SomeRecommender(uri, username, password) as recommender:
            recommender.recommend_hotels(...)
    """

    def __init__(self, uri: str, username: str, password: str):
        """
        Base class for hotel recommendation systems.

        Args:
            uri (str): URI for the Neo4j database.
            username (str): Username for the Neo4j database.
            password (str): Password for the Neo4j database.
        """
        self.driver = GraphDatabase.driver(uri, auth=(username, password))

    def __enter__(self) -> "BaseHotelRecommender":
        """
        Enter a ``with`` block; returns the recommender itself.
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """
        Close the driver when leaving a ``with`` block.

        Returns None, so an exception raised inside the block propagates.
        """
        self.close()

    def close(self) -> None:
        """
        Close the connection to the Neo4j database.
        """
        self.driver.close()

    @abstractmethod
    def _create_constraints(self) -> None:
        """
        Create constraints in the Neo4j database.
        """
        raise NotImplementedError

    @abstractmethod
    def recommend_hotels(self, *args: Any, **kwargs: Any) -> Any:
        """
        Recommend hotels to users.

        Subclasses define their own parameters and return type, so the
        abstract signature accepts any arguments.
        """
        raise NotImplementedError

In [5]:
class CypherGraphHotelRecommender(BaseHotelRecommender):
    def __init__(self, uri: str, username: str, password: str):
        """
        Set up a Cypher-based recommender.

        All connection handling is delegated to the base class.

        Args:
            uri (str): URI for the Neo4j database.
            username (str): Username for the Neo4j database.
            password (str): Password for the Neo4j database.
        """
        super().__init__(uri, username, password)

    def _create_constraints(self) -> None:
        """
        Ensure a uniqueness constraint exists for each keyed node label.

        Every statement uses ``IF NOT EXISTS``; one statement is sent per
        (variable, label, property) triple listed below.
        """
        unique_keys = (
            ("h", "Hotel", "hotel_id"),
            ("u", "User", "user_id"),
            ("a", "Amenity", "name"),
            ("d", "StayDuration", "duration"),
            ("t", "StayType", "type"),
        )
        statements = [
            f"CREATE CONSTRAINT IF NOT EXISTS FOR ({var}:{label}) REQUIRE {var}.{prop} IS UNIQUE"
            for var, label, prop in unique_keys
        ]
        with self.driver.session() as session:
            for statement in statements:
                session.run(statement)

    def _create_static_nodes(self, session: Session) -> None:
        """
        MERGE the fixed vocabulary nodes: amenities, stay durations, stay types.

        Args:
            session: Neo4j session object used to run the MERGE statements.
        """
        # (Cypher statement, parameter name, parameter values) per vocabulary.
        vocabularies = (
            (
                "UNWIND $amenities AS name MERGE (a:Amenity {name: name})",
                "amenities",
                [
                    "Air Conditioning",
                    "TV",
                    "Balcony",
                    "Food Service",
                    "Parking",
                    "Vehicle Hire",
                ],
            ),
            (
                "UNWIND $durations AS duration MERGE (d:StayDuration {duration: duration})",
                "durations",
                ["Short", "Medium", "Long"],
            ),
            (
                "UNWIND $types AS type MERGE (t:StayType {type: type})",
                "types",
                ["Couple", "Family", "Group", "Solo traveller"],
            ),
        )
        for statement, param_name, values in vocabularies:
            session.run(statement, **{param_name: values})

    def load_data(
        self,
        hotels_df: pd.DataFrame,
        reviews_df: pd.DataFrame,
        batch_size: int = 1000,
    ) -> None:
        """
        Populate the graph with hotels, amenities, users and reviews.

        Constraints and the static vocabulary nodes are created first. Reviews
        whose ``stay_duration`` / ``stay_type`` is missing or outside the
        static vocabulary are dropped before loading. Both frames are sent to
        Neo4j in slices of ``batch_size`` rows.

        Note: reviews are written with ``CREATE``, so loading the same reviews
        twice produces duplicate ``Review`` nodes.

        Args:
            hotels_df (pd.DataFrame): DataFrame containing hotel data.
            reviews_df (pd.DataFrame): DataFrame containing review data.
            batch_size (int): Number of rows sent per query.
        """
        self._create_constraints()

        # Discard reviews that cannot be attached to a known duration/type node.
        reviews_df = reviews_df.dropna(subset=["stay_duration", "stay_type"])
        known_duration = reviews_df["stay_duration"].isin(["Short", "Medium", "Long"])
        reviews_df = reviews_df[known_duration]
        known_type = reviews_df["stay_type"].isin(
            ["Couple", "Family", "Group", "Solo traveller"]
        )
        reviews_df = reviews_df[known_type]

        def record_batches(frame):
            # Yield the frame as lists of row dicts, batch_size rows at a time.
            for start in range(0, len(frame), batch_size):
                yield frame.iloc[start : start + batch_size].to_dict("records")

        with self.driver.session() as session:
            self._create_static_nodes(session)

            # Hotels plus a HAS_AMENITY edge for every amenity flag that is true.
            hotels_query = """
            UNWIND $hotels AS hotel
            MERGE (h:Hotel {hotel_id: hotel.hotel_id})
            SET h.name = hotel.name_hotel,
                h.description = hotel.descriptions,
                h.url = hotel.url_hotel,
                h.address = hotel.address
            WITH h, hotel
            UNWIND [
                {name: 'Air Conditioning', has: hotel.has_air_conditioning},
                {name: 'TV', has: hotel.has_tv},
                {name: 'Balcony', has: hotel.has_balcony},
                {name: 'Food Service', has: hotel.has_food_serving},
                {name: 'Parking', has: hotel.has_parking},
                {name: 'Vehicle Hire', has: hotel.has_hire_vehicle}
            ] AS amenity
            MATCH (a:Amenity {name: amenity.name})
            WHERE amenity.has = true
            MERGE (h)-[:HAS_AMENITY]->(a)
            """
            for records in record_batches(hotels_df):
                session.run(hotels_query, hotels=records)

            # Reviews, linked to their user, hotel, stay duration and stay type.
            reviews_query = """
            UNWIND $reviews AS review
            MATCH (h:Hotel {hotel_id: review.hotel_id})
            MERGE (u:User {user_id: review.user_id})
            SET u.username = review.username
            MERGE (d:StayDuration {duration: review.stay_duration})
            MERGE (t:StayType {type: review.stay_type})
            CREATE (r:Review {
                rating: review.review_rating,
                title: review.review_title,
                text: review.review_text_full,
                room_view: review.room_view
            })
            MERGE (u)-[:REVIEWED]->(r)
            MERGE (r)-[:REVIEWED]->(h)
            MERGE (r)-[:HAS_DURATION]->(d)
            MERGE (r)-[:HAS_TYPE]->(t)
            """
            for records in record_batches(reviews_df):
                session.run(reviews_query, reviews=records)

    def recommend_hotels(
        self,
        amenities: Optional[List[str]] = None,
        stay_type: Optional[str] = None,
        stay_duration: Optional[str] = None,
        min_rating: float = 5.0,
        limit: int = 5,
    ) -> List[Dict[str, Any]]:
        """
        Recommend hotels to users.

        Hotels must offer every requested amenity and have at least one review
        matching the optional stay type / duration. Candidates are ordered by
        ``avg_rating * 0.7 + log(review_count + 1) * 0.3``.

        Every user-supplied value is passed as a Cypher parameter; nothing is
        interpolated into the query text.

        Args:
            amenities (List[str]): List of amenities to filter by.
            stay_type (str): Type of stay to filter by.
            stay_duration (str): Duration of stay to filter by.
            min_rating (float): Minimum average rating for hotels.
            limit (int): Number of hotels to return.

        Returns:
            List[Dict[str, Any]]: One dict per hotel with the keys hotel_id,
            name, description, address, avg_rating, review_count and score.
        """
        params: Dict[str, Any] = {"min_rating": min_rating, "limit": limit}
        matches = ["MATCH (h:Hotel)"]

        # One MATCH per amenity (logical AND), each bound to its own parameter.
        for position, amenity in enumerate(amenities or []):
            key = f"amenity_{position}"
            params[key] = amenity
            matches.append(f"MATCH (h)-[:HAS_AMENITY]->(:Amenity {{name: ${key}}})")

        # Bind `r` to the hotel's reviews before filtering on it, so the stay
        # filters narrow existing rows instead of building a hotel x review
        # cartesian product first.
        matches.append("MATCH (r:Review)-[:REVIEWED]->(h)")

        if stay_type:
            params["stay_type"] = stay_type
            matches.append("MATCH (r)-[:HAS_TYPE]->(:StayType {type: $stay_type})")

        if stay_duration:
            params["stay_duration"] = stay_duration
            matches.append(
                "MATCH (r)-[:HAS_DURATION]->(:StayDuration {duration: $stay_duration})"
            )

        query = f"""
        {' '.join(matches)}
        WITH h, avg(r.rating) as avg_rating, count(r) as review_count
        WHERE avg_rating IS NULL OR avg_rating >= $min_rating
        RETURN h.hotel_id as hotel_id,
            h.name as name,
            h.description as description,
            h.address as address,
            coalesce(avg_rating, 0) as avg_rating,
            review_count,
            coalesce(avg_rating * 0.7 + log(review_count + 1) * 0.3, 0) as score
        ORDER BY score DESC
        LIMIT $limit
        """

        with self.driver.session() as session:
            results = session.run(query, params)
            return [dict(record) for record in results]


In [6]:
class GraphRAGHotelRecommender(BaseHotelRecommender):
    def __init__(
        self, uri: str, username: str, password: str, openai_model: str = "gpt-4o-mini"
    ):
        """
        Initialize the GraphRAGHotelRecommender.

        Construction runs community detection against Neo4j and then asks the
        LLM for one summary per community, so it needs a reachable database
        and working OpenAI credentials.

        Args:
            uri (str): URI for the Neo4j database.
            username (str): Username for the Neo4j database.
            password (str): Password for the Neo4j database.
            openai_model (str): OpenAI model to use for LLM operations. Defaults to "gpt-4o-mini".

        Raises:
            RuntimeError: If community detection or summarization fails. The
                Neo4j driver is closed before the error propagates.
        """
        super().__init__(uri=uri, username=username, password=password)
        self.openai_model = openai_model

        # Detect communities and summarize them. If either step fails the
        # caller never receives the instance, so release the driver here.
        try:
            self.communities = self.communities_detection()
            self.summaries = self.communities_summarization(self.communities)
        except Exception:
            self.close()
            raise

    def _create_constraints(self) -> None:
        """
        Satisfy the abstract interface of ``BaseHotelRecommender``.

        This recommender defines no constraints of its own: the call is
        forwarded to the base-class method, which raises
        ``NotImplementedError``.
        """
        return super()._create_constraints()

    def communities_detection(
        self, graph_name: str = "hotelCommunityGraph"
    ) -> Dict[str, Any]:
        """
        Detect communities within the graph database using Louvain algorithm.

        Steps:
            1. Project the whole database into the GDS catalog under
               ``graph_name`` unless a projection with that name already
               exists (an existing projection is reused as-is).
            2. Run ``gds.louvain.write``, storing the result in each node's
               ``communityId`` property.
            3. Read back, per community, the nodes and the relationships whose
               two endpoints are both in that community.

        ``graph_name`` is passed as a query parameter, never interpolated into
        the Cypher text.

        Args:
            graph_name (str): Name of the graph projection in Neo4j.

        Returns:
            Dict[str, Any]: Mapping of ``communityId`` value to
            ``{"nodes": [...], "relationships": [...]}``. Communities with no
            internal relationship do not appear.

        Raises:
            RuntimeError: If Neo4j reports an error; the original
                ``Neo4jError`` is attached as ``__cause__``.
        """
        try:
            with self.driver.session() as session:
                # Check and create graph if necessary
                exists_record = session.run(
                    "CALL gds.graph.exists($graph_name) YIELD exists RETURN exists",
                    graph_name=graph_name,
                ).single()

                if not exists_record["exists"]:
                    # consume() waits for completion so failures surface here.
                    session.run(
                        "CALL gds.graph.project($graph_name, '*', '*')",
                        graph_name=graph_name,
                    ).consume()

                # Run Louvain community detection
                session.run(
                    "CALL gds.louvain.write($graph_name, {writeProperty: 'communityId'})",
                    graph_name=graph_name,
                ).consume()

                # Fetch communities with relationships
                result = session.run("""
                    MATCH (n)
                    WHERE n.communityId IS NOT NULL
                    WITH DISTINCT n.communityId AS community
                    MATCH (n1)-[r]-(n2)
                    WHERE n1.communityId = community AND n2.communityId = community
                    WITH community,
                         COLLECT(DISTINCT {
                             id: id(n1),
                             labels: labels(n1),
                             properties: properties(n1)
                         }) + COLLECT(DISTINCT {
                             id: id(n2),
                             labels: labels(n2),
                             properties: properties(n2)
                         }) AS nodes,
                         COLLECT(DISTINCT {
                             type: type(r),
                             properties: properties(r),
                             source: id(startNode(r)),
                             target: id(endNode(r))
                         }) AS relationships
                    RETURN community, nodes, relationships
                    ORDER BY community
                """)

                communities = {}
                for record in result:
                    # The two collected node lists overlap; keep one entry per id.
                    unique_nodes = {
                        node["id"]: node for node in record["nodes"]
                    }.values()
                    communities[record["community"]] = {
                        "nodes": list(unique_nodes),
                        "relationships": record["relationships"],
                    }
                return communities

        except Neo4jError as e:
            raise RuntimeError(f"Error during community detection: {e}") from e

    def communities_summarization(
        self, communities: Dict[str, Any]
    ) -> List[Dict[str, Any]]:
        """
        Summarize detected communities using LLM.

        One chat-completion request is made per community. Each node is listed
        in the prompt as ``<display name> (<labels>)``.

        Args:
            communities (Dict[str, Any]): Communities data, as returned by
                ``communities_detection``.

        Returns:
            List[Dict[str, Any]]: One ``{"community": id, "summary": text}``
            entry per community.

        Raises:
            RuntimeError: If building the prompt or calling the LLM fails for
                any community; the original exception is attached as
                ``__cause__``.
        """
        summaries = []
        for community_id, community_data in communities.items():
            try:
                described_nodes = []
                for node in community_data["nodes"]:
                    display_name = self._node_display_name(node["properties"])
                    labels = ", ".join(node["labels"])
                    described_nodes.append(f"{display_name} ({labels})")
                nodes = ", ".join(described_nodes)

                prompt = (
                    "You are an AI assistant specializing in hotel recommendations. "
                    "Summarize the characteristics of the following hotels or places "
                    "to support recommendations:\n\n"
                    f"Community Nodes: {nodes}\n\n"
                    "Provide a summary:"
                )
                response = openai.chat.completions.create(
                    model=self.openai_model,
                    messages=[{"role": "system", "content": prompt}],
                )
                # `content` can be None; treat that as an empty summary.
                summary = (response.choices[0].message.content or "").strip()
                summaries.append({"community": community_id, "summary": summary})
            except Exception as e:
                raise RuntimeError(
                    f"Error summarizing community {community_id}: {e}"
                ) from e
        return summaries

    @staticmethod
    def _node_display_name(properties: Dict[str, Any]) -> str:
        """
        Pick a human-readable identifier for a node.

        Only Hotel and Amenity nodes carry ``name`` in the graph loaded by
        ``CypherGraphHotelRecommender``; the other keys cover User
        (``username``), Review (``title``), StayType (``type``) and
        StayDuration (``duration``). Falls back to "Unknown".
        """
        for key in ("name", "username", "title", "type", "duration"):
            value = properties.get(key)
            if value is not None and value != "":
                return str(value)
        return "Unknown"

    def recommend_hotels(self, query: str) -> str:
        """
        Generate hotel recommendations based on the user's query.

        The prompt contains the user query and every community summary stored
        in ``self.summaries``; the database is not queried here.

        Args:
            query (str): User query.

        Returns:
            str: Recommendation response (empty string if the model returns
            no content).

        Raises:
            RuntimeError: If the prompt cannot be built or the LLM call fails;
                the original exception is attached as ``__cause__``.
        """
        try:
            summary_text = "\n".join(
                f"Community {s['community']}: {s['summary']}" for s in self.summaries
            )
            prompt = (
                "Using the following community summaries, generate a hotel recommendation "
                "based on the user's query:\n\n"
                f"User Query: {query}\n\nCommunity Summaries:\n{summary_text}\n\n"
                "Provide recommendations:"
            )
            response = openai.chat.completions.create(
                model=self.openai_model,
                messages=[{"role": "system", "content": prompt}],
            )
            # `content` can be None; avoid calling .strip() on it.
            return (response.choices[0].message.content or "").strip()
        except Exception as e:
            raise RuntimeError(f"Error in recommending hotels: {e}") from e


In [7]:
class RecommendationPipeline:
    def __init__(
        self,
        openai_model: str = "gpt-4o-mini",
        neo4j_uri: str = None,
        neo4j_username: str = None,
        neo4j_password: str = None,
    ):
        """
        Initialize GraphRAG with LLM-driven functionality.

        Args:
            openai_model (str): OpenAI model name. Defaults to "gpt-4o-mini".
            neo4j_uri (str): Neo4j URI.
            neo4j_username (str): Neo4j username.
            neo4j_password (str): Neo4j password.
        """
        self.openai_model = openai_model

        # Initialize the recommender
        self.graph_rag_hotel_recommender = GraphRAGHotelRecommender(
            neo4j_uri, neo4j_username, neo4j_password, openai_model=openai_model
        )

        self.cypher_graph_hotel_recommender = CypherGraphHotelRecommender(
            neo4j_uri,
            neo4j_username,
            neo4j_password,
        )

    def run(self, query: str) -> Dict[str, Any]:
        """
        Run the recommendation pipeline.

        Args:
            query (str): User query.

        Returns:
            Dict[str, Any]: Recommended hotels.
        """
        logger.info("Starting recommendation pipeline.")
        try:
            # First get recommendations from both methods
            cypher_recommendations = self.process_cypher_recommendations(query=query)
            graph_rag_recommendations = self.process_graph_rag_recommendations(
                query=query
            )

            # Then combine using hybrid approach
            final_recommendations = []
            if cypher_recommendations and graph_rag_recommendations:
                final_recommendations = self.process_hybrid_recommendations(
                    cypher_recommendations=cypher_recommendations,
                    graph_rag_recommendations=graph_rag_recommendations,
                    query=query,
                )
            else:
                final_recommendations = (
                    cypher_recommendations or graph_rag_recommendations
                )

            prompt = (
                "Here are the raw recommendations:\n"
                f"Recommendations: {final_recommendations}\n"
                "Your task is to format the output of the recommendations as per the following template. "
                "Please strictly follow the below format and ensure the content is well-structured, clear, and concise:\n\n"
                "1. **[Hotel Name]**: [Short description of the hotel]. Include key features and amenities such as [specific features mentioned in the query]. Provide a brief evaluation of the hotel and its suitability based on the query.\n"
                "For example, if the user query is 'I am looking for a hotel with air conditioning and TV', the output should be formatted as follows:\n\n"
                "Based on your query for a hotel with air conditioning and TV, I recommend considering the following options:\n\n"
                "1. **Dalat Flower Hotel & Spa**: This luxury hotel features comfortable rooms with air conditioning and TVs, "
                "along with upscale amenities including a spa and fine dining. It's perfect for those looking to indulge while ensuring "
                "all essential amenities are covered.\n"
                "2. **Du Miên Hotel**: Known for its strategic location near attractions, Du Miên Hotel offers essential amenities like "
                "air conditioning and televisions. The comfortable and well-appointed rooms make it a good option for both leisure and "
                "business travelers.\n\n"
                "Make sure to strictly follow the structure and format in your response. Each recommendation should be clearly numbered "
                "with the hotel's name in bold, followed by a concise description of the amenities and features. Ensure proper punctuation "
                "and clarity in the sentences. The response should directly address the user's query while maintaining a professional tone."
            )

            response = openai.chat.completions.create(
                model=self.openai_model,
                messages=[{"role": "system", "content": prompt}],
            )
            content = response.choices[0].message.content
            return content
        except Exception as e:
            logger.error(f"Error running recommendation pipeline: {e}")
            raise

    def process_cypher_recommendations(self, query: str) -> List[Dict[str, Any]]:
        """
        Process Cypher-based recommendations.

        Args:
            query (str): User query.

        Returns:
            List[Dict[str, Any]]: Recommended hotels.
        """
        logger.info("Processing Cypher recommendations.")

        try:
            # Extract parameters from the query using LLM
            prompt = (
                "You are an AI assistant specializing in hotel recommendations. "
                "Your task is to extract key details from the user query and return them as a structured JSON object. "
                "The details to extract are:\n"
                "- **amenities**: A list of amenities mentioned in the query (e.g., Wifi, Air Conditioning). Ensure each amenity is properly capitalized.\n"
                "- **stay_type**: The type of stay mentioned, if any (e.g., Family, Business, Solo). If not mentioned, set it to None.\n"
                "- **stay_duration**: The duration of the stay, if specified (e.g., Short, Long, Weekend). If not mentioned, set it to None.\n\n"
                f"User Query: {query}\n\n"
                "Provide the output strictly as a JSON object in the following format:\n"
                "{'amenities': [<list of amenities>], 'stay_type': <stay type>, 'stay_duration': <stay duration>}\n\n"
                "Important:\n"
                "1. Ensure the JSON keys are always lowercase (e.g., 'amenities', 'stay_type', 'stay_duration').\n"
                "2. Capitalize the values appropriately (e.g., 'Wifi', 'Air Conditioning').\n"
                "3. If a parameter is missing or not mentioned, set it to None.\n"
                "4. The response must be a valid JSON object without additional text or explanations.\n\n"
                "Example:\n"
                "For the user query: 'I need a hotel with Wifi and Air Conditioning for a short stay', the output should be:\n"
                '{"amenities": ["Wifi", "Air Conditioning"], "stay_type": None, "stay_duration": "Short"}\n\n'
                "I expect the property names enclosed in double quotes."
                "Now, extract the details and provide the output in the specified JSON format."
            )

            response = openai.chat.completions.create(
                model=self.openai_model,
                messages=[{"role": "system", "content": prompt}],
            )
            content = response.choices[0].message.content.strip()

            # Clean the response
            if content.startswith("```") and content.endswith("```"):
                content = content.strip("```").strip("json").strip()
            cleaned_content = content.replace("None", "null")

            # Extract the parameters
            response_json = json.loads(cleaned_content)
            amenities = response_json.get("amenities", [])
            stay_type = response_json.get("stay_type", "")
            stay_duration = response_json.get("stay_duration", "")

            recommended_hotels = self.cypher_graph_hotel_recommender.recommend_hotels(
                amenities=amenities, stay_type=stay_type, stay_duration=stay_duration
            )

            self.cypher_graph_hotel_recommender.close()
            return recommended_hotels
        except Exception as e:
            logger.error(f"Error in Cypher recommendations: {e}")
            return []

    def process_graph_rag_recommendations(self, query: str) -> List[Dict[str, Any]]:
        """
        Process recommendations using the Graph-based RAG method.

        Params:
            - query: Natural language query.

        Returns:
            List[Dict[str, Any]]: Structured element instances.
        """
        logger.info("Processing Graph RAG recommendations.")

        try:
            recommended_hotels = self.graph_rag_hotel_recommender.recommend_hotels(
                query=query
            )
            self.graph_rag_hotel_recommender.close()
            return recommended_hotels
        except Exception as e:
            logger.error(f"Error in Graph RAG recommendations: {e}")
            return []

    def process_hybrid_recommendations(
        self,
        cypher_recommendations: List[Dict[str, Any]],
        graph_rag_recommendations: List[Dict[str, Any]],
        query: str,
    ) -> List[Dict[str, Any]]:
        """
        Combine recommendations using a hybrid approach.

        Args:
            cypher_recommendations (List[Dict[str, Any]]): Cypher-based recommendations.
            graph_rag_recommendations (List[Dict[str, Any]]): Graph RAG-based recommendations.
            query (str): User query.

        Returns:
            List[Dict[str, Any]]: Combined recommendations.
        """
        logger.info("Combining recommendations using hybrid approach.")
        try:
            # Combine the recommendations
            combined = []
            if cypher_recommendations:
                combined.append(cypher_recommendations)

            if graph_rag_recommendations:
                combined.append(graph_rag_recommendations)

            # Prepare data for LLM-based reranking
            prompt = (
                "You are an AI assistant specializing in hotel recommendations. "
                "Given the user query and the list of five hotel recommendations, your task is to rank them "
                "from most to least relevant based on the user's preferences. Please pay close attention to the following steps:\n\n"
                "1. Carefully analyze the user's query to understand the key preferences and requirements. Focus on specific amenities, "
                "location, price range, or any other details mentioned in the query.\n"
                "2. For each hotel recommendation, evaluate how well it aligns with the user's stated preferences. "
                "Consider factors such as the presence of requested amenities (e.g., air conditioning, TV, proximity to landmarks, etc.), "
                "hotel features, and overall suitability based on the user's needs.\n"
                "3. Rank the recommendations from most to least relevant. The most relevant recommendation should be the one that best "
                "meets the user's needs and preferences, while the least relevant recommendation should be the one that least aligns with "
                "those needs.\n\n"
                f"User Query: {query}\n\n"
                "Recommendations:\n"
                "1. **[Hotel Name]**: [Brief description of the hotel and how it fits the user's preferences].\n"
                "2. **[Hotel Name]**: [Brief description of the hotel and how it fits the user's preferences].\n"
                "3. **[Hotel Name]**: [Brief description of the hotel and how it fits the user's preferences].\n"
                "4. **[Hotel Name]**: [Brief description of the hotel and how it fits the user's preferences].\n"
                "5. **[Hotel Name]**: [Brief description of the hotel and how it fits the user's preferences].\n\n"
                "For each recommendation, consider the following when ranking:\n"
                "- **Amenity match**: How well the hotel matches the specific amenities mentioned in the query (e.g., air conditioning, TV, etc.).\n"
                "- **Location relevance**: How relevant the hotel's location is to the user's query, if applicable (e.g., proximity to tourist attractions, business areas, etc.).\n"
                "- **Suitability**: How well the hotel meets the user's overall needs (e.g., luxury, budget, family-friendly, etc.).\n"
                "- **User preferences**: Any other preferences stated in the query (e.g., preferred hotel type, specific features).\n\n"
                "Finally, return the recommendations in the following order, starting with the most relevant and ending with the least relevant."
            )

            for i, rec in enumerate(combined, 1):
                prompt += f"{i}. {rec}\n"

            prompt += "\nReturn the ranked recommendations as a numbered list."

            # Use the LLM to rerank
            llm = OpenAI(self.openai_model)
            response = llm.complete(prompt).text.strip()

            # Parse the response into a ranked list
            ranked_hotels = []
            for line in response.split("\n"):
                line = line.strip()
                if line and line[0].isdigit():
                    try:
                        index = int(line.split(".")[0]) - 1
                        if 0 <= index < len(combined):
                            ranked_hotels.append(combined[index])
                    except (ValueError, IndexError):
                        continue

            return ranked_hotels or combined
        except Exception as e:
            logger.error(f"Error in hybrid recommendations: {e}")
            return []

In [8]:
# Connection settings are read from the environment so real credentials do not
# have to be stored in the notebook; the fallbacks are the values previously
# hard-coded here. Constructing the pipeline also calls the OpenAI API (for
# community summaries), so OPENAI_API_KEY must be set before running this cell.
pipeline = RecommendationPipeline(
    neo4j_uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    neo4j_username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    neo4j_password=os.environ.get("NEO4J_PASSWORD", "password"),
)



RuntimeError: Error summarizing community 0: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
# Example request: run the full pipeline for one natural-language query and
# print whatever `run` returns.
query = "I am looking for a hotel with air conditioning and TV"

recommendations = pipeline.run(query)
print(recommendations)