In [2]:
import hashlib
import pandas as pd

In [3]:
def hash_md5(text: str) -> str:
    return str(hashlib.md5(text.encode()).hexdigest())

In [23]:
# Read reviews
reviews = pd.read_parquet("../scraper/booking/output/vn_hotels_reviews.parquet")

# Explode reviews column
reviews = reviews.explode("reviews")
reviews = pd.concat(
    [reviews.drop(["reviews"], axis=1), reviews["reviews"].apply(pd.Series)], axis=1
)

In [24]:
def categorize_stay_duration(stay_duration: str) -> str:
    try:
        duration = int(stay_duration.split(" ")[0])
        if duration < 2:
            return "Short"
        elif duration < 5:
            return "Medium"
        elif duration >= 5:
            return "Long"
    except:
        return "Unknown"

In [11]:
print("Working with reviews:")

# Read reviews
reviews = pd.read_parquet("../scraper/booking/output/vn_hotels_reviews.parquet")

# Explode reviews column
reviews = reviews.explode("reviews")
reviews = pd.concat(
    [reviews.drop(["reviews"], axis=1), reviews["reviews"].apply(pd.Series)], axis=1
)
reviews = reviews.rename(
    columns={
        "hotel_name": "hotel_slug",
        "full_review": "review_text_full_annot",
        "rating": "review_rating",
    }
)
reviews["review_id"] = reviews.apply(
    lambda x: hash_md5(str(x["hotel_slug"]) + str(x["username"])), axis=1
)
reviews["review_text_full"] = (
    reviews["review_title"].fillna("")
    + ". "
    + reviews["review_text_liked"].fillna("")
    + ". "
    + reviews["review_text_disliked"].fillna("")
)

# Categorize stay_duration (Short, Medium, Long)
reviews["stay_duration"] = reviews["stay_duration"].astype(str)
reviews["stay_duration"] = reviews["stay_duration"].apply(categorize_stay_duration)

# Add hotel_id, user_id
reviews["hotel_id"] = reviews.apply(lambda x: hash_md5(x["hotel_slug"]), axis=1)
reviews["user_id"] = reviews.apply(lambda x: hash_md5(str(x["username"])), axis=1)

# reorder columns (move hotel_id, user_id to the front, remove hotel_slug, username)
reviews = reviews[
    [
        "review_id",
        "hotel_id",
        "user_id",
        "review_post_date",
        "review_rating",
        "review_title",
        "review_text_full",
        "review_text_full_annot",
        "review_text_disliked",
        "review_text_liked",
        "stay_duration",
        "stay_type",
        "user_country",
        "room_view",
    ]
]
reviews["stay_duration"] = reviews["stay_duration"].str.extract(r"(\d+)")
reviews["stay_duration"] = (
    pd.to_numeric(reviews["stay_duration"], errors="coerce").fillna(0).astype(int)
)
reviews["review_post_date"] = pd.to_datetime(
    reviews["review_post_date"], format="%m-%d-%Y %H:%M:%S"
)
reviews["review_rating"] = pd.to_numeric(
    reviews["review_rating"], errors="coerce"
).astype(float)
reviews.reset_index(drop=True, inplace=True)
display(reviews.head())

Working with reviews:


0       1 night
0      3 nights
0       1 night
0      4 nights
0       1 night
         ...   
633    3 nights
633    2 nights
633    2 nights
633    2 nights
633     1 night
Name: stay_duration, Length: 6108, dtype: object

AttributeError: 'float' object has no attribute 'split'

In [36]:
print("Working with hotels:")

# Read hotels
hotels = pd.read_csv("../scraper/booking/input/vn_hotels.csv")
hotels = hotels[hotels["location"] == "Đà Lạt"]

hotels["hotel_id"] = hotels.apply(lambda x: hash_md5(x["hotel_slug"]), axis=1)
hotels = hotels[
    [
        "hotel_id",
        "hotel_slug",
        "name_hotel",
        "descriptions",
        "address",
        "location",
        "country",
        "url_hotel",
    ]
]
hotels.reset_index(drop=True, inplace=True)
display(hotels["descriptions"].to_csv("descriptions.csv", index=False))
display(len(hotels))

Working with hotels:


None

634

In [None]:
def extract_entities_llm(description: str) -> Optional[Dict[str, str]]:
    """
    Extract structured address information using OpenAI's API.
    
    Args:
        description (str): The address description in English or Vietnamese.
        
    Returns:
        Optional[Dict[str, str]]: A dictionary with structured address components, or None if extraction fails.
    """
    if not description.strip():
        logger.warning("Input description is empty or invalid.")
        return None

    prompt = f"""
    Analyze the following address description and extract the components into structured JSON. 
    Address descriptions may be in English or Vietnamese.

    Address: "{description}"

    Return a valid JSON object with the following keys:
    - "street": The street name or number, if available.
    - "ward": The ward name or number, if available.
    - "city": The city name.
    - "country": The country name.

    Example format:
    {{
      "street": "98 Đường Thông Thiên Học",
      "ward": "Phường 8",
      "city": "Đà Lạt",
      "country": "Việt Nam"
    }}
    If any component is missing in the address, leave it as null.
    """
    
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )
        
        content = response.choices[0].message["content"]
        logger.info(f"Raw response: {content}")

        # Parse the response content as JSON
        extracted_data = json.loads(content)
        required_keys = {"street", "ward", "city", "country"}
        if required_keys.issubset(extracted_data.keys()):
            return extracted_data
        else:
            logger.error("Response JSON does not contain all expected keys.")
            return None

    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse JSON from response: {e}")
        return None
    except openai.error.OpenAIError as e:
        logger.error(f"OpenAI API error: {e}")
        return None
    except Exception as e:
        logger.exception(f"An unexpected error occurred: {e}")
        return None

In [None]:
from openai import ChatCompletion
from neo4j import GraphDatabase

# Define a function to extract main amenities
def extract_main_amenities(chunks):
    all_features = []
    for chunk in chunks:
        response = ChatCompletion.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system", 
                    "content": """Extract the main hotel amenities in the following categories:
                    - Hotel facilities (e.g., parking, pool, Wi-Fi).
                    - Room amenities (e.g., air conditioning, TV, balcony).
                    - Food & Dining (e.g., breakfast, restaurants).
                    - Nearby attractions (e.g., landmarks, parks).
                    - Transportation (e.g., airport proximity, shuttle services).
                    Focus on the main amenities (not too many) and return the result as a list of amenities."""
                },
                {
                    "role": "user", 
                    "content": chunk
                }
            ],
            max_tokens=200,
            temperature=0.7
        )
        try:
            features = response.choices[0].message.content.split('\n')
            all_features.extend(features)
        except KeyError:
            print("Error processing chunk:", chunk)
            continue
    return list(set(all_features))  # Remove duplicates

# Define a function to insert amenities into Neo4j
def insert_amenities_to_neo4j(driver, hotel_name, amenities):
    with driver.session() as session:
        query = """
        MATCH (h:Hotel {name: $hotel_name})
        UNWIND $amenities as amenity
        MERGE (a:Amenity {name: amenity})
        MERGE (h)-[r:HAS_AMENITY]->(a)
        ON CREATE SET r.weight = 1
        ON MATCH SET r.weight = r.weight + 0.1
        """
        session.run(query, hotel_name=hotel_name, amenities=amenities)

# Main function to process and upload data
def process_hotel_data(chunks, hotel_name, neo4j_uri, neo4j_user, neo4j_password):
    # Extract amenities
    amenities = extract_main_amenities(chunks)
    
    # Connect to Neo4j
    driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
    try:
        insert_amenities_to_neo4j(driver, hotel_name, amenities)
        print(f"Amenities for {hotel_name} have been added to the database.")
    finally:
        driver.close()

# Example usage
chunks = [
    """Tọa lạc tại thành phố Đà Lạt, cách Hồ Xuân Hương 500 m, Dalat Wind Deluxe Hotel là khách sạn 2 sao có lễ tân 24 giờ, sảnh khách chung, Wi-Fi và chỗ đỗ xe riêng miễn phí. Tại khách sạn, tất cả các phòng đều có bàn làm việc, TV màn hình phẳng, ấm đun nước và phòng tắm riêng với chậu rửa vệ sinh (bidet). Một số phòng còn có ban công. Khách sạn phục vụ bữa sáng gọi món hàng ngày. Du khách có thể dùng bữa tại nhà hàng trong khuôn viên, nơi chuyên phục vụ các món nướng/BBQ. Du khách cũng có thể thư giãn trên sân hiên tắm nắng. Các điểm tham quan nổi tiếng gần Dalat Wind Deluxe Hotel bao gồm Vườn hoa Đà Lạt, Quảng trường Lâm Viên và Công viên Yersin Đà Lạt. Sân bay gần nhất là sân bay Liên Khương, cách chỗ nghỉ 23 km."""
]
hotel_name = "Dalat Wind Deluxe Hotel"
neo4j_uri = "bolt://localhost:7687"
neo4j_user = "neo4j"
neo4j_password = "your_password"

process_hotel_data(chunks, hotel_name, neo4j_uri, neo4j_user, neo4j_password)

In [None]:
print("User:")
users = reviews[["username", "user_country"]].drop_duplicates()
users["user_id"] = users.apply(lambda x: hash_md5(str(x["username"])), axis=1)
users = users[["user_id", "username", "user_country"]]
users.reset_index(drop=True, inplace=True)
display(users.head())

In [14]:
fact_review = pd.read_parquet("../scraper/booking/output/warehouse/fact_review.parquet")
dim_hotel = pd.read_parquet("../scraper/booking/output/warehouse/dim_hotel.parquet")
dim_user = pd.read_parquet("../scraper/booking/output/warehouse/dim_user.parquet")

import inspect

def printSchema(df):
    frame = inspect.currentframe().f_back
    variable_name = [k for k, v in frame.f_locals.items() if v is df][0]
    cols = df.columns
    print(variable_name)
    for col in cols:
        print(f"+-- {col}: " + str(df[col].dtype))
        
printSchema(fact_review)
printSchema(dim_hotel)
printSchema(dim_user)

fact_review
+-- review_id: object
+-- hotel_id: object
+-- user_id: object
+-- review_post_date: datetime64[ns]
+-- review_rating: float64
+-- review_title: object
+-- review_text_full: object
+-- review_text_full_annot: object
+-- review_text_disliked: object
+-- review_text_liked: object
+-- stay_duration: int64
+-- stay_type: object
+-- user_country: object
+-- room_view: object
dim_hotel
+-- hotel_id: object
+-- hotel_slug: object
+-- name_hotel: object
+-- descriptions: object
+-- address: object
+-- location: object
+-- country: object
+-- url_hotel: object
dim_user
+-- user_id: object
+-- username: object
+-- user_country: object
