In [2]:
import hashlib

import pandas as pd


def hash_md5(text: str) -> str:
    """Return the hexadecimal MD5 digest of ``text``.

    ``text`` is turned into bytes with ``str.encode()`` (default encoding)
    before it is hashed; the result is the 32-character hex string.
    """
    encoded = text.encode()
    digest = hashlib.md5(encoded)
    return digest.hexdigest()


print("Review:")
reviews = pd.read_parquet("../scraper/booking/output/vn_hotels_reviews.parquet")

# One row per review. explode() repeats the parent row's index label for every
# review, so the rows are re-labelled immediately: the column-wise concat below
# then works on unique labels instead of relying on two frames that happen to
# share the same duplicated index.
reviews = reviews.explode("reviews").reset_index(drop=True)
reviews = pd.concat(
    [
        reviews.drop(columns=["reviews"]),
        # Expand every review record into its own set of columns.
        reviews["reviews"].apply(pd.Series),
    ],
    axis=1,
).rename(
    columns={
        "hotel_name": "hotel_slug",
        "full_review": "review_text_full_annot",
        "rating": "review_rating",
    }
)

# Surrogate keys are MD5 digests of the natural keys. They are computed
# column-wise instead of with row-wise DataFrame.apply(axis=1), which builds a
# Series per row just to read one or two values.
# NOTE(review): review_id hashes only hotel_slug + username, so two reviews of
# the same hotel under the same username receive the same id. The formula is
# kept as-is so existing ids do not change -- confirm whether such repeats
# occur in the scraped data.
reviews["review_id"] = [
    hash_md5(str(slug) + str(name))
    for slug, name in zip(reviews["hotel_slug"], reviews["username"])
]
reviews["hotel_id"] = reviews["hotel_slug"].map(hash_md5)
reviews["user_id"] = reviews["username"].map(lambda name: hash_md5(str(name)))

# Plain-text review: title, liked part and disliked part joined with ". ";
# missing parts contribute an empty string.
reviews["review_text_full"] = (
    reviews["review_title"].fillna("")
    + ". "
    + reviews["review_text_liked"].fillna("")
    + ". "
    + reviews["review_text_disliked"].fillna("")
)

# Must be taken before hotel_slug/username are dropped from `reviews` below.
users = reviews[["username", "user_country"]].drop_duplicates()

# Keep only the fact columns (ids first; hotel_slug and username are dropped)
# and fix their dtypes. assign() builds a new frame, so nothing is written into
# the column-subset slice (which would risk a SettingWithCopyWarning).
reviews = (
    reviews[
        [
            "review_id",
            "hotel_id",
            "user_id",
            "review_post_date",
            "review_rating",
            "review_title",
            "review_text_full",
            "review_text_full_annot",
            "review_text_disliked",
            "review_text_liked",
            "stay_duration",
            "stay_type",
            "user_country",
            "room_view",
        ]
    ]
    .assign(
        # First run of digits in the text; rows without one become 0.
        stay_duration=lambda df: (
            pd.to_numeric(
                df["stay_duration"].str.extract(r"(\d+)", expand=False),
                errors="coerce",
            )
            .fillna(0)
            .astype(int)
        ),
        review_post_date=lambda df: pd.to_datetime(
            df["review_post_date"], format="%m-%d-%Y %H:%M:%S"
        ),
        # Unparseable ratings become NaN.
        review_rating=lambda df: pd.to_numeric(
            df["review_rating"], errors="coerce"
        ).astype(float),
    )
    .reset_index(drop=True)
)
display(reviews.head())

print("Hotel:")
hotels = pd.read_csv("../scraper/booking/input/vn_hotels.csv")

# Keep only the hotels whose location is Đà Lạt. The explicit .copy() makes the
# filtered rows an independent frame, so adding hotel_id below is not a write
# into a slice of the CSV frame (chained assignment / SettingWithCopyWarning).
hotels = hotels.loc[hotels["location"] == "Đà Lạt"].copy()

# Same key derivation as reviews["hotel_id"]: MD5 of the hotel slug.
# Series.map works on the single column it needs and still returns a Series
# when the filter matched no rows, whereas row-wise DataFrame.apply(axis=1)
# can hand back a DataFrame for an empty frame.
hotels["hotel_id"] = hotels["hotel_slug"].map(hash_md5)

# Put the key first, keep the descriptive columns, and renumber the rows.
hotels = hotels[
    [
        "hotel_id",
        "hotel_slug",
        "name_hotel",
        "descriptions",
        "address",
        "location",
        "country",
        "url_hotel",
    ]
].reset_index(drop=True)
display(hotels.head())

print("User:")
# assign() builds a new frame instead of writing a column into the result of
# drop_duplicates(), which can trigger a SettingWithCopyWarning.
users = users.assign(
    user_id=users["username"].map(lambda name: hash_md5(str(name)))
)

# user_id is derived from username alone, but `users` was de-duplicated on
# (username, user_country). A username seen with more than one country would
# therefore appear several times with the same user_id. Keep one row per
# user_id (the first occurrence) so the key is unique in this table.
users = (
    users[["user_id", "username", "user_country"]]
    .drop_duplicates(subset="user_id", keep="first")
    .reset_index(drop=True)
)
display(users.head())

Review:


Unnamed: 0,review_id,hotel_id,user_id,review_post_date,review_rating,review_title,review_text_full,review_text_full_annot,review_text_disliked,review_text_liked,stay_duration,stay_type,user_country,room_view
0,18147552e5edfbf872e83e79b5cba6ed,f8dd9fa66a227ffede2937d71ad05921,70bc55b31ea555f16b4b2cee1d5cf901,2024-10-02,10.0,Exceptional,Exceptional. Hotel staff were friendly and hel...,liked: Hotel staff were friendly and helpful. ...,,Hotel staff were friendly and helpful. Really ...,1,Couple,Finland,Deluxe Double Room
1,9cc8c8848fe6076d34bdd91f9e5cc704,f8dd9fa66a227ffede2937d71ad05921,527047c66c6af54086db833d12e1127d,2024-12-29,10.0,Thank you Booking.Com. Good job,Thank you Booking.Com. Good job. Everything. W...,liked: Everything. Will comeback disliked: Not...,Nothing,Everything. Will comeback,3,Family,Vietnam,Deluxe Queen Room with Two Queen Beds
2,168bf7b378ebd25a4ad7582a9fbcaffd,f8dd9fa66a227ffede2937d71ad05921,0b5a7b3db9977926290b4f518d3643a6,2024-12-21,10.0,Exceptional,Exceptional. Good location and cheap with air ...,liked: Good location and cheap with air condit...,,Good location and cheap with air conditioning....,1,Group,United Kingdom,Deluxe Queen Room with Two Queen Beds
3,c1d262e9150aa7ffbd28a1c33a8814d7,f8dd9fa66a227ffede2937d71ad05921,97ae00172b8f4fab84922c3d73be1430,2024-12-21,8.0,Very good,"Very good. It was a big room, very clean. We w...","liked: It was a big room, very clean. We were ...",Around a 15 min walk to restaurants/centre of ...,"It was a big room, very clean. We were moved i...",4,Couple,United Kingdom,Economy Quadruple Room - Basement
4,69dd16b3376ac4f1d97f5fd44fba6fc8,f8dd9fa66a227ffede2937d71ad05921,e35aa53ad2c0b1b2534562691e7b81bc,2024-12-06,10.0,Great value for money and location,Great value for money and location. Staff were...,liked: Staff were very friendly Room comfortab...,,Staff were very friendly Room comfortable. Bed...,1,Group,Slovakia,Deluxe Queen Room with Two Queen Beds


Hotel:


Unnamed: 0,hotel_id,hotel_slug,name_hotel,descriptions,address,location,country,url_hotel
0,f8dd9fa66a227ffede2937d71ad05921,dalat-wind,Dalat Wind Deluxe Hotel,"Tọa lạc tại thành phố Đà Lạt, cách Hồ Xuân Hươ...","Lot R2 03-04. Golf Valley, Ward 2, Đà Lạt, Viê...",Đà Lạt,vn,https://www.booking.com/hotel/vn/dalat-wind.vi...
1,fb401270b8babc2109a6dea17d1ebba2,tropicana-2,Miền Nhiệt Đới 2 Hotel,"Tọa lạc tại thành phố Đà Lạt, cách Quảng trườn...","64 Phan Nhu Thach, ward 1, Đà Lạt, Việt Nam",Đà Lạt,vn,https://www.booking.com/hotel/vn/tropicana-2.v...
2,03adc0096493aa76d4e51842f8a15aa8,reddoorz-near-thung-lung-tinh-yeu,Raon Dalat,"Tọa lạc tại thành phố Đà Lạt, cách CLB chơi go...","46 Tran Khanh Du, Ward 8, Đà Lạt, Việt Nam",Đà Lạt,vn,https://www.booking.com/hotel/vn/reddoorz-near...
3,9bb7114f0c3873e5398496dd7e140f5e,royal-palace-thanh-pho-da-lat,Royal Palace,"Tọa lạc tại thành phố Đà Lạt, cách Công viên Y...","Đường Hà Huy Tập 55 49A Đường Hà Huy Tập, Đà L...",Đà Lạt,vn,https://www.booking.com/hotel/vn/royal-palace-...
4,4a23f6c840d5b9f8d9f38fe289418a31,jolie-house-ap-da-thien,Jolie House,Tọa lạc ở thành phố Đà Lạt thuộc tỉnh Lâm Đồng...,"104 Phù Đổng Thiên Vương, Phường 8, Đà Lạt, Vi...",Đà Lạt,vn,https://www.booking.com/hotel/vn/jolie-house-a...


User:


Unnamed: 0,user_id,username,user_country
0,70bc55b31ea555f16b4b2cee1d5cf901,Heikki,Finland
1,527047c66c6af54086db833d12e1127d,Tractuyen,Vietnam
2,0b5a7b3db9977926290b4f518d3643a6,Neva,United Kingdom
3,97ae00172b8f4fab84922c3d73be1430,Jemma,United Kingdom
4,e35aa53ad2c0b1b2534562691e7b81bc,Marchessault,Slovakia


In [14]:
# Load the three parquet tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
fact_review, dim_hotel, dim_user = (
    pd.read_parquet(table_path)
    for table_path in (
        "../scraper/booking/output/warehouse/fact_review.parquet",
        "../scraper/booking/output/warehouse/dim_hotel.parquet",
        "../scraper/booking/output/warehouse/dim_user.parquet",
    )
)

import inspect

def printSchema(df, name=None):
    """Print a label for ``df`` followed by one ``+-- column: dtype`` line per column.

    Args:
        df: DataFrame whose columns and dtypes are listed.
        name: Optional label for the header line. When omitted, the label is
            the name of a variable in the caller's scope that is bound to
            ``df`` itself. Names that do not start with an underscore are
            preferred (this skips e.g. IPython's ``_`` output cache), and
            ``"<DataFrame>"`` is printed when no variable refers to ``df``
            (for example when the argument is an expression like
            ``df.head()``).
    """
    if name is None:
        current = inspect.currentframe()  # may be None on some interpreters
        caller = current.f_back if current is not None else None
        try:
            bound = (
                [k for k, v in caller.f_locals.items() if v is df]
                if caller is not None
                else []
            )
        finally:
            # Drop the frame references so they cannot keep the caller's
            # locals alive through a reference cycle.
            del current, caller
        public = [k for k in bound if not k.startswith("_")]
        name = (public or bound or ["<DataFrame>"])[0]

    print(name)
    # Iterating df.dtypes (instead of df[col].dtype) also works when column
    # names are duplicated.
    for col, dtype in df.dtypes.items():
        print(f"+-- {col}: " + str(dtype))
        
# Each table is passed through its own variable on purpose: printSchema looks
# up, in the caller's scope, the name bound to the object it receives and
# prints that name as the header. Calling it through a shared loop variable
# could change which name is reported.
printSchema(fact_review)
printSchema(dim_hotel)
printSchema(dim_user)

fact_review
+-- review_id: object
+-- hotel_id: object
+-- user_id: object
+-- review_post_date: datetime64[ns]
+-- review_rating: float64
+-- review_title: object
+-- review_text_full: object
+-- review_text_full_annot: object
+-- review_text_disliked: object
+-- review_text_liked: object
+-- stay_duration: int64
+-- stay_type: object
+-- user_country: object
+-- room_view: object
dim_hotel
+-- hotel_id: object
+-- hotel_slug: object
+-- name_hotel: object
+-- descriptions: object
+-- address: object
+-- location: object
+-- country: object
+-- url_hotel: object
dim_user
+-- user_id: object
+-- username: object
+-- user_country: object
