In [18]:
import pandas as pd

In [19]:
df = pd.read_csv('../../data/processed/realestates_kh_v6.csv', encoding='latin1')

In [20]:
print(df)

            id      price  bedrooms  bathrooms  land_area address_subdivision  \
0     185714.0  1100000.0       7.0        7.0      124.0          Phnom Penh   
1     185539.0   680000.0       4.0        5.0       80.0          Phnom Penh   
2     217752.0   550000.0       3.0        4.0       66.0          Phnom Penh   
3     228897.0   750000.0       9.0       10.0      116.0          Phnom Penh   
4     190024.0   420000.0       5.0        6.0       65.0          Phnom Penh   
...        ...        ...       ...        ...        ...                 ...   
3388  245046.0   270000.0       NaN        NaN   270000.0          Phnom Penh   
3389  219672.0    63000.0       NaN        1.0    63000.0          Phnom Penh   
3390  242728.0   550000.0       6.0        6.0   550000.0          Phnom Penh   
3391  205451.0   165000.0       1.0        1.0   165000.0          Phnom Penh   
3392   58934.0        NaN       NaN        NaN    30000.0          Phnom Penh   

     address_locality  addr

In [21]:
import re
import numpy as np

def extract_floor_area(text):
    if not isinstance(text, str):
        return np.nan
    match = re.search(r'(\d{1,5})\s*(?:m2|sqm|m²|sqm)', text.lower())
    return float(match.group(1)) if match else np.nan

def extract_total_floors(text):
    if not isinstance(text, str):
        return np.nan
    # Match "Total Floor: 3", "Total floors: 3", "Floors ( s ) : 3", "stories: 04 stories house", "14 Floors"
    patterns = [
        r'total\s*floors?\s*[:\-]?\s*(\d{1,2})',
        r'floors?\s*\(.*\)\s*[:\-]?\s*(\d{1,2})',
        r'(\d{1,2})\s*[-]?\s*storey',
        r'(\d{1,2})\s*[-]?\s*stories',
        r'(\d{1,2})\s*[-]?\s*floors?',
        r'building stories\s*[:\-]?\s*(\d{1,2})',
        r'total\s*(\d{1,2})\s*floors?',
        r'stories\s*[:\-]?\s*(\d{1,2})',
        r'(\d{1,2})\s*floor'
    ]
    for pat in patterns:
        match = re.search(pat, text.lower())
        if match:
            return float(match.group(1))
    return np.nan

def extract_floor_number(text):
    if not isinstance(text, str):
        return np.nan
    # Match "Floor: 21F", "Floor 3", "on the 3rd floor", "first floor", "second floor", "E0", "E1", "E2"
    patterns = [
        r'floor\s*[:\-]?\s*(\d{1,2})',
        r'(\d{1,2})[a-z]{0,2}\s*floor',
        r'floor\s*(\d{1,2})[a-z]{0,2}',
        r'(\d{1,2})[a-z]{0,2}\s*floors?',
        r'(\d{1,2})\s*[-]?\s*storey',
        r'(\d{1,2})\s*[-]?\s*stories',
        r'floor\s*[:\-]?\s*([a-z]\d{0,2})',  # E0, E1, etc.
    ]
    for pat in patterns:
        match = re.search(pat, text.lower())
        if match:
            try:
                return float(re.sub(r'[^\d]', '', match.group(1)))
            except:
                continue
    # Handle "first floor", "second floor", etc.
    word2num = {'first': 1, 'second': 2, 'third': 3, 'fourth': 4, 'fifth': 5}
    for word, num in word2num.items():
        if word + ' floor' in text.lower():
            return float(num)
    return np.nan

def extract_unit_number(text):
    if not isinstance(text, str):
        return np.nan
    match = re.search(r'unit(?:\s*number)?\s*[:\-]?\s*(\d{1,5})', text.lower())
    return float(match.group(1)) if match else np.nan

df['floor_area'] = df['information'].apply(extract_floor_area)
df['floor_number'] = df['information'].apply(extract_floor_number)
df['total_floors'] = df['information'].apply(extract_total_floors)
df['unit_number'] = df['information'].apply(extract_unit_number)



In [22]:
df.isnull().sum()

id                 0
price              1
bedrooms         561
bathrooms        580
land_area          0
                ... 
f_unused           0
floor_area      2360
floor_number    2350
total_floors    2576
unit_number     3369
Length: 119, dtype: int64

In [23]:
# Count nulls for selected columns, grouped by category_name
cols = ['floor_area', 'floor_number', 'total_floors', 'unit_number']
null_summary = df.groupby('category_name')[cols].apply(lambda x: x.isnull().sum())

# Add a column for the total count of each category
category_counts = df['category_name'].value_counts().rename('total_count')
null_summary = null_summary.merge(category_counts, left_index=True, right_index=True)

null_summary

Unnamed: 0_level_0,floor_area,floor_number,total_floors,unit_number,total_count
category_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Apartment,48,76,136,157,160
Business,11,6,9,12,12
CommercialShophouse,4,4,3,4,4
Condo,202,290,607,649,669
Factory,2,2,2,3,3
Flat,214,200,158,235,235
Food/Hospitality,4,3,3,4,4
Hotel,2,2,1,3,3
House,366,313,344,398,398
Land,29,53,53,53,53


In [24]:
# Sort by price_per_m2 (descending)
df = df.sort_values(by='price_per_m2', ascending=False)

# Select the top 50 rows and display with more columns
styled = df.head(50)[[ 
    'information', 'category_name', 'floor_area', 'floor_number', 
    'total_floors', 'unit_number'
]].style.set_table_styles({
    'category_name': [{'selector': 'td', 'props': [('min-width', '20px')]}],
    'floor_area': [{'selector': 'td', 'props': [('min-width', '20px')]}],
    'floor_number': [{'selector': 'td', 'props': [('min-width', '20px')]}],
    'total_floors': [{'selector': 'td', 'props': [('min-width', '20px')]}],
    'unit_number': [{'selector': 'td', 'props': [('min-width', '20px')]}],
    'information': [{'selector': 'td', 'props': [('min-width', '300px'), ('max-width', '500px'), ('white-space', 'pre-wrap')]}]
})
styled

Unnamed: 0,information,category_name,floor_area,floor_number,total_floors,unit_number
0,"Shop House for Sale Now In Business Area at RiversideÂ !!!/T'S HOME Real Estate Phnom Penh. ??Property ID : TS-108 ??Room Type :Â 7 Bedroom / 7 Bathroom ?? Price : 1,100,000USD ?? Land Size : 4m Ã 31m ?? House Size: 4m Ã 31m ?? Total Floor: 3 Location : Riverside Area, Phnom Penh.",Shophouse,,3.0,3.0,
1,"Shop House for Sale Now In Business Area at BKK3 !!!/T'S HOME Real Estate Phnom Penh. ??Property ID : TS-024 ??Room Type : 4 Bedroom / 5 Bathroom ?? Price : 680,000USD ?? Land Size : 5m Ã 16m ?? House Size: 4m Ã 16m ?? Total Floor: Ground Floor Location : BKK3 Phnom Penh.",House,,,,
2,"Price: $550.000 (negotiable) House size: 5m x 11m Land size: 6m x 11m Ground Floor, first floor, and second floor with a beautiful terrace. 3 bedrooms, 4bathrooms, 1 living room, 2 kitchens. * It takes only few minutes to Preah Sisowath High School. *The house face North (Benefit: the house have rental monthly income)",Retreat,,1.0,,
3,"- Address: Commune Tonle Bassak , District Chamkarmon , Phnom Penh - Land area : 11m x 12.5m - Built up area : 11m x 12.5m - Floors ( s ) : 3 - Price : $750,000",Villa,,,3.0,
4,"Town House For Sale in Chroy Changvar Area, Phnom Penh/T'S HOME Real Estate Phnom Penh ??Property ID : TS-379 ??Bedroom : 5 ??Bathroom : 6 ?? Price : 420,000 USD ?? Land Size : 4.1m x 16m ?? Floor Size : 4.1m x 16m",House,,,,
5,"Town House for Sale Now In Business Area Olympic !!!/T'S HOME Real Estate Phnom Penh. ??Property ID : TS-102 ??Room Type : 3 Bedroom / 3 Bathroom ?? Price : 420,000USD ?? Land Size : 65? ?? House Size: 64? ?? Total Floor: 2 Location : Olympic Phnom Penh.",House,,2.0,2.0,
6,"This 5-story building in Daun Penh is now available for sale or rent . It sits on a land size of 186 sqm with a building size of 13m x 14m . With a hard title , this property is a great choice for long-term investment. Located in a highly convenient area , this building is surrounded by key attractions such as the Royal Palace, National Museum, the Riverside, and Central Market . It is also close to commercial buildings, clinics, residential areas, and international schools , making it perfect for businesses, offices, or rental opportunities. This property offers a great space for commercial use, such as an office, retail shop, or boutique hotel. The location attracts high foot traffic and is ideal for investment. DonÂt miss this chance to own or lease a property in a prime area. Contact us today for more details! Key Features: Â Land size: 186 sqm Â Building size: 13m x 14m Â Total 5 floors (E0 to E4) Â Hard title Â Convenient location Â Near Royal Palace, National Museum, Riverside, Central Market, commercial buildings, clinics, residential areas, and international schools",other,186.0,5.0,5.0,
7,"Stylish modern villa for sale at Norea Cove Residences is an amazing luxury resort villa project that has never been seen on Koh Norea in a new style. This villa is located opposite of Diamond Island (Koh Pich) and you just travel 4 minutes from AEON Mall-1, 5 minutes from Independence Monument and 5 minutes from Riverside or Royal Palace, Phnom Penh. ? This project is development of 3.8 hectares with 34 houses, making life fresh, a beautiful view, natural beauty, private relaxing. ? Type A has 10 houses and facing to the Mekong River ? Land size: 26.37m x 38.3m = 1,010 sqm ? House size: 2,737 sqm ? Stories: 04 stories house ? 06 Bedrooms | 13 Bathrooms ? 10 Cars parking lots ? The house has equipped with a modern elevator that can carry 23 persons ? Swimming pool 5m x 25m with depth 1.2m The selling price is starting from $6,500,000 to $8,500,000 negotiable and for the payment, all customers can pay in Cambodia or make payments abroad. Comes with 3 amenities, big open living area, big terraceÂs view, security guard post, elevator, maid's room, driver's room, laundry room, dining area, dry and wet kitchen suite, central air conditioning inside the villa, steam room, cinema room and swimming pool. Don't miss out on this chance to be part of the bustling city center. Contact us now to explore the potential of this remarkable property and secure your place in the heart of Koh Norea (Norea New City), Phnom Penh!",Villa,10.0,4.0,4.0,
8,"Flat (3 floors) near Monivong Thom Road and Phumin Administration School need to sell urgently * Price only: $ 450,000 (negotiable) *This price includes transfer of title *Lot area: 70 sqm (size in plot) * House size: 5.2m x 12m * Parking in front of the street 5 m * 4 Bedrooms & 5 Bathrooms * house facing direction: east - Location: 150m from Phomin Administrative School, 700m from Boeung Keng Kang Market and 300m from Monivong Thom Road. - Can pay installment through bank for 20 years but need to pay 30% of the house price in advance.",Flat,70.0,3.0,3.0,
9,"House near Sunthor Muk High School and Depo Market need to sell urgently * Sale price only: $899,900 dollars (negotiable) *This price includes transfer of title * Land size: 10m x 14m * House size: 10m x 9.8m * 4.2m left of front land * 6 Bedrooms & 5 Bathrooms * home facing : south - Location: 450m from Sunthor Muk High School, 600m from Depo Market and 550m from Depot Market Community School. - Can pay installment through bank for 20 years but need to pay 30% of the house price in advance.",House,,,,
