In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('../../data/processed/realestates_kh_v2_3_2.csv', encoding='latin1')

In [3]:
df.isnull().sum()

id                       1
headline                 1
price_display           16
bedrooms               943
bathrooms              950
land_area              402
address_subdivision      2
address_locality         2
address_line_2          12
address_line_1         149
category_name            2
is_parent                3
type                     2
information             39
latitude                 2
longitude                3
dtype: int64

In [4]:
def extract_land_area(info):
    if pd.isnull(info):
        return None

    # Normalize corrupted multiplication signs
    info = info.replace('Ã', 'x').replace('×', 'x').replace('X', 'x')

    patterns = [
        r'(\d+(?:[.,]\d+)?)\s*(?:m²|m2|sqm|sq\.?m|square\s*meters?|meters?\s*squared)',  # direct area
        r'(\d+(?:[.,]\d+)?)\s*[\?㎡]',  # corrupted m²
        r'unit size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*sq\.?m',
        r'property size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*sq\.?m',
        r'it measures\s*(\d+(?:[.,]\d+)?)\s*sqm',
        r'gross size.*?(\d+(?:[.,]\d+)?)\s*m²',
        r'net size.*?(\d+(?:[.,]\d+)?)\s*m²',
        r'living space.*?(\d+(?:[.,]\d+)?)\s*(?:m²|m2|sqm|sq\.?m|square\s*meters?|meters?\s*squared)',
        r'(\d+(?:[.,]\d+)?)\s*hectare|(\d+(?:[.,]\d+)?)\s*Ha',  # hectares
        # Dimension patterns
        r'(\d+(?:[.,]\d+)?)\s*m?\s*[x*]\s*(\d+(?:[.,]\d+)?)\s*m?',  # 5 x 13, 4m x 16m, etc.
        r'(\d+(?:[.,]\d+)?)\s*wide.*?(\d+(?:[.,]\d+)?)\s*long',     # 5m wide, 13m long
        r'floor size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*[x*]\s*(\d+(?:[.,]\d+)?)',
        r'house size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*[x*]\s*(\d+(?:[.,]\d+)?)',
        r'land area\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*[x*]\s*(\d+(?:[.,]\d+)?)',
        r'land size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*[x*]\s*(\d+(?:[.,]\d+)?)',
        # Fallback: just a number with "land size"
        r'land size\s*[:=]?\s*(\d+(?:[.,]\d+)?)',
        r'home size\s*[:=]?\s*(\d+(?:[.,]\d+)?)',
    ]

    candidates = []

    for pat in patterns:
        for match in re.finditer(pat, info, re.IGNORECASE):
            groups = match.groups()
            # Hectares
            if 'hectare' in pat or 'Ha' in pat:
                for g in groups:
                    if g:
                        try:
                            area = float(g.replace(',', '')) * 10000  # 1 ha = 10,000 m²
                            candidates.append(area)
                        except:
                            continue
            # Dimensions
            elif len(groups) >= 2 and groups[0] and groups[1]:
                try:
                    a = float(groups[0].replace(',', ''))
                    b = float(groups[1].replace(',', ''))
                    area = a * b
                    candidates.append(area)
                except:
                    continue
            # Single value
            elif len(groups) >= 1 and groups[0]:
                try:
                    area = float(groups[0].replace(',', ''))
                    candidates.append(area)
                except:
                    continue

    # Special: price per sqm and total price
    price_per_sqm_match = re.search(r'(\d{1,3}(?:,\d{3})*|\d+)\s*\$?\s*(?:per|/)\s*(?:square\s*metre|sqm|m2|m²)', info, re.IGNORECASE)
    price_match = re.search(r'(\d{1,3}(?:,\d{3})*|\d+)\s*\$+', info)
    if not candidates and price_per_sqm_match and price_match:
        try:
            price_per_sqm = float(price_per_sqm_match.group(1).replace(',', ''))
            price_val = float(price_match.group(1).replace(',', ''))
            area = price_val / price_per_sqm
            candidates.append(area)
        except:
            pass

    # If multiple candidates, pick the largest (usually land > house)
    if candidates:
        return max(candidates)
    return None

In [5]:

# Apply extraction only to rows where land_area is null or 0
mask = df['land_area'].isnull() | (df['land_area'] == 0)
df.loc[mask, 'land_area_extracted'] = df.loc[mask, 'information'].apply(extract_land_area)

# Fill missing land_area with extracted value if available
df['land_area'] = df['land_area'].combine_first(df['land_area_extracted'])

# Drop the helper column
df.drop(columns=['land_area_extracted'], inplace=True)

# Save the updated DataFrame
# df.to_csv('../../data/processed/realestates_kh_v2_3_2.csv', index=False, encoding='latin1')

In [6]:
df.isnull().sum()

id                       1
headline                 1
price_display           16
bedrooms               943
bathrooms              950
land_area              360
address_subdivision      2
address_locality         2
address_line_2          12
address_line_1         149
category_name            2
is_parent                3
type                     2
information             39
latitude                 2
longitude                3
dtype: int64

In [7]:
# Show information patterns for rows where land_area is still missing
missing_info = df[df['land_area'].isnull()]['information']

for i, info in enumerate(missing_info, 1):
    print(f"{i}. {info}\n{'-'*80}")

1. There are 3 bedrooms, 1 living room and 4 bathrooms, 5m wide, 13m long, located in Khan Prek Phnov
--------------------------------------------------------------------------------
2. Condo for sale Property code: ACD24-029 Price: 34,000$ (Can negotiation) Floor Floor: 13F Gross size Common area: 20.6mÂ² Net size Interior area: 14.52mÂ² Property type Listing type: Studio room Bathroom Bathroom: 1 Location Address: Tuol Kouk district, Phnom Penh
--------------------------------------------------------------------------------
3. 1-3Bedroom Apartment for Sale-Boueng Keng Kang I (BKK1) ,Rentex is proud to present this stunning property located in the Phnom Penh Neighborhood of Chomkamon , This property is available for lease and is suitable for professionals looking to stay close to the center of Phnom Penh in a prestigious location. Properties Code : 1109 1Bedroom : $150000-$200000 2Bedroom : $250000-$310000 3Bedroom : $850000-$500000 Features include: -Wifi & Cable Tv -Kitchen -Living-

In [10]:
def extract_land_area_v7(info):
    if pd.isnull(info):
        return None

    info = info.replace('Ã', 'x').replace('×', 'x').replace('X', 'x')

    patterns = [
        r'(\d+(?:[.,]\d+)?)\s*(?:m²|m2|sqm|sq\.?m|square\s*meters?|meters?\s*squared)',
        r'(\d+(?:[.,]\d+)?)\s*[\?㎡]',
        r'unit size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*sq\.?m',
        r'property size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*sq\.?m',
        r'it measures\s*(\d+(?:[.,]\d+)?)\s*sqm',
        r'gross size.*?(\d+(?:[.,]\d+)?)\s*m²',
        r'net size.*?(\d+(?:[.,]\d+)?)\s*m²',
        r'living space.*?(\d+(?:[.,]\d+)?)\s*(?:m²|m2|sqm|sq\.?m|square\s*meters?|meters?\s*squared)',
        r'(\d+(?:[.,]\d+)?)\s*hectare|(\d+(?:[.,]\d+)?)\s*Ha',
        r'(\d+(?:[.,]\d+)?)\s*m?\s*[x*]\s*(\d+(?:[.,]\d+)?)\s*m?',
        # More flexible wide/long pattern
        r'(\d+(?:[.,]\d+)?)\s*m?\s*[\w\s,;]*?wide[\w\s,;]*?(\d+(?:[.,]\d+)?)\s*m?\s*[\w\s,;]*?long',
        # width and length with "is" or "of"
        r'(\d+(?:[.,]\d+)?)\s*m?\s*width[\w\s,;]*?(?:is|of)?[\w\s,;]*?(\d+(?:[.,]\d+)?)\s*m?\s*length',
        r'floor size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*[x*]\s*(\d+(?:[.,]\d+)?)',
        r'house size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*[x*]\s*(\d+(?:[.,]\d+)?)',
        r'land area\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*[x*]\s*(\d+(?:[.,]\d+)?)',
        r'land size\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*[x*]\s*(\d+(?:[.,]\d+)?)',
        r'land size\s*[:=]?\s*(\d+(?:[.,]\d+)?)',
        r'home size\s*[:=]?\s*(\d+(?:[.,]\d+)?)',
        r'over\s*(\d+(?:[.,]\d+)?)\s*square\s*metres',
        r'Gross Area[:=]?\s*(\d+(?:[.,]\d+)?)',
        r'Net Area[:=]?\s*(\d+(?:[.,]\d+)?)',
        r'area\s*[:=]?\s*(\d+(?:[.,]\d+)?)\s*(?:m²|m2|sqm|sq\.?m)?',
    ]

    candidates = []

    for pat in patterns:
        for match in re.finditer(pat, info, re.IGNORECASE):
            groups = match.groups()
            if 'hectare' in pat or 'Ha' in pat:
                for g in groups:
                    if g:
                        try:
                            area = float(g.replace(',', '')) * 10000
                            candidates.append(area)
                        except:
                            continue
            elif len(groups) >= 2 and groups[0] and groups[1]:
                try:
                    a = float(groups[0].replace(',', ''))
                    b = float(groups[1].replace(',', ''))
                    area = a * b
                    candidates.append(area)
                except:
                    continue
            elif len(groups) >= 1 and groups[0]:
                try:
                    area = float(groups[0].replace(',', ''))
                    candidates.append(area)
                except:
                    continue

    price_per_sqm_match = re.search(r'(\d{1,3}(?:,\d{3})*|\d+)\s*\$?\s*(?:per|/)\s*(?:square\s*metre|sqm|m2|m²)', info, re.IGNORECASE)
    price_match = re.search(r'(\d{1,3}(?:,\d{3})*|\d+)\s*\$+', info)
    if not candidates and price_per_sqm_match and price_match:
        try:
            price_per_sqm = float(price_per_sqm_match.group(1).replace(',', ''))
            price_val = float(price_match.group(1).replace(',', ''))
            area = price_val / price_per_sqm
            candidates.append(area)
        except:
            pass

    if candidates:
        return max(candidates)
    return None

In [11]:
# Apply only to rows where land_area is null or 0
mask = df['land_area'].isnull() | (df['land_area'] == 0)
df.loc[mask, 'land_area_extracted'] = df.loc[mask, 'information'].apply(extract_land_area_v6)
df['land_area'] = df['land_area'].combine_first(df['land_area_extracted'])
df.drop(columns=['land_area_extracted'], inplace=True)
df.isnull().sum()

id                       1
headline                 1
price_display           16
bedrooms               943
bathrooms              950
land_area              357
address_subdivision      2
address_locality         2
address_line_2          12
address_line_1         149
category_name            2
is_parent                3
type                     2
information             39
latitude                 2
longitude                3
dtype: int64

In [12]:
# Drop rows where land_area is still null
df = df[~df['land_area'].isnull()]

# Check for any remaining nulls
df.isnull().sum()

id                       0
headline                 0
price_display            1
bedrooms               821
bathrooms              830
land_area                0
address_subdivision      0
address_locality         0
address_line_2           4
address_line_1         110
category_name            0
is_parent                0
type                     0
information             18
latitude                 0
longitude                0
dtype: int64

In [13]:
df.to_csv('../../data/processed/realestates_kh_v2_3_3.csv', index=False, encoding='latin1')