In [39]:
import re
import pandas as pd

In [40]:
df = pd.read_csv('../../data/raw/realestates_kh_v2.csv')

In [41]:
df = df[df['address_subdivision'] == 'Phnom Penh']

In [42]:
df.shape

(4210, 24)

In [43]:
df.isnull().sum()

Unnamed: 0                0
id                        0
headline                  0
price_display            13
rent_display           3597
bedrooms               1028
bathrooms              1035
land_area              2194
thumbnail_url             0
thumbnail_urls            0
garages                3175
location                  0
address_subdivision       0
address_locality          0
address_line_2            9
address_line_1          148
category_name             0
is_parent                 0
type                      0
information              38
source_url                0
language                  0
latitude                  0
longitude                 0
dtype: int64

In [44]:
df['land_area'] = pd.to_numeric(df['land_area'], errors='coerce')  # ensure numeric
missing = df['land_area'].isna().sum()
print(f"❓ Missing land_area: {missing} rows")


❓ Missing land_area: 2194 rows


In [45]:
def is_missing(val):
    return pd.isna(val) or str(val).strip().lower() in ["", "na", "n/a", "null"]


In [46]:
df[df['land_area'].isna() | (df['land_area'] == 0)][['information']].head(10)


Unnamed: 0,information
0,A flat (2 floors) near Hengly market and near ...
1,Twin Villa (Twin Villa) in Borey Highland 2005...
2,"There are 3 floors 2 bedrooms, 2 bathrooms can..."
3,"I have a business house, I want to sell a hous..."
4,Apartment Instast 4M * 15.5m There are 45M kit...
5,Villa (Victoria) in Orkide Villa (Orkide Villa...
9,"Unit Type H, 2-bedroom, 2-bathroom Floor area:..."
11,1-3Bedroom Apartment for Sale-Boueng Keng Kang...
12,"• 7th and 8th floor: starting from USD 2,900/s..."
14,DABEST Property ID: PPA56 • Call DaBest Proper...


In [47]:
def extract_land_area(text):
    if not isinstance(text, str):
        return None

    text = text.lower()

    # Match square meters (e.g., 98,514 sqm)
    sqm = re.search(r"(\d{2,7}(?:[.,]\d+)?)[\s\-]*(sqm|m2|square meters?)", text)
    if sqm:
        return float(sqm.group(1).replace(",", "").strip())

    # Match hectares (e.g., 15.5 ha or 20 hectares)
    ha = re.search(r"(\d{1,4}(?:[.,]\d+)?)[\s\-]*(ha|hectares?)", text)
    if ha:
        return float(ha.group(1).replace(",", "").strip()) * 10000

    # Match dimensions (e.g., 8 x 20m or 8.4 x 12.5m)
    dim = re.search(r"(\d{1,3}(?:\.\d+)?)\s*[x×*]\s*(\d{1,3}(?:\.\d+)?)\s*m", text)
    if dim:
        try:
            width = float(dim.group(1))
            length = float(dim.group(2))
            return width * length
        except:
            return None

    # Match values with keywords like 'land area', 'total size', 'lot'
    keyword_number = re.search(r"(land|lot|size|area)[^\d]{0,15}(\d{4,7})", text)
    if keyword_number:
        return float(keyword_number.group(2).replace(",", ""))

    return None

In [48]:
filled = 0
df['land_area'] = pd.to_numeric(df['land_area'], errors='coerce')

for i, row in df.iterrows():
    if is_missing(row.get("land_area")):
        info = row.get("information", "")
        value = extract_land_area(info)
        if value:
            df.at[i, "land_area"] = value
            filled += 1

print(f"✅ Filled {filled} missing land_area values.")


✅ Filled 827 missing land_area values.


In [49]:
df.isnull().sum()

Unnamed: 0                0
id                        0
headline                  0
price_display            13
rent_display           3597
bedrooms               1028
bathrooms              1035
land_area              1367
thumbnail_url             0
thumbnail_urls            0
garages                3175
location                  0
address_subdivision       0
address_locality          0
address_line_2            9
address_line_1          148
category_name             0
is_parent                 0
type                      0
information              38
source_url                0
language                  0
latitude                  0
longitude                 0
dtype: int64

In [50]:
# def extract_land_size(text):
#     if pd.isna(text) or not isinstance(text, str):
#         return None
#     match = re.search(r"Land size:\s*(\d+\.?\d*m\s*x\s*\d+\.?\d*m)", text, re.IGNORECASE)
#     if match:
#         return match.group(1)
#     return None

In [51]:
# df['land_area'] = df['information'].apply(extract_land_size)


In [52]:
df.isnull().sum()

Unnamed: 0                0
id                        0
headline                  0
price_display            13
rent_display           3597
bedrooms               1028
bathrooms              1035
land_area              1367
thumbnail_url             0
thumbnail_urls            0
garages                3175
location                  0
address_subdivision       0
address_locality          0
address_line_2            9
address_line_1          148
category_name             0
is_parent                 0
type                      0
information              38
source_url                0
language                  0
latitude                  0
longitude                 0
dtype: int64

In [53]:
# df.to_csv('../../data/processed/realestates_kh_v2_1.csv', index=False)