In [6]:
import pandas as pd
import numpy as np
import re

In [7]:
# Load the previous processing stage; the file is decoded as latin1
df = pd.read_csv(
    '../../data/processed/realestates_kh_v2_3_3.csv',
    encoding='latin1',
)

In [8]:
def extract_bedrooms(info):
    """Parse a bedroom count out of a free-text listing description.

    Patterns are tried in priority order (case-insensitively) and the first
    hit wins:

    1. A range such as "1-3Bedroom" / "2-4 Bedrooms" -> the upper bound.
    2. A labelled value such as "Bedrooms: 2" / "Bedroom = 2".
    3. The abbreviation "2BR" (as in "2BR/2BA").
    4. A bare "4 bed" (as in "4 bed 5 bath").
    5. Any "<n> bed / beds / bedroom / bedrooms".
    6. The word "studio" -> 1, used only as a fallback when the text gives
       no explicit count, so "2 bedrooms ... near a studio" yields 2, not 1.

    Parameters
    ----------
    info : str or null
        Listing description. ``None`` / ``NaN`` are accepted.

    Returns
    -------
    int or None
        The bedroom count, or ``None`` when ``info`` is null or no pattern
        matches.
    """
    if pd.isnull(info):
        return None

    # Range: 1-3Bedroom, 2-4 Bedrooms -> keep the upper bound
    range_match = re.search(r'(\d+)\s*-\s*(\d+)\s*bed(?:room)?s?', info, re.IGNORECASE)
    if range_match:
        return int(range_match.group(2))

    # Single-count patterns, most specific first; each captures the count in group 1
    count_patterns = (
        r'bed(?:room)?s?\s*[:=]\s*(\d+)',  # "Bedrooms: 2" or "Bedroom : 2"
        r'(\d+)\s*BR\b',                   # "2BR/2BA" or "2BR"
        r'(\d+)\s*bed\b',                  # "4 bed 5 bath"
        r'(\d+)\s*bed(?:room)?s?',         # "3 bedrooms"
    )
    for pattern in count_patterns:
        match = re.search(pattern, info, re.IGNORECASE)
        if match:
            return int(match.group(1))

    # "Studio" counts as 1 bedroom, but only when no explicit number was found
    if re.search(r'studio', info, re.IGNORECASE):
        return 1
    return None

def extract_bathrooms(info):
    """Parse a bathroom count out of a free-text listing description.

    Patterns are tried in priority order (case-insensitively) and the first
    hit wins:

    1. A range such as "1-3Bathroom" / "2-4 Bathrooms" -> the upper bound.
    2. A labelled value such as "Bathrooms: 2" / "Bathroom = 2".
    3. The abbreviation "2BA" (as in "2BR/2BA").
    4. A bare "5 bath" (as in "4 bed 5 bath").
    5. Any "<n> bath / baths / bathroom / bathrooms".

    Parameters
    ----------
    info : str or null
        Listing description. ``None`` / ``NaN`` are accepted.

    Returns
    -------
    int or None
        The bathroom count, or ``None`` when ``info`` is null or no pattern
        matches.
    """
    if pd.isnull(info):
        return None

    # A range wins over everything else and reports its upper bound
    span = re.search(r'(\d+)\s*-\s*(\d+)\s*bath(?:room)?s?', info, re.IGNORECASE)
    if span:
        return int(span.group(2))

    # Remaining patterns all capture the count in group 1; order is significant
    single_count_patterns = (
        r'bath(?:room)?s?\s*[:=]\s*(\d+)',  # "Bathrooms: 2" or "Bathroom : 2"
        r'(\d+)\s*BA\b',                    # "2BR/2BA" or "2BA"
        r'(\d+)\s*bath\b',                  # "4 bed 5 bath"
        r'(\d+)\s*bath(?:room)?s?',         # "3 bathrooms"
    )
    for regex in single_count_patterns:
        hit = re.search(regex, info, re.IGNORECASE)
        if hit:
            return int(hit.group(1))
    return None

In [9]:
# Back-fill bedrooms/bathrooms from the free-text 'information' column,
# touching only the rows where the structured value is missing.
mask_bed = df['bedrooms'].isna()
mask_bath = df['bathrooms'].isna()
for column, mask, extractor in (
    ('bedrooms', mask_bed, extract_bedrooms),
    ('bathrooms', mask_bath, extract_bathrooms),
):
    df.loc[mask, column] = df.loc[mask, 'information'].apply(extractor)


In [10]:
# Per-column count of values that are still missing after the back-fill
df.isna().sum()

id                       0
headline                 0
price_display            1
bedrooms               668
bathrooms              691
land_area                0
address_subdivision      0
address_locality         0
address_line_2           4
address_line_1         110
category_name            0
is_parent                0
type                     0
information             18
latitude                 0
longitude                0
dtype: int64

In [11]:
# Print up to 20 descriptions for rows whose bedrooms / bathrooms are still
# null, to see which wordings the extractors do not yet recognise.
null_bed = df.loc[df['bedrooms'].isna(), 'information'].head(20)
null_bath = df.loc[df['bathrooms'].isna(), 'information'].head(20)

for heading, samples in (
    ("Sample information with null bedrooms:", null_bed),
    ("\nSample information with null bathrooms:", null_bath),
):
    print(heading)
    for i, info in enumerate(samples, start=1):
        print(f"{i}. {info}\n{'-'*60}")

Sample information with null bedrooms:
1. Land Located at Tuol Kok Village, Sangkat Tuol Sangkae 1, Khan Russey Keo, Phnom Penh
------------------------------------------------------------
2. Land for Sale or Rent on Road N3 Main Road - Hard Title - Land 8.40hectares - Price for Sale 700$/Sqm - Price for Rent 1.5$/Sqm - Kind of land : Development land - Commercial land
------------------------------------------------------------
3. Land in front of Eden Garden for Sales Size: 55811 m2 Width: 166m Depth: 335 m Price: 4xxx/m2 Ownership Type: Hard Title Location: land located in front of Eden Garden and View Park inside the Phnom Penh City Center. This land is suitable for constructing an apartment complex, hotel, shopping mall, skyscraper, office building, and other commercial endeavors. Map: https://goo.gl/maps/oDS7UbiFvqonVEsn9 #PPCC #phnompenhcitycenter #land #sale #bussinesdistrict #Edengarden #??? #?????? PCC #phnompenhpenhcitycenter #land #sale #bussinesdistrict #Edengarden #??? #?

In [12]:
# Persist the back-filled frame as the next processing stage, using the same
# latin1 encoding the input was read with.
df.to_csv(
    '../../data/processed/realestates_kh_v2_4.csv',
    index=False,
    encoding='latin1',
)