In [10]:
import pandas as pd
import re   

In [11]:
# Load the processed real-estate CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../../data/processed/realestates_kh_v2_4.csv')

In [12]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,id,headline,price_display,bedrooms,bathrooms,land_area,address_subdivision,address_locality,address_line_2,address_line_1,category_name,is_parent,type,information,latitude,longitude
0,235560.0,Link house for sale,"$114,000",4.0,5.0,71.4,Phnom Penh,Chroy Changvar,Preaek Lieb,,Unit,False,residential,"- Address: Commune Preaek Lieb, District Chroy...",11.589674,104.925654
1,248374.0,Land for Sale,"$1,800.0/m²",,,104635.0,Phnom Penh,Russey Keo,Tuol Sangkae 1,273 273,Land/Development,False,residential,"Land Located at Tuol Kok Village, Sangkat Tuol...",11.58781,104.902234
2,248290.0,<b>Development Land For Sales (National Road 3...,$700.0/m²,,,84000.0,Phnom Penh,Por Sen Chey,Kantaok,N3 N3,Land/Development,False,residential,Land for Sale or Rent on Road N3 Main Road - H...,11.537783,104.920232
3,248489.0,?????????????????? ??? Urban Village Phase 2 ?...,"$90,000",1.0,1.0,76000.0,Phnom Penh,Meanchey,Chak Angrae Leu,,Condo,False,residential,"Condo for sale in Chak Angrae Leu, Meanchey, P...",11.537783,104.920232
4,235239.0,Land in front of Eden Garden inside Phnom Penh...,"$276,264,450",,,55811.0,Phnom Penh,Daun Penh,Srah Chak,,Land/Development,False,residential,Land in front of Eden Garden for Sales Size: 5...,11.579202,104.908613


In [13]:
# Narrow the view to the three columns that the price parsing below reads.
cols = ['price_display', 'land_area', 'information']
df[cols]

Unnamed: 0,price_display,land_area,information
0,"$114,000",71.40,"- Address: Commune Preaek Lieb, District Chroy..."
1,"$1,800.0/m²",104635.00,"Land Located at Tuol Kok Village, Sangkat Tuol..."
2,$700.0/m²,84000.00,Land for Sale or Rent on Road N3 Main Road - H...
3,"$90,000",76000.00,"Condo for sale in Chak Angrae Leu, Meanchey, P..."
4,"$276,264,450",55811.00,Land in front of Eden Garden for Sales Size: 5...
...,...,...,...
3751,"$520,000",86.25,"ShopHouse rent and sale in PHNOM PENH ,Chamka..."
3752,"$1,050,168",1406.00,Land for sale not yet located on Chamkar Dong ...
3753,"$787,410",2019.00,Warehouse Land Value Under Market Near Passis ...
3754,"$1,700,000",209.00,This commercial building is available for sale...


In [14]:
def extract_price(info, land_area=None):
    """Parse a free-form price string into one total price.

    Patterns are tried in priority order and the first one that matches wins:

    1. Range ("$120,000 - $150,000", "120,000 to 150,000"): the midpoint of
       the two bounds. If the range is itself a per-square-metre rate
       ("$700 - $900/m²") the midpoint is multiplied by ``land_area``.
    2. Per-square-metre rate ("$1,200/m²", "1200 per sqm",
       "$1,390 per square metre"): the rate multiplied by ``land_area``.
    3. Per-period / per-unit rate ("$500/month"): the quoted figure itself.
    4. Absolute amount ("$120,000", "USD 120,000"): the first number that is
       not directly followed by a "/".

    Args:
        info: Raw price text; non-string values are converted with ``str``.
        land_area: Area used to turn a per-square-metre rate into a total.

    Returns:
        The price as a float, or None when ``info`` is null, contains no
        usable number (e.g. "Negotiable"), or is a per-square-metre rate
        without a usable ``land_area``.
    """
    if pd.isnull(info):
        return None

    text = str(info)
    flags = re.IGNORECASE

    # A number must start with a digit, so stray punctuation (such as the
    # comma in "Negotiable, call") can never be captured as an empty price.
    number = r'\d[\d,]*(?:\.\d+)?'
    currency = r'\s*(?:usd|\$)?\s*'
    per_sqm = currency + r'(?:/|per)\s*(?:m²|m2|sqm|square\s*met(?:er|re)s?)'
    per_period = currency + r'/\s*(?:month|year|unit)'

    def to_float(token):
        # Thousands separators are the only non-numeric characters a token can hold.
        return float(token.replace(',', ''))

    def total_from_rate(rate):
        # A per-square-metre rate only becomes a total together with an area.
        if land_area is None or pd.isnull(land_area):
            return None
        try:
            return rate * float(land_area)
        except (TypeError, ValueError):
            return None

    # 1. Price range. The separator is a dash or the word "to" (not the
    #    character class [-to]), and the upper bound may carry its own "$".
    match = re.search(
        '(' + number + r')\s*(?:[-–—]|to)\s*\$?\s*(' + number + ')(' + per_sqm + ')?',
        text,
        flags,
    )
    if match:
        midpoint = (to_float(match.group(1)) + to_float(match.group(2))) / 2
        return total_from_rate(midpoint) if match.group(3) else midpoint

    # 2. Price per square metre ("/m²", "per sqm", "per square metre", ...).
    match = re.search('(' + number + ')' + per_sqm, text, flags)
    if match:
        return total_from_rate(to_float(match.group(1)))

    # 3. Price per month/year/unit. Checked before the absolute pattern so the
    #    figure in front of the "/" is read in full.
    match = re.search('(' + number + ')' + per_period, text, flags)
    if match:
        return to_float(match.group(1))

    # 4. Absolute price: first number that is not the numerator of some other
    #    "<number>/<unit>" rate. Iterating over whole-number matches avoids
    #    regex backtracking truncating "1,200/x" to "1,20".
    for candidate in re.finditer(number, text):
        if not re.match(currency + '/', text[candidate.end():], flags):
            return to_float(candidate.group(0))

    # No usable figure (e.g. "Negotiable", "Contact for price", "N/A").
    return None


In [15]:
def _price_text(row):
    """Return the text to parse for a row: price_display, else the description."""
    # row.get('price_display', <default>) only uses the default when the column
    # is absent, so a NaN price_display must be checked for explicitly.
    text = row.get('price_display')
    if pd.isnull(text):
        text = row.get('information', '')
    return text


df['price_clean'] = df.apply(
    lambda row: extract_price(_price_text(row), row.get('land_area')),
    axis=1,
)
df['price_clean'].describe()

count    3.755000e+03
mean     1.102737e+06
std      6.081888e+06
min      1.680000e+02
25%      1.320550e+05
50%      2.800000e+05
75%      7.500000e+05
max      2.762644e+08
Name: price_clean, dtype: float64

In [16]:
# Inspect the frame, which now carries the derived price_clean column.
df

Unnamed: 0,id,headline,price_display,bedrooms,bathrooms,land_area,address_subdivision,address_locality,address_line_2,address_line_1,category_name,is_parent,type,information,latitude,longitude,price_clean
0,235560.0,Link house for sale,"$114,000",4.0,5.0,71.40,Phnom Penh,Chroy Changvar,Preaek Lieb,,Unit,False,residential,"- Address: Commune Preaek Lieb, District Chroy...",11.589674,104.925654,114000.0
1,248374.0,Land for Sale,"$1,800.0/m²",,,104635.00,Phnom Penh,Russey Keo,Tuol Sangkae 1,273 273,Land/Development,False,residential,"Land Located at Tuol Kok Village, Sangkat Tuol...",11.587810,104.902234,188343000.0
2,248290.0,<b>Development Land For Sales (National Road 3...,$700.0/m²,,,84000.00,Phnom Penh,Por Sen Chey,Kantaok,N3 N3,Land/Development,False,residential,Land for Sale or Rent on Road N3 Main Road - H...,11.537783,104.920232,58800000.0
3,248489.0,?????????????????? ??? Urban Village Phase 2 ?...,"$90,000",1.0,1.0,76000.00,Phnom Penh,Meanchey,Chak Angrae Leu,,Condo,False,residential,"Condo for sale in Chak Angrae Leu, Meanchey, P...",11.537783,104.920232,90000.0
4,235239.0,Land in front of Eden Garden inside Phnom Penh...,"$276,264,450",,,55811.00,Phnom Penh,Daun Penh,Srah Chak,,Land/Development,False,residential,Land in front of Eden Garden for Sales Size: 5...,11.579202,104.908613,276264450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3751,129973.0,"ShopHouse rent And sale in Chamkarmon, BKK-1","$520,000",,,86.25,Phnom Penh,Chamkarmon,BKK 1,,Shophouse,False,commercial,"ShopHouse rent and sale in PHNOM PENH ,Chamka...",11.539474,104.917352,520000.0
3752,222625.0,urgent land sale,"$1,050,168",,,1406.00,Phnom Penh,Dangkao,Dangkao,???????? 20,Land/Development,False,commercial,Land for sale not yet located on Chamkar Dong ...,11.514304,104.889750,1050168.0
3753,246888.0,??????????? ????????????????? ????????????????...,"$787,410",,,2019.00,Phnom Penh,Por Sen Chey,Chaom Chau 2,,Warehouse,False,commercial,Warehouse Land Value Under Market Near Passis ...,11.698532,104.979682,787410.0
3754,252444.0,Commercial Building for rent I Borey Peng Huot...,"$1,700,000",,4.0,209.00,Phnom Penh,Chbar Ampov,Nirouth,,other,False,commercial,This commercial building is available for sale...,11.522551,104.962470,1700000.0


In [30]:
# Compare the raw price text with the parsed value side by side.
cols = ['price_display', 'price_clean', 'land_area', 'information']
df[cols]

Unnamed: 0,price_display,price_clean,land_area,information
0,"$114,000",114000.0,71.40,"- Address: Commune Preaek Lieb, District Chroy..."
1,"$1,800.0/m²",188343000.0,104635.00,"Land Located at Tuol Kok Village, Sangkat Tuol..."
2,$700.0/m²,58800000.0,84000.00,Land for Sale or Rent on Road N3 Main Road - H...
3,"$90,000",90000.0,76000.00,"Condo for sale in Chak Angrae Leu, Meanchey, P..."
4,"$276,264,450",276264450.0,55811.00,Land in front of Eden Garden for Sales Size: 5...
...,...,...,...,...
3751,"$520,000",520000.0,86.25,"ShopHouse rent and sale in PHNOM PENH ,Chamka..."
3752,"$1,050,168",1050168.0,1406.00,Land for sale not yet located on Chamkar Dong ...
3753,"$787,410",787410.0,2019.00,Warehouse Land Value Under Market Near Passis ...
3754,"$1,700,000",1700000.0,209.00,This commercial building is available for sale...


In [None]:
# df.at[10, 'price_clean'] = float(df.at[10, 'price_clean']) * float(df.at[10, 'land_area'])

In [29]:
# Rows whose parsed price is smaller than the land area.
# Presumably these are per-square-metre quotes written without a unit — TODO confirm.
df.loc[df['price_clean'] < df['land_area'], ['price_display', 'price_clean', 'land_area']]

Unnamed: 0,price_display,price_clean,land_area


In [32]:
# Rows with a parsed price under 5000, listed for manual review of suspiciously low totals.
df.loc[df['price_clean'] < 5000, ['price_display', 'price_clean', 'land_area']]

Unnamed: 0,price_display,price_clean,land_area
159,"$2,500",2500.0,2100.0
178,"$2,850",2850.0,1792.0
192,"$2,700",2700.0,1600.0
371,"$2,500",2500.0,800.0
545,"$1,300",1300.0,548.0
856,"$4,300",4300.0,336.0
1098,"$3,500",3500.0,225.0
1383,"$1,300",1300.0,168.0
1933,"$1,000",1000.0,107.1
2841,$66/m²,4488.0,68.0


In [35]:
# Show rows where price_clean < 5000 and price_display is NOT a price per sqm (e.g., not like '195/m^2', '195/m2', etc.)
# The unit alternatives use a non-capturing group (?:...): with a capturing group,
# Series.str.contains emits a "pattern ... has match groups" UserWarning.
mask = (
    (df['price_clean'] < 5000) &
    ~df['price_display'].str.contains(r'/\s*(?:m2|m²|sqm|\^2)', case=False, na=False)
)
df.loc[mask, ['price_display', 'price_clean', 'land_area']]

  ~df['price_display'].str.contains(r'/\s*(m2|m²|sqm|\^2)', case=False, na=False)


Unnamed: 0,price_display,price_clean,land_area


In [28]:
# For all rows where price_clean < land_area, set price_clean = price_clean * land_area
# Heuristic: presumably a price below the area is really a per-square-metre rate — TODO confirm
# against a sample of listings. Rows with a missing land_area compare False and are left untouched.
mask = df['price_clean'] < df['land_area']
df.loc[mask, 'price_clean'] = df.loc[mask, 'price_clean'] * df.loc[mask, 'land_area']

In [34]:
# For all rows where price_clean < 5000 and price_display is NOT a price per sqm, set price_clean = price_clean * land_area
# The unit alternatives use a non-capturing group (?:...): with a capturing group,
# Series.str.contains emits a "pattern ... has match groups" UserWarning.
# NOTE(review): rows with a missing land_area that match this mask become NaN — confirm that is intended.
mask = (
    (df['price_clean'] < 5000) &
    ~df['price_display'].str.contains(r'/\s*(?:m2|m²|sqm|\^2)', case=False, na=False)
)
df.loc[mask, 'price_clean'] = df.loc[mask, 'price_clean'] * df.loc[mask, 'land_area']

  ~df['price_display'].str.contains(r'/\s*(m2|m²|sqm|\^2)', case=False, na=False)


In [25]:
# Read the full description of the row labelled 3717 before deciding what to do with it.
df.at[3717, 'information']

'Business home for sale in the road, ensuring the market - $ 187,000 in the next 3m-sized household space. Still on the 24-meter-upper rim in National Road 3 to the 3rd city of Kram, more than 30 hectares of Bigs, nearly 30 hectares, nearly 30 hectares, nearly 30 hectares, nearly 30 hectares, is International School Market, Phsar Mall'

In [26]:
# Remove the row with index label 3717 (the one inspected in the previous cell).
# Not re-runnable: a second execution raises KeyError because the label is already gone.
df = df.drop(3717)


In [None]:
# df.to_csv('../../data/processed/realestates_kh_v2_3_5.csv', index=False, encoding='latin1')

In [37]:
# Overwrite price_display with price_clean, then drop price_clean column
# After this cell price_display holds floats instead of text, so the earlier
# `.str.contains` cells on price_display can no longer be re-run without reloading df.
df['price_display'] = df['price_clean']
df = df.drop(columns=['price_clean'])

In [1]:
# Preview the frame after price_display was replaced by the numeric price.
df.head()

NameError: name 'df' is not defined

In [38]:
# Persist the cleaned frame without the index column.
# NOTE(review): latin1 cannot represent characters outside ISO-8859-1; confirm that
# downstream readers expect this encoding rather than UTF-8.
df.to_csv('../../data/processed/realestates_kh_v2_3_6.csv', index=False, encoding='latin1')

In [39]:
# Check the column dtypes of the saved frame (price_display should now be numeric).
df.dtypes

id                     float64
headline                object
price_display          float64
bedrooms               float64
bathrooms              float64
land_area              float64
address_subdivision     object
address_locality        object
address_line_2          object
address_line_1          object
category_name           object
is_parent                 bool
type                    object
information             object
latitude               float64
longitude              float64
dtype: object