In [24]:
import requests
import time
import pandas as pd
from collections import deque
from bs4 import BeautifulSoup

In [25]:
# Location names to crawl, one API query per name and property type.
# Each name is sent verbatim as `q=location:<name>` and is later matched
# against the listings' `address_subdivision` column when reconciling counts.
# NOTE(review): the spellings must match what the site uses (e.g. "Kratié",
# "Sihanoukville") — confirm against the API's `address_subdivision` values.
cambodia_provinces = [
    "Phnom Penh",          # Capital (Autonomous Municipality)
    "Banteay Meanchey",
    "Battambang",
    "Kampong Cham",
    "Kampong Chhnang",
    "Kampong Speu",
    "Kampong Thom",
    "Kampot",
    "Kandal",
    "Kep",
    "Koh Kong",
    "Kratié",
    "Mondulkiri",
    "Oddar Meanchey",
    "Pailin",
    "Preah Vihear",
    "Prey Veng",
    "Pursat",
    "Ratanakiri",
    "Siem Reap",
    "Sihanoukville",
    "Stung Treng",
    "Svay Rieng",
    "Takeo",
    "Tboung Khmum"
]


In [26]:
# ——— CONFIG ———
# PAGE_SIZE is sent as `page_size` and doubles as the "result set was capped"
# signal: a response holding exactly PAGE_SIZE samples triggers a tile split.
PAGE_SIZE   = 1000
# A capped tile is only split while BOTH its latitude and longitude spans are
# larger than MIN_STEP degrees (i.e. 0.005°, not 0.05°).
MIN_STEP    = 0.005   # don't split tiles whose span is already <= 0.005°
# Checked inside the drill-down loop only: once this many listings have been
# collected the current drill-down stops. None disables the check.
EXPECTED    = None    # set to an int to stop early, or None to exhaustively crawl
USER_AGENT  = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
               "AppleWebKit/537.36 (KHTML, like Gecko) "
               "Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.60")

# Cambodia full bounds (decimal degrees); used as the root tile of the drill-down.
TOP_LAT     = 14.704581
BOTTOM_LAT  = 9.913701
LEFT_LON    = 102.313423
RIGHT_LON   = 107.627449


# Headers sent with the map-points API requests.
headers = {"User-Agent": USER_AGENT}

In [None]:
def fetch_tile(tl_lat, tl_lon, br_lat, br_lon,
               province="Phnom Penh", property_type="residential", timeout=30):
    """Fetch the sale-listing samples the map-points API returns for one tile.

    Parameters
    ----------
    tl_lat, tl_lon : float
        Latitude / longitude of the tile's top-left corner.
    br_lat, br_lon : float
        Latitude / longitude of the tile's bottom-right corner.
    province : str, optional
        Sent as ``q=location:<province>``. Defaults to "Phnom Penh", the value
        that used to be hard-coded, so existing four-argument calls are unchanged.
    property_type : str, optional
        Sent as ``property_type``. Defaults to "residential" (previously hard-coded).
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    list
        ``points[0]["samples"]`` from the JSON response, or an empty list when
        the request fails, the status is not 200, or no points are returned.
    """
    url = (
        "https://www.realestate.com.kh/api/listing/map-points/"
        f"?active_tab=popularLocations"
        f"&order_by=relevance"
        f"&property_type={property_type}"
        f"&q=location:{province}"
        f"&search_type=sale"
        f"&bottom_right_lat={br_lat}&bottom_right_lon={br_lon}"
        f"&top_left_lat={tl_lat}&top_left_lon={tl_lon}"
        f"&show_all=true&page_size={PAGE_SIZE}"
    )
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Same best-effort contract as a non-200 status: warn and report no samples.
        print(f"⚠️ Request failed for tile {tl_lat,tl_lon,br_lat,br_lon}: {exc}")
        return []
    if resp.status_code != 200:
        print(f"⚠️ HTTP {resp.status_code} for tile {tl_lat,tl_lon,br_lat,br_lon}")
        return []
    pts = resp.json().get("points", [])
    if not pts:
        return []
    # Only the first point is read, exactly as before.
    return pts[0].get("samples", [])

In [28]:
# Crawl accumulators: per-query listing counts, and the collected listing records.
request_province_list, request_all = [], []

In [29]:
# The initial per-province request is assembled as
#     url_1 + <property type> + url_2 + <location name> + url_3
# url_3 carries a fixed, wide bounding box plus the paging parameters.
url_1 = (
    "https://www.realestate.com.kh/api/listing/map-points/"
    "?active_tab=popularLocations"
    "&order_by=relevance"
    "&property_type="
)
url_2 = "&q=location:"
url_3 = (
    "&search_type=sale"
    "&bottom_right_lat=9.039849866109236"
    "&bottom_right_lon=108.07048795635677"
    "&top_left_lat=15.785344977938763"
    "&top_left_lon=100.34079574208647"
    "&order_by=relevance"
    "&show_all=true"
    "&page_size=1000"
)




In [30]:
# Reset the accumulators and the dedupe set so that re-running this cell
# starts from a clean slate instead of appending to a previous run.
request_province_list.clear()
request_all.clear()
seen_ids = set()

PROPERTY_TYPES = ['residential', 'commercial', 'borey', 'project']
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks the crawl


def _collect(items, property_type):
    """Append not-yet-seen listings to request_all, tagging each with its property type."""
    for item in items:
        _id = item.get("id")
        if _id and _id not in seen_ids:
            seen_ids.add(_id)
            item['type'] = property_type
            request_all.append(item)


def _get_points(request_url):
    """GET a map-points URL and return its "points" list ([] on a non-200 status)."""
    response = requests.get(request_url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        print(f"⚠️ HTTP {response.status_code} for {request_url}")
        return []
    return response.json().get("points", [])


def _tile_url(province, property_type, tl_lat, tl_lon, br_lat, br_lon):
    """Build the map-points URL for one tile of the given province / property type.

    Built here rather than through fetch_tile(tl_lat, tl_lon, br_lat, br_lon),
    because that four-argument call always queries Phnom Penh residential
    listings regardless of which province / type is being drilled into.
    """
    return (
        url_1 + property_type + url_2 + province
        + "&search_type=sale"
        + f"&bottom_right_lat={br_lat}&bottom_right_lon={br_lon}"
        + f"&top_left_lat={tl_lat}&top_left_lon={tl_lon}"
        + f"&show_all=true&page_size={PAGE_SIZE}"
    )


for province in cambodia_provinces:
    for property_type in PROPERTY_TYPES:
        # Initial request over the fixed, wide bounding box carried by url_3.
        points = _get_points(url_1 + property_type + url_2 + province + url_3)
        count = points[0]['count'] if points else 0
        samples = points[0].get('samples', []) if points else []

        # One record per (province, property type); summed per province later.
        request_province_list.append({
            "Province": province,
            "Count":    count
        })

        # Keep whatever the first page returned, capped or not.
        _collect(samples, property_type)

        if count >= PAGE_SIZE:
            # Over the cap: walk the Cambodia bounding box, splitting every
            # tile that still comes back full into four quadrants.
            print(f"🔍 {province} has {count} listings → drilling down…")
            queue = deque([(TOP_LAT, LEFT_LON, BOTTOM_LAT, RIGHT_LON)])
            tiles_processed = 0

            while queue:
                tl_lat, tl_lon, br_lat, br_lon = queue.popleft()
                tiles_processed += 1

                tile_points = _get_points(
                    _tile_url(province, property_type, tl_lat, tl_lon, br_lat, br_lon)
                )
                results = tile_points[0].get("samples", []) if tile_points else []
                n = len(results)

                # progress
                print(f"[{province}][Tile {tiles_processed}] Fetched {n} | "
                    f"Queue: {len(queue)} | Collected: {len(request_all)}")

                lat_span = tl_lat - br_lat
                lon_span = br_lon - tl_lon

                if n == PAGE_SIZE and lat_span > MIN_STEP and lon_span > MIN_STEP:
                    # Still capped and large enough to split: the four children
                    # are fetched later, so this tile's own results are not needed.
                    mid_lat = (tl_lat + br_lat) / 2
                    mid_lon = (tl_lon + br_lon) / 2
                    queue.extend([
                        (tl_lat,   tl_lon,   mid_lat, mid_lon),  # NW
                        (tl_lat,   mid_lon,  mid_lat, br_lon),   # NE
                        (mid_lat,  tl_lon,   br_lat,  mid_lon),  # SW
                        (mid_lat,  mid_lon,  br_lat,  br_lon),   # SE
                    ])
                else:
                    # Not capped (or too small to split further): accept as-is.
                    _collect(results, property_type)

                # Ends only the current drill-down; the outer loops continue.
                if EXPECTED and len(request_all) >= EXPECTED:
                    break

                # Pause between tile requests.
                time.sleep(0.4)

        print(f"✅ After {province}: total unique = {len(request_all)}\n")


🔍 Phnom Penh has 3899 listings → drilling down…
[Phnom Penh][Tile 1] Fetched 1000 | Queue: 0 | Collected: 1000
[Phnom Penh][Tile 2] Fetched 2 | Queue: 3 | Collected: 1000
[Phnom Penh][Tile 3] Fetched 0 | Queue: 2 | Collected: 1001
[Phnom Penh][Tile 4] Fetched 1000 | Queue: 1 | Collected: 1001
[Phnom Penh][Tile 5] Fetched 30 | Queue: 4 | Collected: 1001
[Phnom Penh][Tile 6] Fetched 0 | Queue: 3 | Collected: 1029
[Phnom Penh][Tile 7] Fetched 1000 | Queue: 2 | Collected: 1029
[Phnom Penh][Tile 8] Fetched 0 | Queue: 5 | Collected: 1029
[Phnom Penh][Tile 9] Fetched 0 | Queue: 4 | Collected: 1029
[Phnom Penh][Tile 10] Fetched 0 | Queue: 3 | Collected: 1029
[Phnom Penh][Tile 11] Fetched 4 | Queue: 2 | Collected: 1029
[Phnom Penh][Tile 12] Fetched 0 | Queue: 1 | Collected: 1031
[Phnom Penh][Tile 13] Fetched 1000 | Queue: 0 | Collected: 1031
[Phnom Penh][Tile 14] Fetched 0 | Queue: 3 | Collected: 1031
[Phnom Penh][Tile 15] Fetched 1000 | Queue: 2 | Collected: 1031
[Phnom Penh][Tile 16] Fetched 

In [31]:
# Turn both crawl accumulators into DataFrames: the per-query counts first,
# then the collected listing records.
df_province_list, df_all = (
    pd.DataFrame(records) for records in (request_province_list, request_all)
)

In [32]:
# Safety net: drop any listing whose id appears more than once and report
# the row count before and after. Skipped when there is no 'id' column.
if 'id' in df_all.columns:
    # The right-hand side is evaluated before rebinding, so `before` is the
    # length of the frame prior to deduplication.
    before, df_all = len(df_all), df_all.drop_duplicates(subset='id')
    after = len(df_all)
    print(f"🔄 Final dedupe: {before} → {after} unique listings")

🔄 Final dedupe: 6190 → 6190 unique listings


In [33]:
# Counts reported by the API: cast to int, print the grand total, then
# collapse the per-property-type rows into one row per province.
df_province_list = df_province_list.astype({'Count': int})
print(df_province_list['Count'].sum())
df_province_list = df_province_list.groupby('Province', as_index=False)['Count'].sum()
df_province_list

6501


Unnamed: 0,Province,Count
0,Banteay Meanchey,11
1,Battambang,20
2,Kampong Cham,15
3,Kampong Chhnang,7
4,Kampong Speu,56
5,Kampong Thom,10
6,Kampot,101
7,Kandal,228
8,Kep,35
9,Koh Kong,21


In [34]:
# Preview the crawled listings (pandas abbreviates the middle rows).
df_all

Unnamed: 0,id,headline,price_display,rent_display,bedrooms,bathrooms,land_area,thumbnail_url,thumbnail_urls,garages,location,address_subdivision,address_locality,address_line_2,address_line_1,category_name,is_parent,type
0,204942,A flat (2 floors) near Hengly market and near ...,"$150,000",,6.0,4.0,,https://images.realestate.com.kh/__sized__/lis...,[https://images.realestate.com.kh/__sized__/li...,,"[104.91, 11.53]",Phnom Penh,Meanchey,Stueng Mean chey,,Flat,False,residential
1,211997,Twin Villa (Twin Villa) in Borey Highland 2005...,"$269,000",,4.0,7.0,,https://images.realestate.com.kh/__sized__/lis...,[https://images.realestate.com.kh/__sized__/li...,,"[104.89, 11.55]",Phnom Penh,Sen Sok,Khmuonh,,Twin Villa,False,residential
2,212595,house for sale,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,[https://images.realestate.com.kh/__sized__/li...,,"[104.91713589328344, 11.554793367229314]",Phnom Penh,Chamkarmon,BKK 2,117 117,House,False,residential
3,211987,House for sale in Meanchey Area,"$450,000",,8.0,6.0,0.0,https://images.realestate.com.kh/__sized__/lis...,[https://images.realestate.com.kh/__sized__/li...,3.0,"[104.88632789999997, 11.539647799999969]",Phnom Penh,Meanchey,Stueng Mean chey 3,1 ផ្លូវលូប្រាំ(82c),Flat,False,residential
4,212653,House for Sale Urgently | Extra Space and Stai...,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,[https://images.realestate.com.kh/__sized__/li...,,"[104.93, 11.55]",Phnom Penh,Meanchey,Boeung Tumpun,"ST. 45BT #4C, ST. 45BT #4C,",Flat,False,residential
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6185,246364,Land in Phnom Tamao | Selling 60% below market...,"$1,440,000",,,,80000.0,https://images.realestate.com.kh/__sized__/lis...,[https://images.realestate.com.kh/__sized__/li...,,"[104.8227380276038, 11.297112676749492]",Takeo,Bati,Kandoeng,,Land,False,commercial
6186,217364,ផ្ទះសំណាក់លក់បន្ទាន់,POA,,18.0,18.0,,https://images.realestate.com.kh/__sized__/lis...,[https://images.realestate.com.kh/__sized__/li...,1.0,"[105.94432001929226, 11.761335084427174]",Tboung Khmum,Ponhea Kraek,Kraek,72 St 72,House,False,residential
6187,231535,ដី ចំការធូរ៉េន លក់ | Durian Farm For Sale,"$100,000/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,[https://images.realestate.com.kh/__sized__/li...,,"[105.956464, 11.709971]",Tboung Khmum,Ponhea Kraek,Trapeang Phlong,,Land/Development,False,residential
6188,246457,Land For Sale,"$147,000",,,,22317.0,https://images.realestate.com.kh/__sized__/lis...,[https://images.realestate.com.kh/__sized__/li...,,"[105.680894, 12.04224]",Tboung Khmum,Tboung Khmum,Roka Po Pram,72 Pel 72C Phum,Land/Development,False,residential


In [35]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

# Base URL of the site; fetch_info appends "<listing id>/" to it to build
# the address of a listing's detail page.
url = "https://www.realestate.com.kh/"

# Add the target column only when it is missing, so that re-running this
# cell does not wipe descriptions that were already scraped.
if 'information' not in df_all.columns:
    df_all['information'] = None

# Define the scraping function
def fetch_info(index, id_value):
    """Download one listing page and extract its description text.

    Parameters
    ----------
    index
        Row key of the listing. It is only echoed back (and printed) so the
        caller can match the result to its row.
    id_value
        Listing id; the page requested is ``url + str(id_value) + '/'``.

    Returns
    -------
    tuple
        ``(index, text)``, where ``text`` is the text of the first
        ``<span class="css-zrj3zm">`` on the page, or ``None`` when the span
        is absent or the request / parsing raised.
    """
    temp_url = url + str(id_value) + '/'
    try:
        response = requests.get(temp_url, timeout=10)
        # Report HTTP failures as errors instead of parsing the error page
        # and mis-reporting it as "No span found".
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        # NOTE(review): "css-zrj3zm" looks like a generated class name and may
        # change when the site is redeployed — confirm before relying on it.
        span = soup.find("span", class_="css-zrj3zm")
        if span:
            raw_text = span.get_text(separator=" ", strip=True)
            print(f"[{index}] ✔ Success")
            return index, raw_text
        else:
            print(f"[{index}] ✘ No span found")
            return index, None
    except Exception as e:
        # Best effort: a failing listing must not abort the whole scrape.
        print(f"[{index}] ⚠ Error: {e}")
        return index, None

# Scrape the listing pages concurrently with up to 25 worker threads.
# Rows are addressed by their index LABEL for both reading and writing:
# after drop_duplicates the labels are not guaranteed to equal the row
# positions, so reading by position and writing by label could put a
# description on the wrong row.
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [
        executor.submit(fetch_info, row_label, listing_id)
        for row_label, listing_id in df_all['id'].items()
    ]

    # Results arrive in completion order; fetch_info echoes the label back.
    for future in as_completed(futures):
        row_label, result = future.result()
        df_all.at[row_label, 'information'] = result


[2] ✔ Success
[4] ✔ Success
[1] ✔ Success
[0] ✔ Success
[8] ✔ Success
[9] ✔ Success
[3] ✔ Success
[6] ✔ Success
[10] ✔ Success
[15] ✔ Success
[7] ✔ Success
[16] ✔ Success
[23] ✔ Success
[21] ✔ Success
[5] ✔ Success
[11] ✔ Success
[14] ✔ Success
[22] ✔ Success
[18] ✔ Success
[13] ✔ Success
[17] ✔ Success
[20] ✔ Success
[12] ✔ Success
[19] ✔ Success
[24] ✔ Success
[26] ✔ Success
[33] ✔ Success
[31] ✔ Success
[28] ✔ Success
[25] ✔ Success
[27] ✔ Success
[29] ✔ Success
[30] ✔ Success
[32] ✔ Success
[34] ✔ Success
[35] ✔ Success
[36] ✔ Success
[37] ✔ Success
[39] ✔ Success
[43] ✔ Success
[41] ✔ Success
[38] ✔ Success
[42] ✔ Success
[40] ✔ Success
[46] ✔ Success
[48] ✔ Success
[45] ✔ Success
[49] ✔ Success
[44] ✔ Success
[47] ✔ Success
[54] ✔ Success
[51] ✔ Success
[53] ✔ Success
[52] ✔ Success
[55] ✔ Success
[50] ✔ Success
[58] ✔ Success
[56] ✔ Success
[60] ✔ Success
[59] ✔ Success
[57] ✔ Success
[61] ✔ Success
[62] ✔ Success
[63] ✔ Success
[65] ✔ Success
[66] ✔ Success
[64] ✔ Success
[67] 

In [36]:
# Reconcile the counts reported by the API ('Count') with the number of
# listings actually collected per address_subdivision ('count').
# Left join: every province keeps its row even if nothing was collected for it.
df_final_amount = df_province_list.merge(
    df_all.groupby('address_subdivision').size().reset_index(name='count'),
    how='left',
    left_on='Province',
    right_on='address_subdivision',
)
df_final_amount

Unnamed: 0,Province,Count,address_subdivision,count
0,Banteay Meanchey,11,Banteay Meanchey,10.0
1,Battambang,20,Battambang,19.0
2,Kampong Cham,15,Kampong Cham,14.0
3,Kampong Chhnang,7,Kampong Chhnang,7.0
4,Kampong Speu,56,Kampong Speu,56.0
5,Kampong Thom,10,Kampong Thom,9.0
6,Kampot,101,Kampot,101.0
7,Kandal,228,Kandal,224.0
8,Kep,35,Kep,33.0
9,Koh Kong,21,Koh Kong,21.0


In [None]:
# Checkpoint the crawled + scraped listings so the later cells can start
# from this file instead of repeating the network requests.
df_all.to_csv('../../data/raw/realestates_kh_1.csv', index=False)

In [39]:
import pandas as pd
# Reload the checkpoint. List-valued columns such as `location` come back
# from CSV as strings and are parsed again further down.
df_all = pd.read_csv('../../data/raw/realestates_kh_1.csv')

In [40]:
# Check the reloaded frame, which now includes the scraped `information` column.
df_all

Unnamed: 0,id,headline,price_display,rent_display,bedrooms,bathrooms,land_area,thumbnail_url,thumbnail_urls,garages,location,address_subdivision,address_locality,address_line_2,address_line_1,category_name,is_parent,type,information
0,204942,A flat (2 floors) near Hengly market and near ...,"$150,000",,6.0,4.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.91, 11.53]",Phnom Penh,Meanchey,Stueng Mean chey,,Flat,False,residential,A flat (2 floors) near Hengly market and near ...
1,211997,Twin Villa (Twin Villa) in Borey Highland 2005...,"$269,000",,4.0,7.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.89, 11.55]",Phnom Penh,Sen Sok,Khmuonh,,Twin Villa,False,residential,Twin Villa (Twin Villa) in Borey Highland 2005...
2,212595,house for sale,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.91713589328344, 11.554793367229314]",Phnom Penh,Chamkarmon,BKK 2,117 117,House,False,residential,មាន3ជាន់ 2បន្ទប់ទឹក បន្ទប់គេង2 អាចដាក់ម៉ូតូបាន...
3,211987,House for sale in Meanchey Area,"$450,000",,8.0,6.0,0.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,3.0,"[104.88632789999997, 11.539647799999969]",Phnom Penh,Meanchey,Stueng Mean chey 3,1 ផ្លូវលូប្រាំ(82c),Flat,False,residential,"I have a business house, I want to sell a hous..."
4,212653,House for Sale Urgently | Extra Space and Stai...,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.93, 11.55]",Phnom Penh,Meanchey,Boeung Tumpun,"ST. 45BT #4C, ST. 45BT #4C,",Flat,False,residential,ផ្ទះល្វែងលក់បន្ទាន់ 4m * 15.5m មានជណ្ដើរកៀន ចង...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6185,246364,Land in Phnom Tamao | Selling 60% below market...,"$1,440,000",,,,80000.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.8227380276038, 11.297112676749492]",Takeo,Bati,Kandoeng,,Land,False,commercial,Land in Phnom Tamao | Selling 60% below market...
6186,217364,ផ្ទះសំណាក់លក់បន្ទាន់,POA,,18.0,18.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,1.0,"[105.94432001929226, 11.761335084427174]",Tboung Khmum,Ponhea Kraek,Kraek,72 St 72,House,False,residential,លក់ផ្ទះសំណាក់បន្ទាន់ តម្លៃសមរម្យដែលអាចចរចាបាន ...
6187,231535,ដី ចំការធូរ៉េន លក់ | Durian Farm For Sale,"$100,000/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[105.956464, 11.709971]",Tboung Khmum,Ponhea Kraek,Trapeang Phlong,,Land/Development,False,residential,ដី ចំការធូរ៉េន លក់ | Durian Farm For Sale ទីតា...
6188,246457,Land For Sale,"$147,000",,,,22317.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[105.680894, 12.04224]",Tboung Khmum,Tboung Khmum,Roka Po Pram,72 Pel 72C Phum,Land/Development,False,residential,This Land area is located the middle of thaila...


In [43]:
import langid

def _detect_language(text):
    """Return langid's language code for ``text``, or None when the text is missing."""
    if pd.isna(text):
        # Do not classify the literal string "nan"/"None" for missing text.
        return None
    return langid.classify(str(text))[0]


# One map over the column instead of a row-by-row .loc loop, so the result
# lines up with whatever index df_all has.
df_all['language'] = df_all['information'].map(_detect_language)

df_all


Unnamed: 0,id,headline,price_display,rent_display,bedrooms,bathrooms,land_area,thumbnail_url,thumbnail_urls,garages,location,address_subdivision,address_locality,address_line_2,address_line_1,category_name,is_parent,type,information,language
0,204942,A flat (2 floors) near Hengly market and near ...,"$150,000",,6.0,4.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.91, 11.53]",Phnom Penh,Meanchey,Stueng Mean chey,,Flat,False,residential,A flat (2 floors) near Hengly market and near ...,en
1,211997,Twin Villa (Twin Villa) in Borey Highland 2005...,"$269,000",,4.0,7.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.89, 11.55]",Phnom Penh,Sen Sok,Khmuonh,,Twin Villa,False,residential,Twin Villa (Twin Villa) in Borey Highland 2005...,en
2,212595,house for sale,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.91713589328344, 11.554793367229314]",Phnom Penh,Chamkarmon,BKK 2,117 117,House,False,residential,មាន3ជាន់ 2បន្ទប់ទឹក បន្ទប់គេង2 អាចដាក់ម៉ូតូបាន...,km
3,211987,House for sale in Meanchey Area,"$450,000",,8.0,6.0,0.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,3.0,"[104.88632789999997, 11.539647799999969]",Phnom Penh,Meanchey,Stueng Mean chey 3,1 ផ្លូវលូប្រាំ(82c),Flat,False,residential,"I have a business house, I want to sell a hous...",en
4,212653,House for Sale Urgently | Extra Space and Stai...,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.93, 11.55]",Phnom Penh,Meanchey,Boeung Tumpun,"ST. 45BT #4C, ST. 45BT #4C,",Flat,False,residential,ផ្ទះល្វែងលក់បន្ទាន់ 4m * 15.5m មានជណ្ដើរកៀន ចង...,km
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6185,246364,Land in Phnom Tamao | Selling 60% below market...,"$1,440,000",,,,80000.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.8227380276038, 11.297112676749492]",Takeo,Bati,Kandoeng,,Land,False,commercial,Land in Phnom Tamao | Selling 60% below market...,en
6186,217364,ផ្ទះសំណាក់លក់បន្ទាន់,POA,,18.0,18.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,1.0,"[105.94432001929226, 11.761335084427174]",Tboung Khmum,Ponhea Kraek,Kraek,72 St 72,House,False,residential,លក់ផ្ទះសំណាក់បន្ទាន់ តម្លៃសមរម្យដែលអាចចរចាបាន ...,km
6187,231535,ដី ចំការធូរ៉េន លក់ | Durian Farm For Sale,"$100,000/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[105.956464, 11.709971]",Tboung Khmum,Ponhea Kraek,Trapeang Phlong,,Land/Development,False,residential,ដី ចំការធូរ៉េន លក់ | Durian Farm For Sale ទីតា...,km
6188,246457,Land For Sale,"$147,000",,,,22317.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[105.680894, 12.04224]",Tboung Khmum,Tboung Khmum,Roka Po Pram,72 Pel 72C Phum,Land/Development,False,residential,This Land area is located the middle of thaila...,en


In [44]:
# Keep only the four languages handled downstream: anything else (including
# missing values) is treated as English, and 'zh' is renamed to 'zh-CN',
# the value later passed to the translator as the source language.
df_all['language'] = (
    df_all['language']
    .where(df_all['language'].isin(['en', 'km', 'ja', 'zh']), 'en')
    .mask(lambda codes: codes.isin(['zh']), 'zh-CN')
)


In [45]:
# Number of listings per detected language after normalisation.
df_all['language'].value_counts()

language
en       5491
km        426
zh-CN     268
ja          5
Name: count, dtype: int64

In [46]:
from deep_translator import GoogleTranslator
# Translate every non-English description to English, in place.
# - Rows without text are skipped (there is nothing to send to the service).
# - One translator is built per source language and reused, instead of
#   constructing a new object for every row.
# - Rows are addressed by index label, so this works for any index.
translators = {}
needs_translation = df_all['language'].ne('en') & df_all['information'].notna()

for row_label in df_all.index[needs_translation]:
    source_lang = df_all.at[row_label, 'language']
    if source_lang not in translators:
        translators[source_lang] = GoogleTranslator(source=source_lang, target='en')
    try:
        df_all.at[row_label, 'information'] = translators[source_lang].translate(
            df_all.at[row_label, 'information']
        )
    except Exception as exc:
        # Best effort, like the scraping step: report the row and keep its
        # original text rather than aborting the remaining translations.
        print(f"[{row_label}] ⚠ Translation failed ({source_lang}): {exc}")


In [49]:
# Inspect the result: `information` has been overwritten with the translation,
# while `language` still holds the detected source language.
df_all

Unnamed: 0,id,headline,price_display,rent_display,bedrooms,bathrooms,land_area,thumbnail_url,thumbnail_urls,garages,location,address_subdivision,address_locality,address_line_2,address_line_1,category_name,is_parent,type,information,language
0,204942,A flat (2 floors) near Hengly market and near ...,"$150,000",,6.0,4.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.91, 11.53]",Phnom Penh,Meanchey,Stueng Mean chey,,Flat,False,residential,A flat (2 floors) near Hengly market and near ...,en
1,211997,Twin Villa (Twin Villa) in Borey Highland 2005...,"$269,000",,4.0,7.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.89, 11.55]",Phnom Penh,Sen Sok,Khmuonh,,Twin Villa,False,residential,Twin Villa (Twin Villa) in Borey Highland 2005...,en
2,212595,house for sale,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.91713589328344, 11.554793367229314]",Phnom Penh,Chamkarmon,BKK 2,117 117,House,False,residential,"There are 3 floors 2 bedrooms, 2 bathrooms can...",km
3,211987,House for sale in Meanchey Area,"$450,000",,8.0,6.0,0.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,3.0,"[104.88632789999997, 11.539647799999969]",Phnom Penh,Meanchey,Stueng Mean chey 3,1 ផ្លូវលូប្រាំ(82c),Flat,False,residential,"I have a business house, I want to sell a hous...",en
4,212653,House for Sale Urgently | Extra Space and Stai...,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.93, 11.55]",Phnom Penh,Meanchey,Boeung Tumpun,"ST. 45BT #4C, ST. 45BT #4C,",Flat,False,residential,Apartment Instast 4M * 15.5m There are 45M kit...,km
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6185,246364,Land in Phnom Tamao | Selling 60% below market...,"$1,440,000",,,,80000.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[104.8227380276038, 11.297112676749492]",Takeo,Bati,Kandoeng,,Land,False,commercial,Land in Phnom Tamao | Selling 60% below market...,en
6186,217364,ផ្ទះសំណាក់លក់បន្ទាន់,POA,,18.0,18.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,1.0,"[105.94432001929226, 11.761335084427174]",Tboung Khmum,Ponhea Kraek,Kraek,72 St 72,House,False,residential,"Selling negotiable emergency lodges, which are...",km
6187,231535,ដី ចំការធូរ៉េន លក់ | Durian Farm For Sale,"$100,000/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[105.956464, 11.709971]",Tboung Khmum,Ponhea Kraek,Trapeang Phlong,,Land/Development,False,residential,Tucking Farm Sale | Durian Farm for Sale Locat...,km
6188,246457,Land For Sale,"$147,000",,,,22317.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,"[105.680894, 12.04224]",Tboung Khmum,Tboung Khmum,Roka Po Pram,72 Pel 72C Phum,Land/Development,False,residential,This Land area is located the middle of thaila...,en


In [50]:
import ast
def parse_location(val):
    """Turn a stored location value into a Python sequence.

    Parameters
    ----------
    val
        Usually the string form of a list (e.g. ``"[104.91, 11.53]"``) as read
        back from CSV. A value that is already a list or tuple is returned
        unchanged, so the function is safe on freshly crawled data and when
        the cell is run a second time.

    Returns
    -------
    The parsed literal, the list/tuple itself, or ``None`` for empty, missing
    or malformed input.
    """
    if isinstance(val, (list, tuple)):
        # Already parsed: literal_eval would reject it and the value would be lost.
        return val
    if not isinstance(val, str) or not val.strip():
        # Missing values (None / NaN) and blank strings carry no location.
        return None
    try:
        return ast.literal_eval(val)
    except Exception:
        return None  # In case of malformed strings

df_all['location'] = df_all['location'].apply(parse_location)


def _coordinate(loc, pos):
    """Return ``loc[pos]``, or NaN when the location is missing or too short."""
    if isinstance(loc, (list, tuple)) and len(loc) > pos:
        return loc[pos]
    return float('nan')


# `location` is stored as [lng, lat]: element 1 is the latitude, element 0
# the longitude. Mapping over the column keeps rows with an unparseable
# location (NaN instead of a TypeError) and works for any index.
df_all['lat'] = df_all['location'].map(lambda loc: _coordinate(loc, 1))
df_all['lng'] = df_all['location'].map(lambda loc: _coordinate(loc, 0))

In [51]:
# Check that the `lat` / `lng` columns were added.
df_all

Unnamed: 0,id,headline,price_display,rent_display,bedrooms,bathrooms,land_area,thumbnail_url,thumbnail_urls,garages,...,address_locality,address_line_2,address_line_1,category_name,is_parent,type,information,language,lat,lng
0,204942,A flat (2 floors) near Hengly market and near ...,"$150,000",,6.0,4.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Meanchey,Stueng Mean chey,,Flat,False,residential,A flat (2 floors) near Hengly market and near ...,en,11.530000,104.910000
1,211997,Twin Villa (Twin Villa) in Borey Highland 2005...,"$269,000",,4.0,7.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Sen Sok,Khmuonh,,Twin Villa,False,residential,Twin Villa (Twin Villa) in Borey Highland 2005...,en,11.550000,104.890000
2,212595,house for sale,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Chamkarmon,BKK 2,117 117,House,False,residential,"There are 3 floors 2 bedrooms, 2 bathrooms can...",km,11.554793,104.917136
3,211987,House for sale in Meanchey Area,"$450,000",,8.0,6.0,0.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,3.0,...,Meanchey,Stueng Mean chey 3,1 ផ្លូវលូប្រាំ(82c),Flat,False,residential,"I have a business house, I want to sell a hous...",en,11.539648,104.886328
4,212653,House for Sale Urgently | Extra Space and Stai...,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Meanchey,Boeung Tumpun,"ST. 45BT #4C, ST. 45BT #4C,",Flat,False,residential,Apartment Instast 4M * 15.5m There are 45M kit...,km,11.550000,104.930000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6185,246364,Land in Phnom Tamao | Selling 60% below market...,"$1,440,000",,,,80000.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Bati,Kandoeng,,Land,False,commercial,Land in Phnom Tamao | Selling 60% below market...,en,11.297113,104.822738
6186,217364,ផ្ទះសំណាក់លក់បន្ទាន់,POA,,18.0,18.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,1.0,...,Ponhea Kraek,Kraek,72 St 72,House,False,residential,"Selling negotiable emergency lodges, which are...",km,11.761335,105.944320
6187,231535,ដី ចំការធូរ៉េន លក់ | Durian Farm For Sale,"$100,000/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Ponhea Kraek,Trapeang Phlong,,Land/Development,False,residential,Tucking Farm Sale | Durian Farm for Sale Locat...,km,11.709971,105.956464
6188,246457,Land For Sale,"$147,000",,,,22317.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Tboung Khmum,Roka Po Pram,72 Pel 72C Phum,Land/Development,False,residential,This Land area is located the middle of thaila...,en,12.042240,105.680894


In [52]:
# `language` was only needed to drive the translation step. errors='ignore'
# keeps the cell re-runnable: a second run no longer raises KeyError because
# the column is already gone.
df_all = df_all.drop(columns=['language'], errors='ignore')

In [53]:
# Confirm the `language` column is no longer present.
df_all

Unnamed: 0,id,headline,price_display,rent_display,bedrooms,bathrooms,land_area,thumbnail_url,thumbnail_urls,garages,...,address_subdivision,address_locality,address_line_2,address_line_1,category_name,is_parent,type,information,lat,lng
0,204942,A flat (2 floors) near Hengly market and near ...,"$150,000",,6.0,4.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Phnom Penh,Meanchey,Stueng Mean chey,,Flat,False,residential,A flat (2 floors) near Hengly market and near ...,11.530000,104.910000
1,211997,Twin Villa (Twin Villa) in Borey Highland 2005...,"$269,000",,4.0,7.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Phnom Penh,Sen Sok,Khmuonh,,Twin Villa,False,residential,Twin Villa (Twin Villa) in Borey Highland 2005...,11.550000,104.890000
2,212595,house for sale,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Phnom Penh,Chamkarmon,BKK 2,117 117,House,False,residential,"There are 3 floors 2 bedrooms, 2 bathrooms can...",11.554793,104.917136
3,211987,House for sale in Meanchey Area,"$450,000",,8.0,6.0,0.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,3.0,...,Phnom Penh,Meanchey,Stueng Mean chey 3,1 ផ្លូវលូប្រាំ(82c),Flat,False,residential,"I have a business house, I want to sell a hous...",11.539648,104.886328
4,212653,House for Sale Urgently | Extra Space and Stai...,"$85,000",,2.0,2.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Phnom Penh,Meanchey,Boeung Tumpun,"ST. 45BT #4C, ST. 45BT #4C,",Flat,False,residential,Apartment Instast 4M * 15.5m There are 45M kit...,11.550000,104.930000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6185,246364,Land in Phnom Tamao | Selling 60% below market...,"$1,440,000",,,,80000.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Takeo,Bati,Kandoeng,,Land,False,commercial,Land in Phnom Tamao | Selling 60% below market...,11.297113,104.822738
6186,217364,ផ្ទះសំណាក់លក់បន្ទាន់,POA,,18.0,18.0,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,1.0,...,Tboung Khmum,Ponhea Kraek,Kraek,72 St 72,House,False,residential,"Selling negotiable emergency lodges, which are...",11.761335,105.944320
6187,231535,ដី ចំការធូរ៉េន លក់ | Durian Farm For Sale,"$100,000/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Tboung Khmum,Ponhea Kraek,Trapeang Phlong,,Land/Development,False,residential,Tucking Farm Sale | Durian Farm for Sale Locat...,11.709971,105.956464
6188,246457,Land For Sale,"$147,000",,,,22317.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Tboung Khmum,Tboung Khmum,Roka Po Pram,72 Pel 72C Phum,Land/Development,False,residential,This Land area is located the middle of thaila...,12.042240,105.680894


In [58]:
# Alternative check, kept for reference: rows whose price_display does NOT
# contain a "$" followed by a digit.
# df_all[~df_all['price_display'].fillna('').astype(str).str.contains(r'\$\d', regex=True)]
# Rows whose price_display contains a "/" (e.g. "$850/m²"); missing prices are
# treated as empty strings and therefore never match.
df_all[df_all['price_display'].fillna('').astype(str).str.contains('/', regex=False)]




Unnamed: 0,id,headline,price_display,rent_display,bedrooms,bathrooms,land_area,thumbnail_url,thumbnail_urls,garages,...,address_subdivision,address_locality,address_line_2,address_line_1,category_name,is_parent,type,information,lat,lng
24,232502,Land For Sale,$850/m²,,,,3234.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Phnom Penh,Sen Sok,Phnom Penh Thmey,,Land/Development,False,residential,Land For Urgent Sale It's standing on good loc...,11.569135,104.924800
57,186047,Luxury Condominium for Sale Now in Russey Keo ...,"$2,500/m²",,1.0,1.0,80.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,1.0,...,Phnom Penh,Russey Keo,Tuol Sangkae 1,,Condo,False,residential,Luxury Condominium for Sale Now in Russey Keo...,11.550000,104.930000
114,186060,Luxury Condominium for Sale at Riverside Phnom...,"$1,900/m²",,2.0,2.0,95.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,1.0,...,Phnom Penh,Daun Penh,Phsar Chas,,Condo,False,residential,Luxury Condominium for Sale at Riverside Phnom...,11.575610,104.920250
129,95244,Condo complex a beacon of luxury in Toul Kork,"$1,390/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,85.0,...,Phnom Penh,Toul Kork,Boeung Kak 1,337 Street,project,True,residential,"De Castle, the most renowned developer in Camb...",11.583610,104.893585
240,234719,Residence land for sale in good located at San...,"$6,500/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Phnom Penh,Chamkarmon,BKK 1,,Land/Development,False,residential,"• Land size: 16.5m x 28.97m • Price: $6,500/Sq...",11.544500,104.913586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6179,209172,Land For Sale Urgently,"$7,300/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Takeo,Bati,Chambak,2 3 Road,Unit,False,residential,"- Location: Boeung Leach village, Chambok comm...",11.203607,104.819798
6181,231472,Land for sale,$18/m²,,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Takeo,Tram Kak,Ou Saray,,Land/Development,False,residential,Land for sale ☆ Land size: 28 hectares ☆ Plot ...,11.095382,104.430141
6182,232292,1m2 3700 $ Ang Ta Som to Takeo Province,"$3,700/m²",,,,,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Takeo,Tram Kak,Leay Bour,22 អង្គតាសោមមកខេត្តតាកែវ,Land/Development,False,residential,✅ The land is located on Street 22 from Ang Ta...,11.006976,104.692919
6183,241358,Land on Hot Sales | 30+Km from Phnom Penh | Wa...,$33.0/m²,,,,4006.0,https://images.realestate.com.kh/__sized__/lis...,['https://images.realestate.com.kh/__sized__/l...,,...,Takeo,Bati,Trapeang Sab,,Land/Development,False,residential,The land is typically suitable for a warehouse...,11.266646,104.801068


In [55]:
import langid

# Sanity check of langid on a short Khmer string.
text = "សួស្តី"
lang, confidence = langid.classify(text)

print(lang)        # Output: 'km'
# The second value is NOT a 0–1 probability: this run printed about -11.10.
# NOTE(review): presumably an unnormalised log-probability score — confirm in
# the langid documentation before using it as a threshold.
print(confidence)  # Output: e.g., -11.10


km
-11.102603912353516


In [56]:
# Persist the processed listings (translated `information`, `lat` / `lng`
# columns added, `language` column dropped) as version 2 of the raw dataset.
df_all.to_csv('../../data/raw/realestates_kh_v2.csv', index=False)