In [2]:
import pandas as pd
import json
import random
from shapely import wkt, Point
import time
from collections import defaultdict
from shapely.geometry import Point, Polygon, MultiPolygon
import shapely.wkt as wkt
import numpy as np
import csv

In [3]:
import json

# Load hexagon_mapping.json
with open('hexagon_mapping.json', 'r', encoding='utf-8') as f:
    hexagon_mapping = json.load(f)

# Extract all h3 hex IDs from the nested mapping
def extract_all_h3_ids(nested_dict):
    h3_ids = set()
    for v1 in nested_dict.values():
        for v2 in v1.values():
            for v3 in v2.values():
                h3_ids.update(v3.keys())
    return h3_ids

valid_h3_ids = extract_all_h3_ids(hexagon_mapping)
print(f"Loaded {len(valid_h3_ids)} valid h3 hexagons from mapping.")

Loaded 866 valid h3 hexagons from mapping.


In [4]:
h3 = pd.read_csv('../../../data/raw/scrape/population_phnom_penh.csv')

In [5]:
h3.head()

Unnamed: 0,h3,population,geometry,index_right,Shape_Leng,Shape_Area,ADM3_EN,ADM3_PCODE,ADM3_REF,ADM3ALT1EN,ADM3ALT2EN,ADM2_EN,ADM2_PCODE,ADM1_EN,ADM1_PCODE,ADM0_EN,ADM0_PCODE,date,validOn
0,886586a6ebfffff,255.0,MULTIPOLYGON (((104.97555292017218 11.70869197...,466,0.209855,0.001835,Kaoh Dach,KH121004,,,,Chraoy Chongvar,KH1210,Phnom Penh,KH12,Cambodia,KH,2014-10-14,2018-10-04
1,886586a6ddfffff,1337.0,MULTIPOLYGON (((104.95959051419976 11.68686180...,466,0.209855,0.001835,Kaoh Dach,KH121004,,,,Chraoy Chongvar,KH1210,Phnom Penh,KH12,Cambodia,KH,2014-10-14,2018-10-04
2,886586a6dbfffff,125.0,MULTIPOLYGON (((104.94522418806201 11.68053577...,466,0.209855,0.001835,Kaoh Dach,KH121004,,,,Chraoy Chongvar,KH1210,Phnom Penh,KH12,Cambodia,KH,2014-10-14,2018-10-04
3,886586a6d9fffff,764.0,MULTIPOLYGON (((104.95426989678246 11.67958437...,466,0.209855,0.001835,Kaoh Dach,KH121004,,,,Chraoy Chongvar,KH1210,Phnom Penh,KH12,Cambodia,KH,2014-10-14,2018-10-04
4,886586a6d7fffff,323.0,MULTIPOLYGON (((104.94681957690541 11.69604271...,466,0.209855,0.001835,Kaoh Dach,KH121004,,,,Chraoy Chongvar,KH1210,Phnom Penh,KH12,Cambodia,KH,2014-10-14,2018-10-04


In [6]:
# Precompute triangulation for each polygon
def precompute_triangulation(polygon):
    """Triangulate polygon and return triangles with their area weights"""
    if polygon.geom_type == 'MultiPolygon':
        polygon = polygon.geoms[0]
    
    # Use convex hull for better performance with complex shapes
    convex_poly = polygon.convex_hull
    min_x, min_y, max_x, max_y = convex_poly.bounds
    
    # Use representative point as fallback
    rep_point = convex_poly.representative_point()
    return [(min_x, min_y, max_x, max_y), rep_point]

def generate_random_lat_lon_point_in_polygon(triangulation_data, polygon):
    """Generate random point inside the actual polygon, not just bounding box."""
    bounds, rep_point = triangulation_data
    min_x, min_y, max_x, max_y = bounds

    for _ in range(50):
        lon = random.uniform(min_x, max_x)
        lat = random.uniform(min_y, max_y)
        point = Point(lon, lat)
        if polygon.contains(point):
            return lon, lat  # Only return if inside the polygon

    # Fallback to representative point if no point found
    return rep_point.x, rep_point.y

print("Precomputing geometry optimizations...")
start_time = time.time()

# Create geometry map and triangulation cache
h3_geometry_map = h3.set_index('h3')['geometry'].to_dict()
h3_triangulation_map = {}
for h_id, geom_str in h3_geometry_map.items():
    try:
        geom = wkt.loads(str(geom_str))
        h3_triangulation_map[h_id] = precompute_triangulation(geom)
    except Exception as e:
        # Fallback to a default point if geometry fails
        h3_triangulation_map[h_id] = ((0, 0, 1, 1), Point(0.5, 0.5))
        print(f"Error processing h_id {h_id}: {e}")

# Precompute land area choices (30-120 sqm in 2 sqm increments)
area_choices = [num for num in range(30, 250) if num % 2 == 0]


Precomputing geometry optimizations...


In [7]:
import json

# Load your grouped_stats_nested_v2.json
with open('hexagon_mapping.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

flat_records = []
for subdiv, localities in data.items():
    for locality, streets in localities.items():
        for street, h_ids_data in streets.items():
            for h_id, metrics in h_ids_data.items():
                flat_records.append({
                    'address_subdivision': subdiv,
                    'address_locality': locality,
                    'address_line_2': street,
                    'h_id': h_id,
                    'metrics': metrics
                })

In [8]:
# %%
# Initialize data structures for sampling
print("Starting data generation...")
start_time = time.time()

csv_data = []
h_id_counts = defaultdict(int)  # Track usage per h_id
price_tracker = defaultdict(set)  # Track generated prices per category
n = 100000  # Target number of samples

# Create shuffled list of available records
available_records = flat_records.copy()
random.shuffle(available_records)

# Sampling loop
records_generated = 0
while records_generated < n:
    if not available_records:
        # Refresh available records
        available_records = [
            r for r in filtered_flat_records 
            if h_id_counts.get(r['h_id'], 0) < 150
        ]
        
        if not available_records:
            print("Warning: No more valid records available")
            break
            
        random.shuffle(available_records)
        print(f"Refreshed available records: {len(available_records)} options")
    
    # Get next record
    record = available_records.pop()
    h_id = record['h_id']
        # Only keep records whose h_id is in the valid hexagon set
    filtered_flat_records = [rec for rec in flat_records if rec['h_id'] in valid_h3_ids]
    print(f"Filtered to {len(filtered_flat_records)} records with valid h3 hexagons.")

    # Use this for sampling
    available_records = filtered_flat_records.copy()
    random.shuffle(available_records)
    
    # Skip if h_id has reached its limit
    if h_id_counts.get(h_id, 0) >= 150:
        continue
        
    # Get precomputed geometry
    triangulation_data = h3_triangulation_map.get(h_id)
    if not triangulation_data:
        continue
        
    # Generate coordinates
    try:
        # Get the actual polygon geometry for this h_id
        polygon_geom = wkt.loads(str(h3_geometry_map[h_id]))
        random_lon, random_lat = generate_random_lat_lon_point_in_polygon(triangulation_data, polygon_geom)
    except Exception as e:
        print(f"Coordinate generation failed for h_id {h_id}: {e}")
        continue
        
    # Create unique key for price tracking
    price_key = (h_id)
    
    random_land_area = random.choice(area_choices)
    
    # Create final record
    final_record = {
        'address_subdivision': record['address_subdivision'],
        'address_locality': record['address_locality'],
        'address_line_2': record['address_line_2'],
        'h_id': h_id,
        'price_per_m2': 1,
        'land_area': random_land_area,
        'price': 1 * random_land_area,
        'longitude': random_lon,
        'latitude': random_lat,
    }
    
    # Append to results
    csv_data.append(final_record)
    
    # Update counters
    h_id_counts[h_id] += 1
    records_generated += 1
    
    # Print progress
    if records_generated % 100 == 0:
        elapsed = time.time() - start_time
        rate = records_generated / elapsed if elapsed > 0 else 0
        print(f"Generated {records_generated}/{n} rows ({rate:.1f} rows/sec)")
        print(f"Unique h_ids: {len(h_id_counts)}, Unique prices: {sum(len(s) for s in price_tracker.values())}")

# Final output
print(f"\nData generation completed in {time.time()-start_time:.2f} seconds")
print(f"Total records: {len(csv_data)}")
print(f"Unique h_ids used: {len(h_id_counts)}")
print(f"Unique price categories: {len(price_tracker)}")

Starting data generation...
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 hexagons.
Filtered to 1470 records with valid h3 he

In [9]:
df = pd.DataFrame(csv_data)

In [10]:
df

Unnamed: 0,address_subdivision,address_locality,address_line_2,h_id,price_per_m2,land_area,price,longitude,latitude
0,Phnom Penh,Praek Pnov,Samraong,886580d009fffff,1,132,132,104.830384,11.717244
1,Phnom Penh,Pur SenChey,Kamboul,8865846ea7fffff,1,174,174,104.741205,11.547118
2,Phnom Penh,Chraoy Chongvar,Preaek Ta Sek,886580d26dfffff,1,150,150,104.881978,11.676204
3,Phnom Penh,Pur SenChey,Ovlaok,8865846c3bfffff,1,152,152,104.759005,11.579001
4,Phnom Penh,Pur SenChey,Boeng Thum,8865846521fffff,1,96,96,104.770540,11.482552
...,...,...,...,...,...,...,...,...,...
99995,Phnom Penh,Pur SenChey,Boeng Thum,886584652bfffff,1,192,192,104.769793,11.471359
99996,Phnom Penh,Mean Chey,Chak Angrae Kraom,8865846109fffff,1,104,104,104.931755,11.486649
99997,Phnom Penh,Dangkao,Prey Sa,8865846189fffff,1,46,46,104.869213,11.492956
99998,Phnom Penh,Saensokh,Krang Thnong,8865846d49fffff,1,62,62,104.831913,11.592815


In [11]:
df.isnull().sum()  

address_subdivision    0
address_locality       0
address_line_2         0
h_id                   0
price_per_m2           0
land_area              0
price                  0
longitude              0
latitude               0
dtype: int64

In [12]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Select the first 4 address_line_2 groups for plotting
# groups = df['h_id'].unique()[:len(df['h_id'].unique())]

# for addr in groups:
#     subset = df[df['h_id'] == addr]
#     plt.figure(figsize=(10, 4))
#     sns.boxplot(x=subset['price_per_m2'])
#     sns.stripplot(x=subset['price_per_m2'], color='red', alpha=0.5, label='Data Points')
#     plt.title(f'Price per m² Distribution: {addr}')
#     plt.xlabel('price_per_m2')
#     plt.legend()
#     plt.show()

In [14]:
df.to_csv('../../../data/raw/scrape/100k.csv', index=False)