In [10]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import MinMaxScaler

In [11]:
# Load the processed listings extract used by the cells that follow.
# NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which suggests the CSV
# was written together with its index; consider index_col=0 if that column is unwanted.
LISTINGS_CSV = '../../../data/processed/100k.csv'
df = pd.read_csv(LISTINGS_CSV)

In [12]:
# Preview the loaded frame; the notebook renders only the first/last rows and columns.
df

Unnamed: 0,address_subdivision,address_locality,address_line_2,h_id,price_per_m2,land_area,price,longitude,latitude,near_Koh_Pich_in_km,...,f_road,f_secondary,f_service,f_steps,f_tertiary,f_track,f_trunk,f_trunk_link,f_unclassified,f_unused
0,Phnom Penh,Praek Pnov,Samraong,886580d009fffff,146.76,132,132,104.830384,11.717244,22,...,0,0,0,0,0,0,0,0,0,0
1,Phnom Penh,Pur SenChey,Kamboul,8865846ea7fffff,529.34,174,174,104.741205,11.547118,21,...,0,0,0,0,1,1,0,0,1,0
2,Phnom Penh,Chraoy Chongvar,Preaek Ta Sek,886580d26dfffff,435.46,150,150,104.881978,11.676204,15,...,0,0,1,0,0,0,0,0,0,0
3,Phnom Penh,Pur SenChey,Ovlaok,8865846c3bfffff,117.18,152,152,104.759005,11.579001,20,...,0,0,0,0,0,0,0,0,0,0
4,Phnom Penh,Pur SenChey,Boeng Thum,8865846521fffff,158.79,96,96,104.770540,11.482552,20,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Phnom Penh,Pur SenChey,Boeng Thum,886584652bfffff,133.23,192,192,104.769793,11.471359,20,...,0,0,0,0,0,0,0,0,0,0
99996,Phnom Penh,Mean Chey,Chak Angrae Kraom,8865846109fffff,1537.33,104,104,104.931755,11.486649,7,...,0,0,1,0,0,0,0,0,0,0
99997,Phnom Penh,Dangkao,Prey Sa,8865846189fffff,684.87,46,46,104.869213,11.492956,10,...,0,0,0,0,0,0,0,0,0,0
99998,Phnom Penh,Saensokh,Krang Thnong,8865846d49fffff,1250.68,62,62,104.831913,11.592815,12,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Load the nested commune price table. The flattening cell further down reads it as
# city -> district -> list of records carrying 'commune', 'min' and 'max'.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's locale default.
with open('commune_nested_with_min_max.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [14]:
# Flatten the nested city -> district -> communes mapping into one record per commune,
# using the same three address column names that the listings frame is merged on.
commune_data = [
    {
        'address_subdivision': city_name,
        'address_locality': district_name,
        'address_line_2': record['commune'],
        'commune_min': record['min'],
        'commune_max': record['max'],
    }
    for city_name, district_map in data.items()
    for district_name, records in district_map.items()
    for record in records
]
commune_df = pd.DataFrame(commune_data)

In [15]:
# Attach each listing's commune price range (commune_min / commune_max) with a left
# join, so listings without a matching commune are kept and get NaN bounds.
# Range columns left over from a previous run of this cell are dropped first; without
# that, re-running the cell would produce suffixed duplicates (commune_min_x / _y).
COMMUNE_KEYS = ['address_subdivision', 'address_locality', 'address_line_2']
df = pd.merge(
    df.drop(columns=['commune_min', 'commune_max'], errors='ignore'),
    commune_df,
    on=COMMUNE_KEYS,
    how='left',
)

In [16]:
# Distance-to-landmark feature columns (kilometres, per the column suffix).
distance_columns = [
    'near_Koh_Pich_in_km',
    'near_Russian_Market_in_km',
    'near_AEON_Mall_1_in_km',
    'near_AEON_Mall_2_in_km',
    'near_AEON_Mall_3_in_km',
    'near_Bassac_Lane_in_km',
    'near_Koh_Norea_in_km',
    'near_Camko_City_in_km',
    'near_Olympic_Stadium_in_km',
    'near_Phsar_Tmey_in_km',
    'near_Boeng_Keng_Kang_1_in_km',
    'near_Wat_Phnom_in_km',
    'near_Chroy_Changvar_Bridge_in_km',
    'near_Vattanac_Tower_in_km',
    'near_Royal_Palace_in_km',
    'near_Sisowath_Riverside_Park_in_km',
    'near_Phnom_Penh_Airport_in_km',
    'near_Phsar_Chas_in_km',
    'near_Phsar_kandal_in_km',
]

# Amenity count feature columns (names are kept exactly as spelled here, including
# 'resturant', because they are used verbatim as column keys).
amenity_columns = [
    'n_cafe_5km',
    'n_gas_station_5km',
    'n_hospital_5km',
    'n_hotel_5km',
    'n_mart_5km',
    'n_pre_school_5km',
    'n_secondary_school_5km',
    'n_primary_school_5km',
    'n_university_5km',
    'n_seven_eleven_5km',
    'n_resturant_5km',
    'n_super_market_5km',
    'n_borey_5km',
    'n_bank_5km',
    'n_atm_5km',
]

# Road-type indicator columns, in alphabetical order.
road_columns = [
    'f_bridleway',
    'f_corridor',
    'f_cycleway',
    'f_disused',
    'f_footway',
    'f_motorway',
    'f_path',
    'f_pedestrian',
    'f_primary',
    'f_residential',
    'f_road',
    'f_secondary',
    'f_service',
    'f_steps',
    'f_tertiary',
    'f_track',
    'f_trunk',
    'f_trunk_link',
    'f_unclassified',
    'f_unused',
]

# Weight assigned to each road type when a row's road score is averaged.
ROAD_TYPE_WEIGHTS = {
    'f_residential': 1.0,
    'f_pedestrian': 0.9,
    'f_cycleway': 0.85,
    'f_footway': 0.8,
    'f_primary': 0.7,
    'f_secondary': 0.65,
    'f_tertiary': 0.6,
    'f_service': 0.55,
    'f_trunk': 0.4,
    'f_trunk_link': 0.4,
    'f_motorway': 0.3,
    'f_unclassified': 0.5,
    'f_track': 0.2,
    'f_path': 0.25,
    'f_steps': 0.3,
    'f_disused': 0.1,
    'f_unused': 0.1,
    'f_corridor': 0.15,
    'f_bridleway': 0.2,
    'f_road': 0.5,
}

In [17]:
# Cast every road-type indicator column to bool in a single astype call.
df = df.astype({road_col: bool for road_col in road_columns})

In [18]:
# Weight of every road type, aligned with the column order of road_columns.
road_weight_vector = np.array([ROAD_TYPE_WEIGHTS[col] for col in road_columns])
road_matrix = df[road_columns].values

# Distance score: reciprocal of the row-wise mean of (1 + distance), i.e. 1 / (1 + mean).
# The extra parentheses only spell out the evaluation order the expression already had
# (the .mean() call binds tighter than the division).
# NOTE(review): if a mean of per-landmark 1 / (1 + distance) scores was intended instead,
# the reciprocal must move inside the mean - confirm with the author.
distance_score = 1 / ((1 + df[distance_columns]).mean(axis=1))

# Amenity score: tanh(count / 50) squashes each count before the row-wise average.
amenity_score = np.tanh(df[amenity_columns] / 50).mean(axis=1)

# Road score: mean weight of the road types flagged on the row. Rows with no road type
# flagged keep the default of 0.5, which also avoids a division by zero.
road_present = road_matrix.astype(bool)
valid_rows = road_present.any(axis=1)
road_score = np.full(len(df), 0.5)
road_score[valid_rows] = (
    np.dot(road_present[valid_rows], road_weight_vector)
    / road_present[valid_rows].sum(axis=1)
)

# Weighted blend of the three component scores.
df['desirability_score'] = 0.7 * distance_score + 0.2 * amenity_score + 0.1 * road_score

# Min-max normalise the score within each commune. A commune is identified by the full
# (subdivision, locality, commune) key - the same key the price-range merge uses - so
# communes that share a name across districts are not scaled together.
commune_keys = ['address_subdivision', 'address_locality', 'address_line_2']
scaler = MinMaxScaler()
df['desirability_score'] = (
    df.groupby(commune_keys)['desirability_score']
    .transform(lambda x: scaler.fit_transform(x.values.reshape(-1, 1)).flatten())
)


In [19]:
def compute_price_bands(min_vals, max_vals, scores):
    """Map desirability scores to (low, high) price bands inside a min/max range.

    Each score falls into exactly one tier: the first of ``>= 0.9``, ``>= 0.8``, ...,
    ``>= 0.1``, ``> 0`` that it satisfies, otherwise the bottom tier (this includes
    scores of 0, negative scores and NaN). Higher tiers map to bands closer to
    ``max_vals``; the bottom tier starts just above ``min_vals``.

    Tiers are resolved with first-match semantics. Assigning the tiers one after
    another with cumulative masks (``>= 0.9``, then ``>= 0.8``, ...) would let every
    later, broader mask overwrite the earlier ones and leave all rows in the bottom
    tier.

    Parameters
    ----------
    min_vals, max_vals : array-like of float
        Lower and upper price bound per row. Where the two are equal, the range is
        widened to ``0.85 * min`` .. ``1.15 * max`` so a band can still be formed.
    scores : array-like of float
        Score per row; the tier thresholds assume a 0-1 scale.

    Returns
    -------
    (low_bands, high_bands) : tuple of numpy.ndarray
        Float arrays with the band edges, clamped to the (possibly widened)
        ``min_vals`` / ``max_vals``.
    """
    # Work in float so integer inputs cannot truncate the computed band edges.
    min_vals = np.asarray(min_vals, dtype=float)
    max_vals = np.asarray(max_vals, dtype=float)
    scores = np.asarray(scores, dtype=float)

    # Widen degenerate ranges where min == max.
    equal_mask = min_vals == max_vals
    min_vals = np.where(equal_mask, min_vals * 0.85, min_vals)
    max_vals = np.where(equal_mask, max_vals * 1.15, max_vals)

    means = (min_vals + max_vals) / 2
    ranges = np.maximum(max_vals - min_vals, 1e-6)  # floor keeps the range strictly positive

    # Tier masks from best to worst; the final all-True mask is the catch-all.
    thresholds = (0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)
    conditions = [scores >= threshold for threshold in thresholds]
    conditions.append(scores > 0)
    conditions.append(np.ones(scores.shape, dtype=bool))

    # Interior band edges as offsets from the mean, in units of the range. Consecutive
    # tiers share an edge: the low edge of one tier is the high edge of the next.
    edge_offsets = (0.4, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05, -0.1, -0.2, -0.3)
    edges = [means + offset * ranges for offset in edge_offsets]
    low_boundaries = edges + [min_vals + 0.02 * ranges]
    high_boundaries = [max_vals - 0.02 * ranges] + edges

    # np.select takes, per element, the choice of the FIRST condition that holds.
    low_bands = np.select(conditions, low_boundaries)
    high_bands = np.select(conditions, high_boundaries)

    # Clamp to min/max ranges
    low_bands = np.maximum(low_bands, min_vals)
    high_bands = np.minimum(high_bands, max_vals)

    return low_bands, high_bands

In [20]:
# Build per-listing price bands from the commune price range and the desirability score.
# Missing bounds are replaced by a placeholder range of 0 .. 1.
# NOTE(review): listings with no commune range therefore get prices between 0 and 1 -
# confirm this fallback is intended and not something to filter out.
min_vals = df['commune_min'].fillna(0).values
max_vals = df['commune_max'].fillna(1).values
scores = df['desirability_score'].values

low_bands, high_bands = compute_price_bands(min_vals, max_vals, scores)

# Draw one price per listing uniformly within its band, rounded to 2 decimals.
# A dedicated seeded generator makes the draw reproducible on Restart & Run All.
# RandomState is the generator type behind np.random.uniform, so the sampling itself
# is unchanged.
PRICE_SEED = 42
price_rng = np.random.RandomState(PRICE_SEED)
price_per_m2 = price_rng.uniform(low_bands, high_bands)
df['price_per_m2'] = np.round(price_per_m2, 2)

In [None]:
# Make every price_per_m2 value unique: jitter all rows that share a price by up to
# +/- 0.5, re-round to 2 decimals, and repeat until no duplicates remain.
# The jitter is seeded for reproducibility, and the number of passes is capped so the
# cell fails loudly instead of spinning forever (e.g. repeated NaN prices stay NaN after
# jittering and can never be separated).
DEDUP_SEED = 42
MAX_DEDUP_PASSES = 1000
dedup_rng = np.random.default_rng(DEDUP_SEED)

dedup_passes = 0
duplicate_mask = df.duplicated('price_per_m2', keep=False)
while duplicate_mask.any():
    if dedup_passes >= MAX_DEDUP_PASSES:
        raise RuntimeError(
            f"price_per_m2 still has {int(duplicate_mask.sum())} duplicated rows "
            f"after {MAX_DEDUP_PASSES} jitter passes"
        )
    # Add small random perturbations to duplicates
    perturbations = dedup_rng.uniform(-0.5, 0.5, int(duplicate_mask.sum()))
    df.loc[duplicate_mask, 'price_per_m2'] += perturbations
    df['price_per_m2'] = np.round(df['price_per_m2'], 2)
    duplicate_mask = df.duplicated('price_per_m2', keep=False)
    dedup_passes += 1