In [1]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd
from tqdm import tqdm  # Optional progress bar

In [2]:
# Load real estate data and attach a point geometry built from the
# longitude/latitude columns (WGS84, EPSG:4326).
df = pd.read_csv('../../../data/processed/100k.csv')
# points_from_xy creates every point in one vectorised call; the previous
# row-wise DataFrame.apply constructed a Series and a Point per row in Python.
# The geometry is still stored on `df` itself so later cells see the same column.
df['geometry'] = gpd.points_from_xy(df['longitude'], df['latitude'])
gdf_real_estate = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')


In [3]:
# Road layer read from a shapefile on disk.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# will not run on another machine without editing it.
roads = gpd.read_file(
    r"D:\CADT\cambodia-latest-free.shp\gis_osm_roads_free_1.shp"
)

In [4]:
# Reproject the listings and the road layer to UTM zone 48N (EPSG:32648) so
# that distances computed between them are expressed in a metric CRS.
gdf_real_estate, roads = (
    layer.to_crs(epsg=32648) for layer in (gdf_real_estate, roads)
)

In [5]:
# Road classes that each get their own 0/1 flag column (named f_<class>).
# Order matters: it determines the order of the generated flag columns.
road_types = [
    'bridleway',
    'corridor',
    'cycleway',
    'disused',
    'footway',
    'motorway',
    'path',
    'pedestrian',
    'primary',
    'residential',
    'road',
    'secondary',
    'service',
    'steps',
    'tertiary',
    'track',
    'trunk',
    'trunk_link',
    'unclassified',
    'unused',
]

In [6]:
# Grab the road layer's spatial index once so the per-row lookup function can
# reuse it instead of accessing `roads.sindex` on every call.
roads_sindex = roads.sindex

In [7]:

def find_nearby_road_types_optimized(row, roads_gdf, roads_index, distance=100, types=None):
    """Flag which road classes lie within ``distance`` of a row's point.

    Parameters
    ----------
    row : object with a ``geometry`` attribute
        The record to evaluate; ``row.geometry`` must expose ``x`` and ``y``
        and be accepted by ``roads_gdf.distance``.
    roads_gdf : GeoDataFrame
        Road features with an ``fclass`` column, in the same CRS as the point.
    roads_index : spatial index
        Index built from ``roads_gdf``; its ``intersection(bbox)`` must return
        positional indices into ``roads_gdf``.
    distance : number, default 100
        Search radius, in the units of the layers' CRS.
    types : sequence of str, optional
        Road classes to flag. Defaults to the notebook-level ``road_types``
        list, which keeps existing calls working unchanged.

    Returns
    -------
    pandas.Series
        One entry per road class, labelled ``f_<class>``, holding 1 if at
        least one road of that class is within ``distance`` and 0 otherwise.
    """
    if types is None:
        # Resolved at call time so the notebook-level list is still honoured.
        types = road_types
    flag_names = [f'f_{rt}' for rt in types]

    point = row.geometry
    # Square window around the point, used only to pre-filter candidates via
    # the spatial index. Computing it arithmetically avoids building a buffer
    # polygon per row and is never smaller than that polygon's bounds.
    x, y = point.x, point.y
    bbox = (x - distance, y - distance, x + distance, y + distance)
    candidate_idx = list(roads_index.intersection(bbox))
    if not candidate_idx:
        return pd.Series(0, index=flag_names)

    # The index match is bounding-box based, so confirm with exact distances.
    candidates = roads_gdf.iloc[candidate_idx]
    within_range = candidates[candidates.distance(point) <= distance]
    present_types = set(within_range['fclass'])

    return pd.Series({
        name: 1 if rt in present_types else 0
        for name, rt in zip(flag_names, types)
    })

In [8]:
# Compute the road flags chunk by chunk so tqdm can report progress.
chunk_size = 5000  # Adjust based on memory

# Keyword arguments forwarded by DataFrame.apply to the per-row function.
lookup_kwargs = dict(roads_gdf=roads, roads_index=roads_sindex, distance=100)

flags_chunks = []
for i in tqdm(range(0, len(gdf_real_estate), chunk_size)):
    chunk = gdf_real_estate.iloc[i:i + chunk_size]
    flags_chunks.append(
        chunk.apply(find_nearby_road_types_optimized, axis=1, **lookup_kwargs)
    )

# Stitch the per-chunk flag frames back into one frame covering every row.
flags_df = pd.concat(flags_chunks)

100%|██████████| 20/20 [02:12<00:00,  6.63s/it]


In [12]:
# Display the flag frame: one row per listing, one f_<road class> column each.
flags_df

Unnamed: 0,f_bridleway,f_corridor,f_cycleway,f_disused,f_footway,f_motorway,f_path,f_pedestrian,f_primary,f_residential,f_road,f_secondary,f_service,f_steps,f_tertiary,f_track,f_trunk,f_trunk_link,f_unclassified,f_unused
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0
2,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
99996,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
99997,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
99998,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [13]:
# Combine with original data.
# This notebook writes its result back to the CSV it loads, so on a re-run
# `df` already contains the f_* columns from the previous run. Drop those
# first; otherwise concat would produce duplicate column labels.
stale_flag_cols = [col for col in flags_df.columns if col in df.columns]
result = pd.concat(
    [
        df.drop(columns=stale_flag_cols).reset_index(drop=True),
        flags_df.reset_index(drop=True),
    ],
    axis=1,
)

# Show result
print(result.head())

  address_subdivision address_locality address_line_2             h_id  \
0          Phnom Penh       Praek Pnov       Samraong  886580d009fffff   
1          Phnom Penh      Pur SenChey        Kamboul  8865846ea7fffff   
2          Phnom Penh  Chraoy Chongvar  Preaek Ta Sek  886580d26dfffff   
3          Phnom Penh      Pur SenChey         Ovlaok  8865846c3bfffff   
4          Phnom Penh      Pur SenChey     Boeng Thum  8865846521fffff   

   price_per_m2  land_area  price   longitude   latitude  near_Koh_Pich_in_km  \
0             1        132    132  104.830384  11.717244                   22   
1             1        174    174  104.741205  11.547118                   21   
2             1        150    150  104.881978  11.676204                   15   
3             1        152    152  104.759005  11.579001                   20   
4             1         96     96  104.770540  11.482552                   20   

   ...  f_road  f_secondary  f_service  f_steps  f_tertiary  f_track

In [14]:
# Display the combined frame (original columns followed by the road flags).
result

Unnamed: 0,address_subdivision,address_locality,address_line_2,h_id,price_per_m2,land_area,price,longitude,latitude,near_Koh_Pich_in_km,...,f_road,f_secondary,f_service,f_steps,f_tertiary,f_track,f_trunk,f_trunk_link,f_unclassified,f_unused
0,Phnom Penh,Praek Pnov,Samraong,886580d009fffff,1,132,132,104.830384,11.717244,22,...,0,0,0,0,0,0,0,0,0,0
1,Phnom Penh,Pur SenChey,Kamboul,8865846ea7fffff,1,174,174,104.741205,11.547118,21,...,0,0,0,0,1,1,0,0,1,0
2,Phnom Penh,Chraoy Chongvar,Preaek Ta Sek,886580d26dfffff,1,150,150,104.881978,11.676204,15,...,0,0,1,0,0,0,0,0,0,0
3,Phnom Penh,Pur SenChey,Ovlaok,8865846c3bfffff,1,152,152,104.759005,11.579001,20,...,0,0,0,0,0,0,0,0,0,0
4,Phnom Penh,Pur SenChey,Boeng Thum,8865846521fffff,1,96,96,104.770540,11.482552,20,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,Phnom Penh,Pur SenChey,Boeng Thum,886584652bfffff,1,192,192,104.769793,11.471359,20,...,0,0,0,0,0,0,0,0,0,0
99996,Phnom Penh,Mean Chey,Chak Angrae Kraom,8865846109fffff,1,104,104,104.931755,11.486649,7,...,0,0,1,0,0,0,0,0,0,0
99997,Phnom Penh,Dangkao,Prey Sa,8865846189fffff,1,46,46,104.869213,11.492956,10,...,0,0,0,0,0,0,0,0,0,0
99998,Phnom Penh,Saensokh,Krang Thnong,8865846d49fffff,1,62,62,104.831913,11.592815,12,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# NOTE(review): this writes to the same CSV path that the load cell reads, so
# the input file is overwritten with the augmented table and a re-run starts
# from data that already contains the flag columns. Confirm this is intended.
result.to_csv('../../../data/processed/100k.csv', index=False)