# Singapore Public Housing (HDB) Resale Price Prediction Model (Part 5)
### Feature Engineering - Geospatial Features

## 1. Initialization

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from shapely import wkt
from math import sin, cos, sqrt, atan2, radians

# Show up to 99 columns when a DataFrame is displayed. The fully qualified key
# is required: the bare 'max_columns' pattern matches more than one option on
# recent pandas releases and raises OptionError.
pd.set_option('display.max_columns', 99)

In [2]:
# Resale transactions that the engineered features below are attached to.
hdb = pd.read_csv('./Dataset/Transitional/complete_data_with_ec.csv')

# Amenity tables, read in this order: MRT stations, bus stops, primary
# schools, secondary schools, shopping malls.
mrt, bus, pri, sec, spm = map(pd.read_csv, (
    './Dataset/Engineered/MRT.csv',
    './Dataset/Engineered/Bus_Stop.csv',
    './Dataset/Engineered/Primary_School.csv',
    './Dataset/Engineered/Secondary_School.csv',
    './Dataset/Engineered/Mall.csv',
))

In [3]:
def dist_cal(coor_1, coor_2):
    """Return the great-circle distance between two points in whole metres.

    Uses the haversine formula on a sphere of radius 6373 km.

    Parameters
    ----------
    coor_1, coor_2 : sequence
        ``(latitude, longitude)`` pairs in decimal degrees; only items 0 and 1
        are read.

    Returns
    -------
    int
        Distance in metres, truncated toward zero.

    Raises
    ------
    ValueError
        If a coordinate is NaN (``int()`` cannot convert the NaN result).
    """
    # Sphere radius in km. Kept at the notebook's original value so that the
    # distances produced here stay identical to earlier runs.
    R = 6373
    lat_1 = radians(coor_1[0])
    lon_1 = radians(coor_1[1])
    lat_2 = radians(coor_2[0])
    lon_2 = radians(coor_2[1])
    dlon = lon_2 - lon_1
    dlat = lat_2 - lat_1
    a = sin(dlat / 2)**2 + cos(lat_1) * cos(lat_2) * sin(dlon / 2)**2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. A plain comparison
    # is used (not min/max) so that a NaN input still propagates and fails
    # loudly below instead of silently becoming a distance.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return int(R * c * 1000)

## 2. MRT Stations

In [4]:
%%time

# For every flat: distance (m) to the closest MRT station, that station's name,
# and whether that closest station is a bus / MRT interchange within 500 m.
# NOTE: positions from enumerate() and list.index() are used as .loc labels,
# so both `hdb` and `mrt` are expected to carry the default 0..n-1 index that
# pd.read_csv produces.
for idx_1, coor_1 in enumerate(zip(hdb['latitude'], hdb['longitude'])):
    dist_list = [
        dist_cal(coor_1, coor_2)
        for coor_2 in zip(mrt['Latitude'], mrt['Longitude'])
    ]

    # Scan the list once; the original recomputed min() for every use.
    mrt_dist = min(dist_list)
    nearest = dist_list.index(mrt_dist)
    within_500m = mrt_dist <= 500

    hdb.loc[idx_1, 'mrt_dist'] = mrt_dist
    hdb.loc[idx_1, 'mrt_station'] = mrt.loc[nearest, 'Name']

    # These are scalar conditions, so use logical `and` rather than bitwise `&`.
    if mrt.loc[nearest, 'Bus_Interchange'] == 1 and within_500m:
        hdb.loc[idx_1, 'near_bus_itc'] = 1
    else:
        hdb.loc[idx_1, 'near_bus_itc'] = 0

    if mrt.loc[nearest, 'MRT_Interchange'] == 1 and within_500m:
        hdb.loc[idx_1, 'near_mrt_itc'] = 1
    else:
        hdb.loc[idx_1, 'near_mrt_itc'] = 0

CPU times: user 2min 39s, sys: 1.1 s, total: 2min 40s
Wall time: 2min 41s


## 3. Bus Stops

In [5]:
%%time
# Per flat: how many bus stops lie within 300 m, and the distance (m) to the
# closest stop.
for row, flat in enumerate(zip(hdb['latitude'], hdb['longitude'])):
    stop_gaps = [
        dist_cal(flat, stop)
        for stop in zip(bus['latitude'], bus['longitude'])
    ]
    hdb.loc[row, 'bus_u300m'] = sum(1 for gap in stop_gaps if gap <= 300)
    hdb.loc[row, 'bus_dist'] = min(stop_gaps)

CPU times: user 10min 9s, sys: 2.01 s, total: 10min 11s
Wall time: 10min 13s


## 4. Shopping Malls

In [6]:
%%time
# Per flat: number of malls within 1 km and the distance (m) to the closest mall.
for row, flat in enumerate(zip(hdb['latitude'], hdb['longitude'])):
    mall_gaps = [
        dist_cal(flat, mall)
        for mall in zip(spm['latitude'], spm['longitude'])
    ]
    within_1km = [gap for gap in mall_gaps if gap <= 1000]

    hdb.loc[row, 'mall_u1km'] = len(within_1km)
    hdb.loc[row, 'mall_dist'] = min(mall_gaps)

CPU times: user 59.5 s, sys: 198 ms, total: 59.7 s
Wall time: 59.8 s


## 5. Primary Schools

In [7]:
# Flag the rows labelled 0 through 19 as elite (.loc label slices include both
# end points). Rows outside that range are not assigned here.
# NOTE(review): presumably Primary_School.csv is ordered so that its first 20
# rows are the elite schools — confirm against how that file was built.
pri.loc[0:19, 'elite'] = 1

In [8]:
%%time
# Per-school attributes do not depend on the flat, so read them from the
# DataFrame once instead of doing a .loc scalar lookup for every
# (flat, school) pair inside the nested loop.
pri_coords = list(zip(pri['latitude'], pri['longitude']))
pri_is_elite = (pri['elite'] == 1).tolist()
# The "aff" counters count schools that are both affiliated AND elite, exactly
# as the original conditions did.
pri_is_aff_elite = ((pri['affiliation'] == 1) & (pri['elite'] == 1)).tolist()

# Per flat: number of primary schools, affiliated-and-elite schools, and elite
# schools within 1 km and within 2 km.
for idx_1, coor_1 in enumerate(zip(hdb['latitude'], hdb['longitude'])):
    under_1km = 0
    under_2km = 0
    aff_1km = 0
    aff_2km = 0
    elite_1km = 0
    elite_2km = 0

    for coor_2, is_elite, is_aff_elite in zip(pri_coords, pri_is_elite, pri_is_aff_elite):
        distance = dist_cal(coor_1, coor_2)

        if distance <= 1000:
            under_1km += 1
            if is_aff_elite:
                aff_1km += 1
            if is_elite:
                elite_1km += 1

        # The 2 km counters deliberately include the schools within 1 km.
        if distance <= 2000:
            under_2km += 1
            if is_aff_elite:
                aff_2km += 1
            if is_elite:
                elite_2km += 1

    hdb.loc[idx_1, 'pri_u1km'] = under_1km
    hdb.loc[idx_1, 'pri_u2km'] = under_2km
    hdb.loc[idx_1, 'pri_aff_u1km'] = aff_1km
    hdb.loc[idx_1, 'pri_aff_u2km'] = aff_2km
    hdb.loc[idx_1, 'pri_elite_u1km'] = elite_1km
    hdb.loc[idx_1, 'pri_elite_u2km'] = elite_2km

CPU times: user 2min 48s, sys: 589 ms, total: 2min 48s
Wall time: 2min 48s


## 6. Secondary Schools

In [224]:
# Flag the rows labelled 0 through 19 as elite (.loc label slices include both
# end points). Rows outside that range are not assigned here.
# NOTE(review): presumably Secondary_School.csv is ordered so that its first 20
# rows are the elite schools — confirm against how that file was built.
sec.loc[0:19, 'elite'] = 1

In [10]:
%%time
# Per-school attributes do not depend on the flat, so read them from the
# DataFrame once instead of doing a .loc scalar lookup for every
# (flat, school) pair inside the nested loop.
sec_coords = list(zip(sec['latitude'], sec['longitude']))
sec_is_elite = (sec['elite'] == 1).tolist()
# The "aff" counters count schools that are both affiliated AND elite, exactly
# as the original conditions did.
sec_is_aff_elite = ((sec['affiliation'] == 1) & (sec['elite'] == 1)).tolist()

# Per flat: number of secondary schools, affiliated-and-elite schools, and
# elite schools within 1 km and within 2 km.
for idx_1, coor_1 in enumerate(zip(hdb['latitude'], hdb['longitude'])):
    under_1km = 0
    under_2km = 0
    aff_1km = 0
    aff_2km = 0
    elite_1km = 0
    elite_2km = 0

    for coor_2, is_elite, is_aff_elite in zip(sec_coords, sec_is_elite, sec_is_aff_elite):
        distance = dist_cal(coor_1, coor_2)

        if distance <= 1000:
            under_1km += 1
            if is_aff_elite:
                aff_1km += 1
            if is_elite:
                elite_1km += 1

        # The 2 km counters deliberately include the schools within 1 km.
        if distance <= 2000:
            under_2km += 1
            if is_aff_elite:
                aff_2km += 1
            if is_elite:
                elite_2km += 1

    hdb.loc[idx_1, 'sec_u1km'] = under_1km
    hdb.loc[idx_1, 'sec_u2km'] = under_2km
    hdb.loc[idx_1, 'sec_aff_u1km'] = aff_1km
    hdb.loc[idx_1, 'sec_aff_u2km'] = aff_2km
    hdb.loc[idx_1, 'sec_elite_u1km'] = elite_1km
    hdb.loc[idx_1, 'sec_elite_u2km'] = elite_2km

CPU times: user 2min 40s, sys: 645 ms, total: 2min 41s
Wall time: 2min 41s


## 7. Highways

In [11]:
def get_highway_dist(row, df):
    """Return the smallest scaled distance from one flat to any geometry in *df*.

    The flat's position is read from ``row['longitude']`` and
    ``row['latitude']`` and wrapped in a shapely ``Point``. For each entry of
    ``df['geometry']`` the value of ``geometry.distance(point)`` is multiplied
    by 100_000 and truncated to an int; the minimum of those ints is returned.

    NOTE(review): if the geometries are in decimal degrees, the 100_000 factor
    is presumably a rough degrees-to-metres conversion — confirm.

    Raises ``ValueError`` when ``df['geometry']`` is empty.
    """
    flat = Point(row['longitude'], row['latitude'])
    return min(int(shape.distance(flat) * 100_000) for shape in df['geometry'])

In [12]:
# Read the highway table and parse its WKT 'geometry' column into geometry
# objects before wrapping the frame as a GeoDataFrame.
highways = gpd.GeoDataFrame(
    pd.read_csv('./Dataset/Spatial/Highways.csv').assign(
        geometry=lambda frame: frame['geometry'].apply(wkt.loads)
    ),
    geometry='geometry',
)

In [13]:
%%time
# Row-wise: smallest get_highway_dist() value between each flat and the
# highway geometries.
hdb['dist_to_highway'] = hdb.apply(
    lambda flat: get_highway_dist(flat, df=highways),
    axis=1,
)

CPU times: user 41.6 s, sys: 77.2 ms, total: 41.7 s
Wall time: 41.7 s


## 8. Highway Exits/Ramps

In [14]:
# Read the ramp table and parse its WKT 'geometry' column into geometry
# objects before wrapping the frame as a GeoDataFrame.
ramps = gpd.GeoDataFrame(
    pd.read_csv('./Dataset/Spatial/Ramps.csv').assign(
        geometry=lambda frame: frame['geometry'].apply(wkt.loads)
    ),
    geometry='geometry',
)

In [15]:
%%time
# Row-wise: smallest get_highway_dist() value between each flat and the
# ramp geometries.
hdb['dist_to_exits'] = hdb.apply(
    lambda flat: get_highway_dist(flat, df=ramps),
    axis=1,
)

CPU times: user 6min 51s, sys: 251 ms, total: 6min 51s
Wall time: 6min 51s


## 9. Export to Final Dataset

In [16]:
# Persist the flats together with every feature column added above; the
# DataFrame index is not written.
hdb.to_csv(
    path_or_buf='./Dataset/Transitional/final_data.csv',
    index=False,
)