In [219]:
# import essential data handling libraries
import pandas as pd
import numpy as np
import json
import requests
from geopy.distance import geodesic
import time
from datetime import datetime

In [199]:
def get_hdb_resale_data():
    # importing our data set from data.gov.sg
    query_string='https://data.gov.sg/api/action/datastore_search?resource_id=42ff9cfe-abe5-4b54-beda-c88f9bb438ee&limit=1000000'
    resp = requests.get(query_string)

    #Convert JSON into Python Object 
    data = json.loads(resp.content)

    # Checking the length of dataframeenrich_geo_coordinate
    len(data['result']['records']) 

    # Store our dictionart records into hdb_price_dict_records
    hdb_price_dict_records = data['result']['records']

    # Next we need to feed our JSON data into dataframe. 
    # We will access the 'records' key:value pairs of the python dictionary. 
    # We will then map the list into a dataframe.
    town = []
    flat_type = []
    flat_model = []
    floor_area_sqm = []
    street_name = []
    resale_price = []
    month = []
    remaining_lease = []
    lease_commence_date = []
    storey_range = []
    _id = []
    block = []

    for i in range(0, len(hdb_price_dict_records)):
        town.append(hdb_price_dict_records[i]['town'])
        flat_type.append(hdb_price_dict_records[i]['flat_type'])
        flat_model.append(hdb_price_dict_records[i]['flat_model'])
        floor_area_sqm.append(hdb_price_dict_records[i]['floor_area_sqm'])
        street_name.append(hdb_price_dict_records[i]['street_name'])
        resale_price.append(hdb_price_dict_records[i]['resale_price'])
        month.append(hdb_price_dict_records[i]['month'])
        remaining_lease.append(hdb_price_dict_records[i]['remaining_lease'])
        lease_commence_date.append(hdb_price_dict_records[i]['lease_commence_date'])
        storey_range.append(hdb_price_dict_records[i]['storey_range'])
        _id.append(hdb_price_dict_records[i]['_id'])
        block.append(hdb_price_dict_records[i]['block'])
        
    df = pd.DataFrame({
        'town': town,
        'flat_type': flat_type,
        'flat_model': flat_model,
        'floor_area_sqm': floor_area_sqm,
        'street_name': street_name,
        'resale_price': resale_price,
        'month': month,
        'remaining_lease': remaining_lease,
        'lease_commence_date': lease_commence_date,
        'storey_range': storey_range,
        '_id': _id,
        'block': block
    })

    df['address'] = df['block'] + ' ' + df['street_name']

    return df

In [200]:
df_hdb_price = get_hdb_resale_data()

In [201]:
# Let's examine our dataframe
df_hdb_price.head()

Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,remaining_lease,lease_commence_date,storey_range,_id,block,address
0,ANG MO KIO,2 ROOM,Improved,44,ANG MO KIO AVE 10,232000,2017-01,61 years 04 months,1979,10 TO 12,1,406,406 ANG MO KIO AVE 10
1,ANG MO KIO,3 ROOM,New Generation,67,ANG MO KIO AVE 4,250000,2017-01,60 years 07 months,1978,01 TO 03,2,108,108 ANG MO KIO AVE 4
2,ANG MO KIO,3 ROOM,New Generation,67,ANG MO KIO AVE 5,262000,2017-01,62 years 05 months,1980,01 TO 03,3,602,602 ANG MO KIO AVE 5
3,ANG MO KIO,3 ROOM,New Generation,68,ANG MO KIO AVE 10,265000,2017-01,62 years 01 month,1980,04 TO 06,4,465,465 ANG MO KIO AVE 10
4,ANG MO KIO,3 ROOM,New Generation,67,ANG MO KIO AVE 5,265000,2017-01,62 years 05 months,1980,01 TO 03,5,601,601 ANG MO KIO AVE 5


In [202]:
df_hdb_price.shape

(70104, 13)

In [213]:
# Create a function that creates data frames of key locations
def get_geo_coordinates(df):
    list_of_location = df.iloc[:,1].tolist()
    # Obtaining Mall Coordinates in Singapore
    address_name = []
    address_roadname = []
    address_lat = []
    address_long = []

    for i in range(0, len(list_of_location)):
        query_address = list_of_location[i]
        query_string = 'https://developers.onemap.sg/commonapi/search?searchVal='+str(query_address)+'&returnGeom=Y&getAddrDetails=Y'
        resp = requests.get(query_string)
        data = json.loads(resp.content)
        
        if data['found'] != 0:
            address_name.append(query_address)
            address_roadname.append(data["results"][0]["ROAD_NAME"])
            address_lat.append(data["results"][0]["LATITUDE"])
            address_long.append(data["results"][0]["LONGITUDE"])

            print (str(query_address)+ " ,Lat: " +data['results'][0]['LATITUDE'] + " Long: " + data['results'][0]['LONGITUDE'])

        else:
            print ("No Results")
            
    # Store this information in a dataframe
    return pd.DataFrame({
                'address': address_name,
                'roadName': address_roadname,
                'latitude': address_lat,
                'longitude': address_long
            })

In [215]:
# import csv of locations of interests
df_shopping_malls_list = pd.read_csv("./shopping_malls.csv")
df_mrt_stations_list = pd.read_csv("./data_sets/mrt_stations.csv")
df_govt_funded_schools_list = pd.read_csv("./data_sets/govt_funded_schools.csv")
df_govt_aided_schools_list = pd.read_csv("./data_sets/govt_aided_schools.csv")
df_sap_schools_list = pd.read_csv("./data_sets/sap_schools.csv")

df_mall = get_geo_coordinates(df_shopping_malls_list)
df_mrt = get_geo_coordinates(df_mrt_stations_list)
df_govt_funded_schools = get_geo_coordinates(df_govt_funded_schools_list)
df_govt_aided_schools = get_geo_coordinates(df_govt_aided_schools_list)
df_sap_schools = get_geo_coordinates(df_sap_schools_list)

 Shopping Centre ,Lat: 1.287617926 Long: 103.8033911
Rochester Mall ,Lat: 1.305317927 Long: 103.7885239
Taman Jurong Shopping Centre ,Lat: 1.334844875 Long: 103.720462
The Clementi Mall ,Lat: 1.315496754 Long: 103.76457020000001
The Star Vista ,Lat: 1.30697044 Long: 103.7884203
Tiong Bahru Plaza ,Lat: 1.2864712 Long: 103.8271583
West Coast Plaza ,Lat: 1.303742583 Long: 103.7660929
No Results
Jurong East MRT Station ,Lat: 1.33400861 Long: 103.74173509999999
Bukit Batok MRT Station ,Lat: 1.349360664 Long: 103.7499662
Bukit Gombak MRT Station ,Lat: 1.35869439 Long: 103.752085
Choa Chu Kang MRT Station ,Lat: 1.3849696919999999 Long: 103.74459190000002
Yew Tee MRT Station ,Lat: 1.3975350180000001 Long: 103.7474051
Kranji MRT Station ,Lat: 1.4251259490000001 Long: 103.7617547
Marsiling MRT Station ,Lat: 1.432807925 Long: 103.77420670000001
Woodlands MRT Station ,Lat: 1.437094408 Long: 103.78722490000001
Admiralty MRT Station ,Lat: 1.4401624819999999 Long: 103.8005272
Sembawang MRT Station ,L

In [217]:
def enrich_lat_lg_unique(df):
    # Create address column
    if 'address' not in df.columns:
        df['address'] = df['block'] + " " + df['street_name']
    
    # Dedup Address List
    df_dedup = df.drop_duplicates(subset='address', keep='first')
    df_dedup['address'] = df_dedup['address'].str.replace("ST. GEORGE'S", "SAINT GEORGE'S")

    # Next let's grab the unique addresses and create a list 
    address_list = df_dedup['address'].tolist()

    no_items = len(address_list)
    latitude = []
    longitude = []
    blk_no = []
    road_name = []
    postal_code = []
    address = []
    count = 0
    print(f"{no_items} rows of addresses to enrich")
    for row in range(len(address_list)):
        #formulate query string  
        query_address = address_list[row]
        query_string='https://developers.onemap.sg/commonapi/search?searchVal='+str(query_address)+'&returnGeom=Y&getAddrDetails=Y'
        resp = requests.get(query_string)

        #Convert JSON into Python Object 
        data_geo_location=json.loads(resp.content)

        if data_geo_location['found'] != 0:
            latitude.append(data_geo_location['results'][0]['LATITUDE'])
            longitude.append(data_geo_location['results'][0]['LONGITUDE'])
            blk_no.append(data_geo_location['results'][0]['BLK_NO'])
            road_name.append(data_geo_location['results'][0]['ROAD_NAME'])
            postal_code.append(data_geo_location['results'][0]['POSTAL'])
            address.append(query_address)
            print (str(round((row / no_items)*100, 2)) + "% " + "Completed: " + str(query_address) + " ,Lat: " + data_geo_location['results'][0]['LATITUDE'] + " Long: " + data_geo_location['results'][0]['LONGITUDE'])
        else:
            latitude.append(None)
            longitude.append(None)
            blk_no.append(None)
            road_name.append(None)
            postal_code.append(None)
            address.append(query_address)
            print (str(round((row / no_items)*100, 2)) + "%" + ": " + str(query_address) +  "No Results")

    # Fit into a dataframe
    return pd.DataFrame({
                'latitude': latitude,
                'longitude': longitude,
                'blk_no': blk_no,
                'road_name': road_name,
                'postal_code': postal_code,
                'address': address
            })

In [12]:
start_time = time.time()
df_enriched = enrich_lat_lg_unique(df_hdb_price)
print("--- %s seconds ---" % (time.time() - start_time))

 1.3448018290000001 Long: 103.9598616
97.04% Completed: 496G TAMPINES AVE 9 ,Lat: 1.360810818 Long: 103.9505822
97.05% Completed: 894A WOODLANDS DR 50 ,Lat: 1.436011336 Long: 103.79036690000001
97.06% Completed: 317C YISHUN AVE 9 ,Lat: 1.433780483 Long: 103.84310570000001
97.07% Completed: 431A YISHUN AVE 1 ,Lat: 1.419606825 Long: 103.8472882
97.08% Completed: 318A YISHUN AVE 9 ,Lat: 1.434074497 Long: 103.84357359999998
97.1% Completed: 317A YISHUN AVE 9 ,Lat: 1.434505929 Long: 103.843292
97.11% Completed: 318B YISHUN AVE 9 ,Lat: 1.4336783359999998 Long: 103.8438713
97.12% Completed: 632 ANG MO KIO AVE 4 ,Lat: 1.3794906480000002 Long: 103.84148259999999
97.13% Completed: 252 BISHAN ST 22 ,Lat: 1.3620398809999998 Long: 103.84375759999999
97.14% Completed: 513 BT BATOK ST 52 ,Lat: 1.354054852 Long: 103.7523202
97.15% Completed: 52 TELOK BLANGAH DR ,Lat: 1.273849941 Long: 103.81124709999999
97.16% Completed: 103 GANGSA RD ,Lat: 1.378670708 Long: 103.76853840000001
97.18% Completed: 547C S

In [16]:
df_enriched.shape

(8712, 6)

In [210]:
def export_missing_values(df):
    if df['latitude'].isna().sum() != 0:
        df[df['latitude'].isna()].to_csv("./data_sets/missing_lat_lng"+datetime.today().strftime("%Y-%m-%d")+".csv")
        df_missing_lat_lng.dropna().to_csv("./data_sets/unique_address_lat_lng_with_missing"+datetime.today().strftime("%Y-%m-%d")+".csv")
    else:
        df.to_csv("./data_sets/unique_address_lat_lng_no_missing"+datetime.today().strftime("%Y-%m-%d")+".csv")


In [211]:
export_missing_values(df_enriched)

In [46]:
df_enriched.dropna(inplace=True)
df_enriched.isna().sum()

latitude       0
longitude      0
blk_no         0
road_name      0
postal_code    0
address        0
dtype: int64

In [216]:
def get_lat_lng(df):
    list_of_lat = df['latitude']
    list_of_lng = df['longitude']
    list_of_lat_lng = []
    for lat, lng in zip(list_of_lat, list_of_lng):
        list_of_lat_lng.append((lat,lng))
    return list_of_lat_lng

In [102]:
hdb_lat_lng = get_lat_lng(df_enriched)
mall_lat_lng = get_lat_lng(df_mall)
mrt_lat_lng = get_lat_lng(df_mrt)
govt_aided_lat_lng = get_lat_lng(df_govt_aided_schools)
govt_funded_lat_lng = get_lat_lng(df_govt_funded_schools)
sap_lat_lng = get_lat_lng(df_sap_schools)

In [122]:
def get_nearest_dist_m(lst_origin, lst_destination):
    list_of_distances = []
    shortest_distance = []

    for origin in lst_origin:
        for destination in range(0, len(lst_destination)):
          list_of_distances.append(geodesic(origin, lst_destination[destination]).meters)
        shortest = min(list_of_distances)
        print(f'Shortest distance is {shortest} meters')
        shortest_distance.append(shortest)
        list_of_distances.clear()
    return shortest_distance

In [218]:
start_time = time.time()
nearest_mrt_dist = get_nearest_dist_m(hdb_lat_lng, mrt_lat_lng)
nearest_mall_dist = get_nearest_dist_m(hdb_lat_lng, mall_lat_lng)
nearest_govt_aided_dist = get_nearest_dist_m(hdb_lat_lng, govt_aided_lat_lng)
nearest_govt_funded_dist = get_nearest_dist_m(hdb_lat_lng, govt_funded_lat_lng)
nearest_sap_dist = get_nearest_dist_m(hdb_lat_lng, sap_lat_lng)
print("--- %s seconds ---" % (time.time() - start_time))

tance is 5571.208246159869 meters
Shortest distance is 5680.206393926434 meters
Shortest distance is 5591.134022292626 meters
Shortest distance is 1165.6231476956477 meters
Shortest distance is 1642.121602054221 meters
Shortest distance is 1519.8417258643885 meters
Shortest distance is 1493.092967404076 meters
Shortest distance is 3029.743653513132 meters
Shortest distance is 3550.759736483349 meters
Shortest distance is 5337.645064279763 meters
Shortest distance is 8573.882742825568 meters
Shortest distance is 4947.027056866855 meters
Shortest distance is 4856.223228714519 meters
Shortest distance is 8610.86377807395 meters
Shortest distance is 8469.5058301367 meters
Shortest distance is 2153.113154320339 meters
Shortest distance is 3173.3392080457083 meters
Shortest distance is 3043.6072163953995 meters
Shortest distance is 3146.8050366234647 meters
Shortest distance is 3131.206125784167 meters
Shortest distance is 3245.6620383888735 meters
Shortest distance is 3088.774558455483 mete

In [128]:
def get_cbd_dist_m(lst_origin):
    cbd_dist = []
    for origin in lst_origin:
        cbd_dist.append(geodesic(origin,(1.2830, 103.8513)).meters)


In [129]:
cbd_dist = get_cbd_dist_m(hdb_lat_lng)

In [130]:
# Creating new columns in dataframe
df_enriched['nearest_mrt_dist'] = nearest_mrt_dist
df_enriched['nearest_mall_dist'] = nearest_mall_dist
df_enriched['nearest_govt_aided_dist'] = nearest_govt_aided_dist
df_enriched['nearest_funded_dist'] = nearest_govt_funded_dist
df_enriched['nearest_sap_dist'] = nearest_sap_dist

In [151]:
# Re-merging dataframes
combined = df_enriched.merge(df_hdb_price, on="address", how="outer")

In [152]:
combined.shape

(70105, 23)

In [165]:
combined.dropna(inplace=True)

In [168]:
combined.drop(['blk_no', 'road_name'], axis=1, inplace=True)

In [222]:
combined.to_csv("./data_sets/combined_"+datetime.today().strftime("%Y-%m-%d")+".csv")