# Geocoding

- Fetch latitude and longitude for flats and points of interest, based on their physical addresses

In [None]:
import pandas as pd
import numpy as np
import time
import datetime
import geopy.geocoders
from geopy.geocoders import Nominatim, GoogleV3, Bing
from geopy.exc import GeocoderTimedOut
from geopy.exc import GeocoderServiceError
from geopy.exc import GeopyError
import requests

### Geocoding Services

- I explored 4 geocoding services - Google, Bing, Nominatim and OneMap
- The first 2 are chargeable services that come with a certain number of free requests
- Nominatim is totally free but does not work all the time
- OneMap is free and reliable but it is restricted to Singapore, which is enough for this project
- For the chargeable services I have removed my API keys, so the code will NOT work without a key

In [None]:
# Using Google Geocoding API to get geocodes
def get_pos_google(location_name):
    """Look up coordinates for *location_name* with geopy's GoogleV3 geocoder.

    ' Singapore' is appended to the query before it is sent. The API key in
    the code is a placeholder and has to be replaced with a real key.

    Returns:
        ``(latitude, longitude)`` of the geocoder result, or ``(0, 0)`` when
        the geocoder returns nothing.
    """
    query = location_name + ' Singapore'
    match = GoogleV3(api_key='###Google API Key####').geocode(query, timeout=10)

    if match:
        return (match.latitude, match.longitude)

    # no result: same (0, 0) sentinel as the other get_pos_* helpers
    return 0, 0


# Using Bing Geocoding API to get geocodes
def get_pos_bing(location_name):
    """Look up coordinates for *location_name* with geopy's Bing geocoder.

    ' Singapore' is appended to the query before it is sent. The API key in
    the code is a placeholder and has to be replaced with a real key.

    Returns:
        ``(latitude, longitude)`` of the geocoder result, or ``(0, 0)`` when
        the geocoder returns nothing.
    """
    # Fixed: the suffix used to be misspelled ' Singapre', unlike the
    # Google and Nominatim helpers which append ' Singapore'.
    location_name = location_name + ' Singapore'
    geolocator = Bing('****Bing API******')
    loc = geolocator.geocode(location_name, timeout=10)

    if not loc:
        return 0, 0

    return (loc.latitude, loc.longitude)


# Using Nominatim to get geocodes is FREE
def get_pos_nomination(str_location):
    """Look up coordinates for *str_location* with geopy's Nominatim geocoder.

    " Singapore" is appended to the query before it is sent.

    Returns:
        ``(latitude, longitude)`` of the geocoder result, or ``(0, 0)`` when
        the geocoder returns nothing.
    """
    client = Nominatim(user_agent='user_1')
    hit = client.geocode(str_location + " Singapore", timeout=10)

    return (hit.latitude, hit.longitude) if hit else (0, 0)


In [None]:
# OneMap Geocoding service provided by Singapore Government

def get_pos_onemap(str_location):
    """Look up coordinates for *str_location* via the OneMap search API.

    Every occurrence of "st." in the address is replaced with "saint", the
    result is URL-encoded, and the first search result's ``LATITUDE`` and
    ``LONGITUDE`` fields are returned exactly as they appear in the JSON
    response (no numeric conversion is done here).

    Any failure (network error, timeout, non-200 status, non-JSON body, or
    an empty/unexpected result structure) prints a message and yields the
    ``(0, 0)`` sentinel instead of raising, so a long batch run is not
    aborted by a single bad address.

    Returns:
        ``(lat, lng)`` from the first result, or ``(0, 0)`` on failure.
    """
    # Function-scope import so the block does not depend on a new
    # file-level import.
    from urllib.parse import quote_plus

    lat = 0
    lng = 0

    # quote_plus turns spaces into '+' (as before) and additionally escapes
    # characters such as '&' or '#' that would otherwise corrupt the query.
    addr = quote_plus(str_location.replace("st.", "saint"))

    # NOTE(review): confirm this endpoint is still the current OneMap search URL.
    search = 'https://developers.onemap.sg/commonapi/search?searchVal='+addr+'&returnGeom=Y&getAddrDetails=Y&pageNum=1'

    print(search)

    try:
        # A timeout prevents one stalled connection from hanging the whole run.
        resp = requests.get(search, timeout=10)
        if resp.status_code == 200:
            first = resp.json()['results'][0]
            lat, lng = first['LATITUDE'], first['LONGITUDE']
        else:
            print('error getting geocode for '+ str_location)
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # RequestException: network/timeout; ValueError: body is not JSON;
        # KeyError/IndexError/TypeError: no results or unexpected structure.
        print('error getting geocode for '+ str_location)

    return lat, lng

In [None]:
# function to iterate through the hdb flat dataframe and fetch the geocodes for the flats

def update_df_hdb(df, ym, addr_col=9, lat_col=61, lng_col=62, pause=1):
    """Geocode every row of *df* in place and save the result as CSV.

    For each row, the value at column position *addr_col* is passed to
    ``get_pos_onemap`` and the returned pair is written to column positions
    *lat_col* and *lng_col*. After all rows are processed the dataframe is
    written, without its index, to
    ``'../datasets/hdb/hdb_' + ym + '_geocoded.csv'``.

    Args:
        df: dataframe to update; it is modified in place.
        ym: label inserted into the output file name and progress messages.
        addr_col: integer position of the column holding the address.
        lat_col: integer position of the column receiving the latitude.
        lng_col: integer position of the column receiving the longitude.
        pause: seconds to sleep after each lookup.

    Returns:
        None.
    """
    print('fetching lat/long and updating the dataframe .....')

    for i in range(len(df)):
        address = df.iloc[i, addr_col]
        print(df.index[i], address)
        df.iloc[i, lat_col], df.iloc[i, lng_col] = get_pos_onemap(address)

        # Pause before accessing the server again to avoid overloading it
        # and being blocked by the geocoding service provider.
        time.sleep(pause)

    path = '../datasets/hdb/hdb_' + ym + '_' + 'geocoded.csv'
    df.to_csv(path, index=False)

    print('completed for ' + ym)

#### Load Training and Validation data and break into small chunks

Computing the latitude and longitude takes many hours. If I do it for all records at once, the computer will hang, so I will break them up by year.

In order to avoid SettingWithCopyWarning, I will be using copy(deep=True)

In [None]:
# Load the HDB resale flat transactions that have already been cleaned
# (the file name indicates the 2015-2020 range)

hdb_df = pd.read_csv('../datasets/hdb/hdb_resale_2015_2020_cleaned.csv')

In [None]:
# Add placeholder latitude and longitude columns, filled with 0
for _coord in ('lat', 'lng'):
    hdb_df[_coord] = 0

In [None]:
# Parse sale_date into datetimes so that the .dt accessor can be used
# when filtering by year
hdb_df['sale_date'] = pd.to_datetime(hdb_df['sale_date'])
# Show the column dtypes as the cell output
hdb_df.dtypes

### Fetch geocodes for HDB Flats 

It takes about 4 hours for each chunk of around 5000 records

In [None]:
# Split each year's transactions into 4 manageable parts


def _split_year(year, chunk_size=5000):
    """Return ``[whole_year, part_1, part_2, part_3, part_4]`` for *year*.

    ``whole_year`` is a deep copy of the rows of ``hdb_df`` whose sale_date
    falls in *year*. Parts 1-3 are consecutive positional slices of
    *chunk_size* rows; part 4 holds every remaining row from position
    ``3 * chunk_size`` onward. Each part is its own deep copy.
    """
    whole = hdb_df[hdb_df.sale_date.dt.year == year].copy(deep=True)
    bounds = [
        (0, chunk_size),
        (chunk_size, 2 * chunk_size),
        (2 * chunk_size, 3 * chunk_size),
        (3 * chunk_size, None),  # open-ended: the rest of the year
    ]
    parts = [whole.iloc[lo:hi, :].copy(deep=True) for lo, hi in bounds]
    return [whole] + parts


# 2015
hdb_2015, hdb_2015_1, hdb_2015_2, hdb_2015_3, hdb_2015_4 = _split_year(2015)

# 2016
hdb_2016, hdb_2016_1, hdb_2016_2, hdb_2016_3, hdb_2016_4 = _split_year(2016)

# 2017
hdb_2017, hdb_2017_1, hdb_2017_2, hdb_2017_3, hdb_2017_4 = _split_year(2017)

# 2018
hdb_2018, hdb_2018_1, hdb_2018_2, hdb_2018_3, hdb_2018_4 = _split_year(2018)

# 2019
hdb_2019, hdb_2019_1, hdb_2019_2, hdb_2019_3, hdb_2019_4 = _split_year(2019)

# 2020 is kept as a single dataframe
hdb_2020 = hdb_df[hdb_df.sale_date.dt.year == 2020].copy(deep=True)

In [None]:
# This CSV file is used for testing purposes only
hdb_test = pd.read_csv('../datasets/hdb/hdb_test.csv')
# Placeholder coordinate columns, filled with 0.
# NOTE(review): update_df_hdb writes to column positions 61 and 62 -
# presumably these two columns; verify against the test file's column count.
hdb_test['lat']=0
hdb_test['lng']=0

# NOTE(review): this call was described as "blocked to prevent it from
# running", but it is NOT commented out - running this cell geocodes every
# row of hdb_test and writes '../datasets/hdb/hdb_test_geocoded.csv'.
# Comment the call out if that is not intended.
update_df_hdb(hdb_test,'test')

### Combine the files for each year into a single file

The code below is simplified to show each year's records as 4 files (based on how I have broken them down earlier). In reality each year has several files, because the Nominatim geocoding service is not reliable and sometimes fails after just a few hundred fetches. 

Unfortunately, the Google and Bing services are chargeable after a certain number of free requests, so they are seldom used.

In [None]:
# Combine the geocoded part files of a year into one dataframe

_paths_2015 = (
    "../datasets/hdb/hdb_resale_2015_1_geocoded.csv",
    "../datasets/hdb/hdb_resale_2015_2_geocoded.csv",
    "../datasets/hdb/hdb_resale_2015_3_geocoded.csv",
    "../datasets/hdb/hdb_resale_2015_4_geocoded.csv",
)
# read the four parts in order, keeping the per-part names available
hdb_2015_1, hdb_2015_2, hdb_2015_3, hdb_2015_4 = map(pd.read_csv, _paths_2015)
# stack the parts and renumber the rows 0..n-1
hdb_2015_geocoded = pd.concat(
    [hdb_2015_1, hdb_2015_2, hdb_2015_3, hdb_2015_4],
    ignore_index=True,
)

In [None]:
# Show (rows, columns) of the combined 2015 dataframe as the cell output
hdb_2015_geocoded.shape

In [None]:
#hdb_2015_geocoded.to_csv('../datasets/hdb/hdb_resale_2015_geocoded.csv',index=False)

### Fetch geocodes for Points of Interest

Only 2 sets of POI need to be geocoded. The rest came with geocodes in their source files

In [68]:
# Load school dataset and update with geocodes
# Column positions used below: 0 = school name, 1 = latitude,
# 2 = longitude, 3 = address.

school_df = pd.read_csv('../datasets/poi/school_cleaned.csv')
print('fetching lat/long and updating the dataframe .....')

for i in range(len(school_df)):
    print(school_df.iloc[i,0] + ' at ' +school_df.iloc[i,3])
    school_df.iloc[i,1], school_df.iloc[i,2] = get_pos_nomination(school_df.iloc[i,3])

    # Pause between requests, as the HDB and mall loops do, so the free
    # Nominatim service is not hit faster than one request per second.
    time.sleep(1)

# school_df.to_csv('../datasets/poi/school_geocoded.csv', index = False)

fetching lat/long and updating the dataframe .....
admiralty primary school at 11   woodlands circle
admiralty secondary school at 31   woodlands crescent
ahmad ibrahim primary school at 10   yishun street 11
ahmad ibrahim secondary school at 751  yishun avenue 7
ai tong school at 100  bright hill drive
alexandra primary school at 2a   prince charles crescent
anchor green primary school at 31   anchorvale drive
anderson primary school at 19   ang mo kio ave 9
anderson secondary school at 10   ang mo kio street 53
anderson serangoon junior college at 4500 ang mo kio avenue 6
ang mo kio primary school at 20   ang mo kio avenue 3
ang mo kio secondary school at 6    ang mo kio street 22
anglican high school at 600  upper changi road
anglo-chinese junior college at 25   dover close east
anglo-chinese school (barker road) at 60   barker road
anglo-chinese school (independent) at 121  dover road
anglo-chinese school (junior) at 16   winstedt road
anglo-chinese school (primary) at 50   barker 

lianhua primary school at 2    bukit batok street 52
loyang view secondary school at 12   pasir ris street 11
maha bodhi school at 10   ubi avenue 1
manjusri secondary school at 20   ubi avenue 1
maris stella high school at 25   mount vernon road
marsiling primary school at 31   woodlands centre road
marsiling secondary school at 12   marsiling road
marymount convent school at 20   marymount road
mayflower primary school at 6    ang mo kio ave 2
mayflower secondary school at 2    ang mo kio street 21
mee toh school at 21   edgedale plains
meridian primary school at 20   pasir ris st 71
meridian secondary school at 31   pasir ris street 51
methodist girls' school (primary) at 11   blackmore drive
methodist girls' school (secondary) at 11   blackmore drive
millennia institute at 60   bukit batok west avenue 8
montfort junior school at 52   hougang avenue 8
montfort secondary school at 50   hougang avenue 8
nan chiau high school at 20   anchorvale link
nan chiau primary school at 50   anc

woodlands ring primary school at 11   woodlands ring road
woodlands ring secondary school at 10   woodlands ring road
woodlands secondary school at 11   marsiling road
xinghua primary school at 45   hougang avenue 1
xingnan primary school at 5    jurong west street 91
xinmin primary school at 9    hougang avenue 8
xinmin secondary school at 11   hougang avenue 8
xishan primary school at 8    yishun street 21
yangzheng primary school at 15   serangoon avenue 3
yew tee primary school at 10   choa chu kang st 64
yio chu kang primary school at 1    hougang street 51
yio chu kang secondary school at 3063 ang mo kio avenue 5
yishun innova junior college at 3    yishun ring road
yishun primary school at 500  yishun ring road
yishun secondary school at 4    yishun street 71
yishun town secondary school at 6    yishun street 21
yu neng primary school at 56   bedok north st 3
yuan ching secondary school at 103  yuan ching road
yuhua primary school at 158  jurong east street 24
yuhua secondary sc

In [71]:
# Load the shopping malls dataset and geocode each mall by its name

df_malls = pd.read_csv('../datasets/poi/mall_cleaned.csv')

# Column 0 is the lookup text; columns 1 and 2 receive latitude and longitude
for row_pos in range(df_malls.shape[0]):
    df_malls.iloc[row_pos, 1], df_malls.iloc[row_pos, 2] = get_pos_nomination(
        df_malls.iloc[row_pos, 0]
    )
    # wait 2 seconds before the next request
    time.sleep(2)

In [None]:
# df_malls.to_csv("../datasets/poi/mall_geocoded.csv")