# Feature Engineering Part 1

- Create features for latitude and longitude, based on addresses of flats and points of interest

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sps
import seaborn as sns
import time
import datetime
import matplotlib.pyplot as plt
import geopy.geocoders
from geopy.geocoders import Nominatim, GoogleV3, Bing
from geopy.exc import GeocoderTimedOut
from geopy.exc import GeocoderServiceError
from geopy.exc import GeopyError

### Geocoding Services

- I explored 3 geocoding services - Google, Bing and Nominatim
- The first 2 are chargeable services that come with a certain number of free requests
- Nominatim is totally free but does not work all the time.
- For the chargeable services I have removed my API key, so the code will NOT work without the key

In [13]:
# Using Google Geocoding API to get geocodes
def get_pos_google(location_name):
    """Geocode a Singapore place name or address with the Google Geocoding API.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable so
    that no credential is stored in the notebook.

    Parameters
    ----------
    location_name : str
        Address or place name; ' Singapore' is appended before the lookup.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match, or ``(0, 0)`` when the
        geocoder returns no result.

    Raises
    ------
    RuntimeError
        If ``GOOGLE_API_KEY`` is not set or is empty.
    """
    import os  # local import: keeps this cell runnable without editing the import cell

    api_key = os.environ.get('GOOGLE_API_KEY')
    if not api_key:
        raise RuntimeError(
            'GOOGLE_API_KEY environment variable is not set; '
            'it is required to use the Google geocoding service'
        )

    location_name = location_name + ' Singapore'
    geolocator = GoogleV3(api_key=api_key)
    loc = geolocator.geocode(location_name, timeout=10)
    if not loc:
        # (0, 0) is the "not found" marker used throughout the notebook
        return 0, 0

    return (loc.latitude, loc.longitude)


# Using Bing Geocoding API to get geocodes
def get_pos_bing(location_name):
    """Geocode a Singapore place name or address with the Bing Maps API.

    The API key is read from the ``BING_API_KEY`` environment variable so
    that no credential is stored in the notebook.

    Parameters
    ----------
    location_name : str
        Address or place name; ' Singapore' is appended before the lookup.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match, or ``(0, 0)`` when the
        geocoder returns no result.

    Raises
    ------
    RuntimeError
        If ``BING_API_KEY`` is not set or is empty.
    """
    import os  # local import: keeps this cell runnable without editing the import cell

    api_key = os.environ.get('BING_API_KEY')
    if not api_key:
        raise RuntimeError(
            'BING_API_KEY environment variable is not set; '
            'it is required to use the Bing geocoding service'
        )

    # Country suffix was previously misspelt as ' Singapre', which degraded the query.
    location_name = location_name + ' Singapore'
    geolocator = Bing(api_key)
    loc = geolocator.geocode(location_name, timeout=10)

    if not loc:
        # (0, 0) is the "not found" marker used throughout the notebook
        return 0, 0

    return (loc.latitude, loc.longitude)


# Using Nominatim to get geocodes is FREE
def get_pos_nomination(str_location):
    """Geocode a Singapore place name or address with Nominatim.

    Parameters
    ----------
    str_location : str
        Address or place name; " Singapore" is appended before the lookup.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match, or ``(0, 0)`` when the
        geocoder returns no result.
    """
    query = str_location + " Singapore"
    match = Nominatim(user_agent='user_1').geocode(query, timeout=10)

    # (0, 0) marks "not found" so callers can filter on lat == 0
    return (match.latitude, match.longitude) if match else (0, 0)


In [14]:
# function to iterate through the hdb flat dataframe and fetch their geocodes

def update_df_hdb(df,ym):
    """Geocode every row of ``df`` in place and save the result as CSV.

    Each row's 'address' value is sent to ``get_pos_google`` and the returned
    pair is written to that row's 'lat' and 'lng' columns. The frame is then
    written to ``../datasets/hdb/hdb_resale_<ym>_geocoded.csv`` without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'address', 'lat' and 'lng' columns; modified in place.
    ym : str
        Label inserted into the output file name.

    Raises
    ------
    KeyError
        If 'address', 'lat' or 'lng' is missing from ``df``.
    """
    print('fetching lat/long and updating the dataframe .....')

    # Resolve the columns by name once instead of relying on fixed positions
    # (previously 9, 61 and 62), which break silently when the schema changes.
    address_pos = df.columns.get_loc('address')
    lat_pos = df.columns.get_loc('lat')
    lng_pos = df.columns.get_loc('lng')

    for i in range(len(df)):
        address = df.iloc[i, address_pos]
        print(df.index[i], address)
        df.iloc[i, lat_pos], df.iloc[i, lng_pos] = get_pos_google(address)

        # 2 seconds break before accessing the server again to prevent overloading the server.
        # This will avoid being blocked by the geocoding service provider.
        time.sleep(2)

    path = '../datasets/hdb/hdb_resale_'+ym+'_'+'geocoded.csv'
    df.to_csv(path, index = False)

    print('completed for '+ym)

In [5]:
# Scratch frame used for the trial run of update_df_hdb in the cells that follow.
df = pd.read_csv("../datasets/hdb/any.csv")

In [12]:
# Peek at the coordinate columns (0 presumably marks rows not geocoded yet -- confirm).
df[['lat','lng']]

Unnamed: 0,lat,lng
0,1.311435,103.87624
1,0.000000,0.00000
2,0.000000,0.00000
3,0.000000,0.00000
4,0.000000,0.00000
...,...,...
98,0.000000,0.00000
99,0.000000,0.00000
100,0.000000,0.00000
101,0.000000,0.00000


In [15]:
# Trial run: geocodes the scratch frame and saves ../datasets/hdb/hdb_resale_1234_geocoded.csv
update_df_hdb(df,'1234')

fetching lat/long and updating the dataframe .....
0 120a kim tian place
1 687b woodlands drive 75
2 688b woodlands drive 75
3 690c woodlands drive 75
4 687c woodlands drive 75
5 688a woodlands drive 75
6 688b woodlands drive 75
7 688c woodlands drive 75
8 687b woodlands drive 75
9 687a woodlands drive 75
10 161 yung ping road
11 690e woodlands drive 75
12 688d woodlands drive 75
13 120 kim tian place
14 121 kim tian place
15 160 yung ping road
16 688d woodlands drive 75
17 688b woodlands drive 75
18 688a woodlands drive 75
19 689b woodlands drive 75
20 688b woodlands drive 75
21 690a woodlands drive 75
22 690c woodlands drive 75
23 687a woodlands drive 75
24 689e woodlands drive 75
25 688a woodlands drive 75
26 690a woodlands drive 75
27 121 kim tian place
28 121a kim tian place
29 161 yung ping road
30 159 yung ping road
31 690c woodlands drive 75
32 690c woodlands drive 75
33 689b woodlands drive 75
34 689f woodlands drive 75
35 690a woodlands drive 75
36 690a woodlands drive 75
37 

#### Load Training and Validation data and break into small chunks

Computing the latitude and longitude takes many hours. If I do it for all records at once the computer hangs, so I will break them up by year.

In order to avoid SettingWithCopyWarning, I will be using copy(deep=True)

In [11]:
# Load the cleaned 2015-2018 resale records that will be geocoded.
hdb_df = pd.read_csv("../datasets/hdb/hdb_resale_2015_2018_cleaned.csv")

In [25]:
# latitude and longitude columns to be populated with geocodes
# Initialise as floats (0.0 = "not geocoded yet"): coordinates are fractional, and
# writing floats into an int64 column relies on silent upcasting, which newer
# pandas versions deprecate.
hdb_df['lat']=0.0
hdb_df['lng']=0.0

In [26]:
# List the columns to check where 'address', 'lat' and 'lng' sit in the frame.
hdb_df.columns

Index(['sale_date', 'town', 'flat_type', 'storey_range', 'floor_area_sqm',
       'flat_model', 'lease_commence_date', 'remaining_lease', 'resale_price',
       'address', 'floor_range', 'bedok', 'bishan', 'bukit_batok',
       'bukit_merah', 'bukit_panjang', 'bukit_timah', 'central_area',
       'choa_chu_kang', 'clementi', 'geylang', 'hougang', 'jurong_east',
       'jurong_west', 'kallang_whampoa', 'marine_parade', 'pasir_ris',
       'punggol', 'queenstown', 'sembawang', 'sengkang', 'serangoon',
       'tampines', 'toa_payoh', 'woodlands', 'yishun', 'model_adjoined_flat',
       'model_apartment', 'model_dbss', 'model_improved',
       'model_improved_maisonette', 'model_maisonette', 'model_model_a',
       'model_model_a2', 'model_model_a_maisonette', 'model_multi_generation',
       'model_new_generation', 'model_premium_apartment',
       'model_premium_apartment_loft', 'model_premium_maisonette',
       'model_simplified', 'model_standard', 'model_terrace', 'model_type_s1',
   

In [13]:
# Parse sale_date as datetime so the .dt.year filter used for the per-year split works.
hdb_df['sale_date'] = pd.to_datetime(hdb_df['sale_date'])
hdb_df.dtypes

sale_date                datetime64[ns]
town                             object
flat_type                        object
storey_range                     object
floor_area_sqm                  float64
                              ...      
type_3_room                       int64
type_4_room                       int64
type_5_room                       int64
type_executive                    int64
type_multi_generation             int64
Length: 61, dtype: object

In [27]:
# get each year's data and break it up into 4 manageable parts
def _split_year(df, year, bounds=(0, 5000, 10000, 15000, None)):
    """Return one year's rows and its consecutive positional chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime 'sale_date' column.
    year : int
        Calendar year to select.
    bounds : tuple
        Positional cut points; consecutive pairs delimit the chunks and a
        trailing ``None`` makes the last chunk run to the end of the year.

    Returns
    -------
    tuple
        ``(year_df, parts)`` where ``year_df`` is a deep copy of the year's rows
        and ``parts`` is a list of deep copies, one per chunk.
    """
    year_df = df[df.sale_date.dt.year == year].copy(deep=True)
    # deep copies keep each chunk independent and avoid SettingWithCopyWarning
    parts = [
        year_df.iloc[start:stop, :].copy(deep=True)
        for start, stop in zip(bounds[:-1], bounds[1:])
    ]
    return year_df, parts


# Same variable names as before, so later cells keep working.
hdb_2015, (hdb_2015_1, hdb_2015_2, hdb_2015_3, hdb_2015_4) = _split_year(hdb_df, 2015)
hdb_2016, (hdb_2016_1, hdb_2016_2, hdb_2016_3, hdb_2016_4) = _split_year(hdb_df, 2016)
hdb_2017, (hdb_2017_1, hdb_2017_2, hdb_2017_3, hdb_2017_4) = _split_year(hdb_df, 2017)
hdb_2018, (hdb_2018_1, hdb_2018_2, hdb_2018_3, hdb_2018_4) = _split_year(hdb_df, 2018)

### Fetch geocodes for HDB Flats 

It takes about 4 hours for each chunk of around 5000 records.

In [None]:
# Run the function to populate one chunk at a time; each call writes
# ../datasets/hdb/hdb_resale_<ym>_geocoded.csv.
# NOTE(review): with ym='hdb_2015_1' the file would be named
# hdb_resale_hdb_2015_1_geocoded.csv, while the combine step reads
# hdb_resale_2015_1_geocoded.csv -- ym='2015_1' is presumably intended; confirm.

#update_df_hdb(hdb_2015_1,'hdb_2015_1')

In [35]:
# for verifying that the code is working (if necessary)
# update_df_hdb writes the result to ../datasets/hdb/hdb_resale_test_geocoded.csv

test = hdb_2018.iloc[:5,:].copy(deep=True)
update_df_hdb(test,'test')

fetching lat/long and updating the dataframe .....
57460 314 ang mo kio avenue 3
57461 109 ang mo kio avenue 4
57462 150 ang mo kio avenue 5
57463 559 ang mo kio avenue 10
57464 461 ang mo kio avenue 10
completed for year test


### Combine the files for 2015 to 2018 into a single file

The code below is simplified to show each year's records as 4 files (based on how I broke them down earlier). In reality each year has several more files, because the Nominatim geocoding service is not reliable and sometimes failed after just a few hundred fetches.

Unfortunately the Google and Bing services are chargeable after certain number of free requests, so they are seldom used.

In [17]:
# Load the combined 2015-2018 file.
combined = pd.read_csv("../datasets/combined/combined_2015_2018.csv")

In [20]:
# Parse sale_date as datetime so rows can be filtered by year with .dt.year.
combined['sale_date'] = pd.to_datetime(combined['sale_date'])

In [26]:
# Export only the 2018 sales to their own CSV (index not written).
combined[combined['sale_date'].dt.year==2018].to_csv("../datasets/combined/data_2018.csv",index=False)

In [59]:
# Combine them after geocoding: reload the four geocoded 2015 chunk files

hdb_2015_1, hdb_2015_2, hdb_2015_3, hdb_2015_4 = [
    pd.read_csv("../datasets/hdb/hdb_resale_2015_" + str(part) + "_geocoded.csv")
    for part in (1, 2, 3, 4)
]

In [60]:
# Stack the four chunks in order; ignore_index=True gives the result a fresh 0..n-1 index.
hdb_2015_geocoded = pd.concat([hdb_2015_1,hdb_2015_2,hdb_2015_3,hdb_2015_4],ignore_index=True)

In [61]:
# Sanity check: (rows, columns) of the combined 2015 frame.
hdb_2015_geocoded.shape

(17766, 63)

In [62]:
# Save the combined 2015 frame as a single geocoded file (index not written).
hdb_2015_geocoded.to_csv('../datasets/hdb/hdb_resale_2015_geocoded.csv',index=False)

### Fetch geocodes for Points of Interest

In [38]:
# Load school dataset and update with geocodes

school_df = pd.read_csv('../datasets/poi/school_cleaned.csv')
print('fetching lat/long and updating the dataframe .....')

for i in range(len(school_df)):
    # column 0 is printed as the label; column 3 is the text sent to the geocoder
    print(school_df.iloc[i,0] + ' at ' +school_df.iloc[i,3])
    # the returned (lat, lng) pair goes into columns 1 and 2
    school_df.iloc[i,1], school_df.iloc[i,2] = get_pos_nomination(school_df.iloc[i,3])

    # Pause between requests, as the other geocoding loops do, so the free
    # Nominatim service is not overloaded and does not block us.
    time.sleep(2)

school_df.to_csv('../datasets/poi/school_geocoded.csv', index = False)

print('completed')

fetching lat/long and updating the dataframe .....
completed


In [39]:
# Load shopping malls dataset and update with geocodes based on the malls' names
# (name-only lookups are tough for Nominatim)

df_malls = pd.read_csv('../datasets/poi/mall_cleaned.csv')

# column 0 holds the mall name; columns 1 and 2 receive latitude and longitude
for row_pos, mall_name in enumerate(df_malls.iloc[:, 0].tolist()):
    df_malls.iloc[row_pos, 1], df_malls.iloc[row_pos, 2] = get_pos_nomination(mall_name)
    # 2 second pause between requests to the geocoding service
    time.sleep(2)

In [40]:
# Malls still at lat == 0, i.e. the lookup above returned no result for them.
df_malls[df_malls.lat==0]

Unnamed: 0,mall,lat,lng
6,City Gate Mall,0.0,0.0
23,HillV2,0.0,0.0
57,Fitzpatrick's,0.0,0.0
58,Northshore Plaza,0.0,0.0
74,Forum The Shopping Mall,0.0,0.0
82,JCubeMall,0.0,0.0
86,Jurong Point Shopping Mall,0.0,0.0
87,Kallang Leisure Park,0.0,0.0
96,NeWEST Shopping Mall,0.0,0.0
103,Paragon Shopping Centre,0.0,0.0


In [None]:
# Nominatim could not get geocodes by mall's name very well, so I will use the Google service for those missed
# Note that get_pos_google needs a Google API key for it to work

# .index yields row LABELS, so the rows are addressed with .loc and column names;
# positional .iloc only matched by accident while the index is the default 0..n-1.
for i in df_malls[df_malls['lat']==0].index:

    lat, lng = get_pos_google(df_malls.loc[i,'mall'])
    df_malls.loc[i,'lat']=lat
    df_malls.loc[i,'lng']=lng

In [45]:
# Any rows listed here are malls that still lack a latitude/longitude
df_malls.loc[df_malls['lat'] == 0]

Unnamed: 0,mall,lat,lng


In [None]:
# Save the geocoded malls.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is not
# passed here, so the row index is written as an extra unnamed first column --
# confirm whether downstream readers rely on that before changing it.
df_malls.to_csv("../datasets/poi/mall_geocoded.csv")