# Singapore Public Housing (HDB) Resale Price Prediction Model (Part 2)
### Data Collection - Amenities and Infrastructure Data

## 1. Initialization

In [1]:
# Import Vanilla Libraries
import json
import random
import re
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests

# Import Library for Web Scraping
from bs4 import BeautifulSoup

In [2]:
# MAIN COORDINATES RETRIEVAL FUNCTION

def coordinate_retrieval(df_name, target_col, mrt=False):
    """Look up latitude/longitude for every value of a DataFrame column via OneMap.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Frame holding the search terms.
    target_col : str
        Column of ``df_name`` whose values are sent to the OneMap search API.
    mrt : bool, default False
        When True, ' mrt' is appended to every search term.

    Returns
    -------
    tuple of (list, list)
        ``(latitude, longitude)``, each holding exactly one element per value
        of ``df_name[target_col]`` in iteration order. An element is the
        'LATITUDE' / 'LONGITUDE' value of the first search result, or
        ``np.nan`` when the entry is not a string, the request fails, or the
        response carries no usable result.
    """
    search_url = "https://developers.onemap.sg/commonapi/search"
    latitude = []
    longitude = []
    total = len(df_name)

    for i, entry in enumerate(df_name[target_col]):
        print('\rWaiting... ({})... {} entries remaining... '.format(entry, total-i-1), end='.')

        # Start from "missing" so both lists always grow by exactly one element
        lat, lon = np.nan, np.nan

        # Non-string entries (e.g. NaN) cannot be searched and stay missing
        if isinstance(entry, str):
            query_params = {
                'searchVal': entry + ' mrt' if mrt else entry,
                'returnGeom': 'Y',
                'getAddrDetails': 'N',
            }
            try:
                # `params=` URL-encodes the search term (spaces, '&', apostrophes);
                # the timeout keeps a stalled connection from hanging the loop
                response = requests.get(search_url, params=query_params, timeout=10)
                response.raise_for_status()
                first_hit = response.json()['results'][0]
                # Read both values before storing either so a partial result
                # cannot leave the two lists with different lengths
                lat, lon = first_hit['LATITUDE'], first_hit['LONGITUDE']
            except (requests.RequestException, ValueError, LookupError, TypeError):
                # Network/HTTP failure, invalid JSON, or no usable result.
                # Deliberately not a bare `except`, so Ctrl-C still interrupts.
                lat, lon = np.nan, np.nan

        latitude.append(lat)
        longitude.append(lon)

        # Pause 0.25s or 0.5s between requests
        time.sleep(random.randint(1,2)/4)

    return latitude, longitude

## 2. Data Scraping

### 2.1 Public Transport Data

#### 2.1.1 Mass Rapid Transit (MRT)

In [3]:
# Scraping table from wikipedia
url = "https://en.wikipedia.org/wiki/List_of_Singapore_MRT_stations"
res = requests.get(url)
# Fail fast on an HTTP error instead of parsing an error page
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
# NOTE(review): the station list is taken by position (second <table> on the
# page); this silently picks the wrong table if the page layout changes
table = soup.find_all('table')[1]

# Convert table into dataframe
# Literal HTML strings are deprecated as read_html input (pandas >= 2.1),
# so the markup is wrapped in a file-like object
dfs = pd.read_html(StringIO(str(table)))
mrt = dfs[0]
mrt

Unnamed: 0_level_0,Alpha-numeric code(s),Alpha-numeric code(s),Station name,Station name,Station name,Opening,Name(s) during planning stages,Abbreviation,Location(s),Connection(s) to other transport
Unnamed: 0_level_1,In operation,Future,English • Malay,Chinese,Tamil,Opening,Name(s) during planning stages,Abbreviation,Location(s),Connection(s) to other transport
0,North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL)
1,NS1 EW24,JE5,Jurong East,裕廊东,ஜூரோங் கிழக்கு,10 March 1990,Jurong East,JUR,Jurong East,Jurong East Temporary Bus Interchange
2,NS2,,Bukit Batok,武吉巴督,புக்கிட் பாத்தோக்,10 March 1990,Bukit Batok South,BBT,Bukit Batok,Bukit Batok Bus Interchange
3,NS3,,Bukit Gombak,武吉甘柏,புக்கிட் கோம்பாக்,10 March 1990,Bukit Batok North,BGB,Bukit Batok,
4,,NS3A,Brickland,红砖,பிரிக்லேன்ட்,Mid-2030s,Brickland,TBA,Exact location not yet known,
...,...,...,...,...,...,...,...,...,...,...
249,,CR13 TE7,Bright Hill,光明山,பிரைட் ஹில்,2029,Bright Hill,BRH,Bishan,
250,Punggol Extension (CRLe),Punggol Extension (CRLe),Punggol Extension (CRLe),Punggol Extension (CRLe),Punggol Extension (CRLe),Punggol Extension (CRLe),Punggol Extension (CRLe),Punggol Extension (CRLe),Punggol Extension (CRLe),Punggol Extension (CRLe)
251,,CP2,Elias,伊莱雅,இலியாஸ்,2031,Elias,TBA,Pasir Ris,
252,PE4,CP3,Riviera,里维拉,ரிவியாரா,2031,Riviera,TBA,Punggol,


In [4]:
# Flatten the two-level header by keeping only its second level
mrt.columns = mrt.columns.get_level_values(1)
mrt.head()

Unnamed: 0,In operation,Future,English • Malay,Chinese,Tamil,Opening,Name(s) during planning stages,Abbreviation,Location(s),Connection(s) to other transport
0,North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL),North South Line (NSL)
1,NS1 EW24,JE5,Jurong East,裕廊东,ஜூரோங் கிழக்கு,10 March 1990,Jurong East,JUR,Jurong East,Jurong East Temporary Bus Interchange
2,NS2,,Bukit Batok,武吉巴督,புக்கிட் பாத்தோக்,10 March 1990,Bukit Batok South,BBT,Bukit Batok,Bukit Batok Bus Interchange
3,NS3,,Bukit Gombak,武吉甘柏,புக்கிட் கோம்பாக்,10 March 1990,Bukit Batok North,BGB,Bukit Batok,
4,,NS3A,Brickland,红砖,பிரிக்லேன்ட்,Mid-2030s,Brickland,TBA,Exact location not yet known,


In [5]:
# Line-name separator rows repeat the same text in every column, so rows
# whose Chinese and Tamil values are identical are discarded
is_station_row = mrt['Chinese'].ne(mrt['Tamil'])
mrt = mrt.loc[is_station_row]
mrt.head()

Unnamed: 0,In operation,Future,English • Malay,Chinese,Tamil,Opening,Name(s) during planning stages,Abbreviation,Location(s),Connection(s) to other transport
1,NS1 EW24,JE5,Jurong East,裕廊东,ஜூரோங் கிழக்கு,10 March 1990,Jurong East,JUR,Jurong East,Jurong East Temporary Bus Interchange
2,NS2,,Bukit Batok,武吉巴督,புக்கிட் பாத்தோக்,10 March 1990,Bukit Batok South,BBT,Bukit Batok,Bukit Batok Bus Interchange
3,NS3,,Bukit Gombak,武吉甘柏,புக்கிட் கோம்பாக்,10 March 1990,Bukit Batok North,BGB,Bukit Batok,
4,,NS3A,Brickland,红砖,பிரிக்லேன்ட்,Mid-2030s,Brickland,TBA,Exact location not yet known,
5,NS4 BP1,JS1,Choa Chu Kang,蔡厝港,சுவா சூ காங்,10 March 1990,Choa Chu Kang,CCK,Choa Chu Kang,Choa Chu Kang Bus Interchange


In [6]:
# Remove the columns that are not used afterwards
unused_columns = ['Chinese', 'Tamil', 'Opening', 'Name(s) during planning stages',
                  'Abbreviation', 'Location(s)']
mrt = mrt.drop(columns=unused_columns)
mrt.head()

Unnamed: 0,In operation,Future,English • Malay,Connection(s) to other transport
1,NS1 EW24,JE5,Jurong East,Jurong East Temporary Bus Interchange
2,NS2,,Bukit Batok,Bukit Batok Bus Interchange
3,NS3,,Bukit Gombak,
4,,NS3A,Brickland,
5,NS4 BP1,JS1,Choa Chu Kang,Choa Chu Kang Bus Interchange


In [7]:
# Give the four remaining columns shorter, more readable names (by position)
mrt = mrt.set_axis(['Code', 'Future Code', 'Name', 'Interchange'], axis=1)
mrt.head()

Unnamed: 0,Code,Future Code,Name,Interchange
1,NS1 EW24,JE5,Jurong East,Jurong East Temporary Bus Interchange
2,NS2,,Bukit Batok,Bukit Batok Bus Interchange
3,NS3,,Bukit Gombak,
4,,NS3A,Brickland,
5,NS4 BP1,JS1,Choa Chu Kang,Choa Chu Kang Bus Interchange


In [8]:
# Keep only stations that have an in-operation code
# (rows with no 'Code' are stations still under construction)
mrt = mrt[mrt['Code'].notna()]

In [9]:
# Interchange stations are listed once per line; keep only the first
# occurrence of each station name
repeated_name = mrt.duplicated(subset='Name')
mrt = mrt[~repeated_name]

In [10]:
# Replace missing values in the Interchange column with a single space
# so the later substring test can run on every row
mrt = mrt.assign(Interchange=mrt['Interchange'].fillna(' '))

In [11]:
# Function to return boolean code on whether 
# the MRT station is also a bus interchange

def bus_interchange(element):
    """Return 1 if the text contains 'Bus Interchange', otherwise 0."""
    return 1 if 'Bus Interchange' in element else 0

# Flag each station according to its 'Interchange' text
mrt['Bus_Interchange'] = mrt['Interchange'].map(bus_interchange)

In [12]:
# Function to return boolean code on whether 
# the MRT station is a MRT interchange

def mrt_interchange(element):
    """Return 1 if the code string holds more than one whitespace-separated code, else 0."""
    station_codes = element.split()
    return int(len(station_codes) > 1)
    
# Flag each station according to how many codes it carries
mrt['MRT_Interchange'] = mrt['Code'].map(mrt_interchange)

In [13]:
# The source columns are no longer needed once the flags are derived
mrt = mrt.drop(columns=['Code', 'Future Code', 'Interchange'])

In [14]:
# Fixing "Botanic Gardens" name
# The result is assigned back to the column: calling replace(inplace=True) on
# the `mrt['Name']` selection is a chained in-place update, which pandas
# deprecates and which does not modify `mrt` under Copy-on-Write.
mrt['Name'] = mrt['Name'].replace({
    'Botanic Gardens • Kebun Bunga' : 'Botanic Gardens',
})

In [15]:
# Renumber the rows from 0 and discard the old index
mrt = mrt.reset_index(drop=True)

In [16]:
# Look up the coordinates of each MRT station for the distance calculation
mrt_latitude, mrt_longitude = coordinate_retrieval(mrt, "Name", mrt=True)
mrt['Latitude'] = mrt_latitude
mrt['Longitude'] = mrt_longitude

Waiting... (Riviera)... 0 entries remaining... .ing... . ..

In [17]:
# Show the rows whose lookup returned no coordinates
# (the only such row is an LRT station)
mrt.loc[mrt['Latitude'].isna()]

Unnamed: 0,Name,Bus_Interchange,MRT_Interchange,Latitude,Longitude
122,Riviera,0,0,,


In [18]:
# Discard every row that still contains a missing value
mrt = mrt.dropna()

In [19]:
# Spot-check five randomly chosen rows of the final MRT table
mrt.sample(5)

Unnamed: 0,Name,Bus_Interchange,MRT_Interchange,Latitude,Longitude
110,Bendemeer,0,0,1.313874329,103.8630628
61,Clarke Quay,0,0,1.286898788,103.8460645
63,Farrer Park,0,0,1.312464255,103.8529729
30,Tanah Merah,0,1,1.3268651000000002,103.9457914
95,Bayfront,0,1,1.282705224,103.8597592


In [20]:
# Export the MRT table to CSV, leaving the row index out of the file
mrt.to_csv('./Dataset/Engineered/MRT.csv', index=False)

#### 2.1.2 Bus Stops

https://github.com/mylee16/onemap-api

In [21]:
# Read the raw bus stop data from the local CSV file
# NOTE(review): the provenance of this file is not recorded here — confirm and document it
bus_stop = pd.read_csv('./Dataset/Raw/bus_stops.csv')

In [22]:
# Preview the first rows to see which columns were loaded
bus_stop.head()

Unnamed: 0.1,Unnamed: 0,BusStopCode,Description,Latitude,Longitude,RoadName
0,0,1012,Hotel Grand Pacific,1.296848,103.852536,Victoria St
1,1,1013,St. Joseph's Ch,1.29771,103.853225,Victoria St
2,2,1019,Bras Basah Cplx,1.29699,103.853022,Victoria St
3,3,1029,Opp Natl Lib,1.296673,103.854414,Nth Bridge Rd
4,4,1039,Bugis Cube,1.298208,103.855491,Nth Bridge Rd


In [23]:
# Remove the columns that are not needed
bus_stop = bus_stop.drop(columns=['Unnamed: 0', 'RoadName', 'BusStopCode'])

# Give the three remaining columns more readable names (by position)
bus_stop = bus_stop.set_axis(['name', 'latitude', 'longitude'], axis=1)

In [24]:
# Sanity check on dataframe: five random rows after the column clean-up
bus_stop.sample(5)

Unnamed: 0,name,latitude,longitude
466,Valley Pt,1.293283,103.826635
3483,Blk 465A,1.372397,103.897774
3320,Bef Lavender St,1.315304,103.861404
4941,Changi Baptist Ch,1.365904,103.97535
479,Opp Great World City,1.294092,103.830838


In [25]:
# Export the bus stop table to CSV, leaving the row index out of the file
bus_stop.to_csv('./Dataset/Engineered/Bus_Stop.csv', index=False)

### 2.2 Education

#### 2.2.1 Primary School

In [26]:
# Scraping data from website with primary school detail
url = "https://www.salary.sg/2019/best-primary-schools-2019/"
res = requests.get(url)
# Fail fast on an HTTP error instead of scraping an error page
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')
# The school list is read from the first ordered list on the page
lis = soup.find('ol')

# List with all schools
all_ps = [li.text for li in lis.find_all('li')]

# List with only school with affiliation (the entries wrapped in <strong>)
aff_ps = [strong.text for strong in lis.find_all('strong')]

# Transform scraped data into dataframe
ps_df = pd.DataFrame(all_ps, columns=['name'])
ps_df.head()

Unnamed: 0,name
0,CHIJ St. Nicholas Girls’ School (Primary Secti...
1,Catholic High School (Primary Section) (boys) ...
2,Nan Hua Primary School – 20
3,Nanyang Primary School – 20
4,Pei Hwa Presbyterian Primary School – 20


In [27]:
# Vacancy is the last whitespace-separated token of each list entry
ps_df['vacancy'] = ps_df['name'].str.split().map(lambda tokens: tokens[-1])

# Spot-check the extracted vacancy on five random rows
ps_df.sample(5)

Unnamed: 0,name,vacancy
121,Stamford Primary School – 62,62
38,Woodlands Primary School – 39,39
77,CHIJ Our Lady of Good Counsel (girls) – 50,50
89,Northland Primary School – 53,53
58,Kuo Chuan Presbyterian Primary School – 45,45


In [28]:
# Function to create new column 'affiliation'
def aff_school(row):
    """Return 1 if the entry appears in the scraped `aff_ps` list, otherwise 0."""
    return int(row in aff_ps)
    
# Flag affiliated schools while 'name' still holds the full scraped text
ps_df['affiliation'] = ps_df['name'].map(aff_school)

In [29]:
# Keep only the text before the ' –' separator, dropping the vacancy value
ps_df['name'] = ps_df['name'].str.extract(r'(.+) –.+', expand=False)

In [30]:
# Function to remove all secondary information in name
def extract_name(row):
    """Strip the last ' (...' suffix from a school name.

    The match is greedy, so only the final parenthesised part is removed per
    call; call again to remove an earlier one. Values that are not strings
    (e.g. NaN) and names without a ' (' are returned unchanged, so the function
    no longer raises on a '(' that is not preceded by a space.
    """
    if not isinstance(row, str):
        return row
    matches = re.findall(r'(.+) \(', row)
    return matches[0] if matches else row

# Two passes: the first removes the trailing parentheses (boy/girl school),
# the second removes the remaining parentheses (string of "Primary")
for _pass_number in range(2):
    ps_df['name'] = ps_df['name'].map(extract_name)

In [31]:
# Rectify name to OneMap standard address naming
# Both replacements are plain-text substitutions. regex=False makes that
# explicit, because the default of `regex` differs between pandas versions and
# an escaped pattern such as r'St\.' never matches when treated literally.
ps_df['name'] = (
    ps_df['name']
    .str.replace('’', "'", regex=False)
    .str.replace('St.', 'Saint', regex=False)
)

In [32]:
# Sanity check on the cleaned name and the affiliation flag: five random rows
ps_df.sample(5)

Unnamed: 0,name,vacancy,affiliation
18,Shuqun Primary School,29,0
85,Elias Park Primary School,52,0
150,Palm View Primary School,71,0
46,Angsana Primary School,43,0
72,Hong Wen School,48,0


In [33]:
# Look up the coordinates of each primary school for the distance calculation
ps_latitude, ps_longitude = coordinate_retrieval(ps_df, "name")
ps_df['latitude'] = ps_latitude
ps_df['longitude'] = ps_longitude

Waiting... (Valour Primary School)... 0 entries remaining... .. ... . .ining... .

In [34]:
# Show the rows whose lookup returned no coordinates (none in this run)
ps_df.loc[ps_df['latitude'].isna()]

Unnamed: 0,name,vacancy,affiliation,latitude,longitude


In [35]:
# Preview the first ten rows of the finished primary school table
ps_df.head(10)

Unnamed: 0,name,vacancy,affiliation,latitude,longitude
0,CHIJ Saint Nicholas Girls' School,20,1,1.3737296730000002,103.8343089
1,Catholic High School,20,1,1.3547888769999998,103.8449341
2,Nan Hua Primary School,20,0,1.319836638,103.761404
3,Nanyang Primary School,20,1,1.32111549,103.8064681
4,Pei Hwa Presbyterian Primary School,20,0,1.338055078,103.7761082
5,Red Swastika School,20,0,1.33349383,103.9343749
6,Rosyth School,20,0,1.372915796,103.8746932
7,Temasek Primary School,20,0,1.317716389,103.9456952
8,Methodist Girls' School,21,1,1.33286306,103.7833634
9,Rulang Primary School,22,0,1.346844456,103.7190098


In [36]:
# Export the primary school table to CSV, leaving the row index out of the file
ps_df.to_csv('./Dataset/Engineered/Primary_School.csv', index=False)

#### 2.2.2 Secondary School

In [37]:
# Scraping data from website with secondary school detail
url = "https://www.salary.sg/2018/secondary-school-ranking-based-on-cut-off-for-2018-intake/"
res = requests.get(url)
# Fail fast on an HTTP error instead of scraping an error page
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')

# The school list is read from the first ordered list on the page
lis = soup.find('ol')

# List with all schools
all_ss = [li.text for li in lis.find_all('li')]

# List with only school with affiliation (the entries wrapped in <strong>)
aff_ss = [strong.text for strong in lis.find_all('strong')]

# Transform scraped data into dataframe
ss_df = pd.DataFrame(all_ss, columns=['name'])
# Cut-off point is the last whitespace-separated token of each entry
ss_df['cutoff_point'] = [row[-1] for row in ss_df['name'].str.split()]
ss_df.sample(5)

Unnamed: 0,name,cutoff_point
67,Gan Eng Seng School (co-ed) – 223,223
153,Yusof Ishak Secondary School (co-ed) – 188,188
79,Pasir Ris Crest Secondary School (co-ed) – 215,215
151,Yio Chu Kang Secondary School (co-ed) – 188,188
19,Singapore Chinese Girls’ School O-levels (girl...,250


In [38]:
# Function to create new column 'affiliation'
def aff_school(row):
    """Return 1 if the entry appears in the scraped `aff_ss` list, otherwise 0."""
    return int(row in aff_ss)
    
# Flag affiliated schools while 'name' still holds the full scraped text
ss_df['affiliation'] = ss_df['name'].map(aff_school)

In [39]:
# Keep only the text before the ' –' separator, dropping the cut-off point
ss_df['name'] = ss_df['name'].str.extract(r'(.+) –.+', expand=False)

In [40]:
# Function to remove all secondary information in name
def extract_name(row):
    """Strip the last ' (...' suffix from a school name.

    The match is greedy, so only the final parenthesised part is removed per
    call; call again to remove an earlier one. Values that are not strings
    (e.g. NaN) and names without a ' (' are returned unchanged, so the function
    no longer raises on a '(' that is not preceded by a space.
    """
    if not isinstance(row, str):
        return row
    matches = re.findall(r'(.+) \(', row)
    return matches[0] if matches else row

# Two passes: the first removes the trailing parentheses (co-ed),
# the second removes the remaining parentheses (string of "Secondary")
for _pass_number in range(2):
    ss_df['name'] = ss_df['name'].map(extract_name)

In [41]:
# Rectify name to OneMap standard address naming
# Both replacements are plain-text substitutions. regex=False makes that
# explicit, because the default of `regex` differs between pandas versions and
# an escaped pattern such as r'St\.' never matches when treated literally.
ss_df['name'] = (
    ss_df['name']
    .str.replace('’', "'", regex=False)
    .str.replace('St.', 'Saint', regex=False)
)

In [42]:
# Rectify name to remove the programme name
# regex=False pins literal matching on every pandas version; as a regex,
# 'Govt.' would also match "Govt" followed by any other character.
# NOTE(review): these are substring replacements, so a school name that
# happens to contain 'IP', 'SAP' or 'IB' would be altered too — confirm none does.
for programme in ('IP', 'SAP', 'O-levels', 'IB'):
    ss_df['name'] = ss_df['name'].str.replace(programme, '', regex=False)
ss_df['name'] = ss_df['name'].str.replace('Govt.', 'Government', regex=False)
# Drop any leftover dash and the whitespace left behind by the removals
ss_df['name'] = ss_df['name'].str.replace('–', '', regex=False).str.strip()

In [43]:
# Look up the coordinates of each secondary school for the distance calculation
ss_latitude, ss_longitude = coordinate_retrieval(ss_df, "name")
ss_df['latitude'] = ss_latitude
ss_df['longitude'] = ss_longitude

Waiting... (Yuying Secondary School)... 0 entries remaining... .... ..... .ning... .

In [44]:
# Show the rows whose lookup returned no coordinates
ss_df.loc[ss_df['latitude'].isna()]

Unnamed: 0,name,cutoff_point,affiliation,latitude,longitude
128,East View Secondary School,188,0,,
146,Shuqun Secondary School,188,0,,


In [45]:
# Keep only schools with coordinates; the rows without a result are
# treated as closed-down schools
ss_df = ss_df[ss_df['latitude'].notna()]

In [46]:
# Export final dataframe to csv: rows are renumbered first and the index
# itself is left out of the file
ss_df.reset_index(drop=True).to_csv('./Dataset/Engineered/Secondary_School.csv', index=False)

### 2.3 Shopping Mall

In [47]:
# Making request to wikipedia page
url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore"
res = requests.get(url)
# Fail fast on an HTTP error instead of scraping an error page
res.raise_for_status()
soup = BeautifulSoup(res.content,'lxml')

In [48]:
# Collect the list items of seven <ul> lists on the wikipedia page
# (positions 1 to 7 among all <ul> elements)
# NOTE(review): the lists are picked by position, so a change in the page
# layout would silently pull in the wrong lists
all_lists = soup.find_all('ul')  # scan the document once, not once per loop pass
mall_df = []
for idx in range(1, 8):
    mall_df += [item.text for item in all_lists[idx].find_all('li')]

In [49]:
# Trim each mall name at the last occurrence of '[', '(', ',' and '@'
# (in that order), keeping the stripped text in front of the symbol
trim_rules = (
    ('[', r'(.+)\['),
    ('(', r'(.+)\('),
    (',', r'(.+),'),
    ('@', r'(.+)@'),
)
for symbol, pattern in trim_rules:
    mall_df = [
        re.findall(pattern, mall)[0].strip() if symbol in mall else mall
        for mall in mall_df
    ]

In [50]:
# Turn the list of mall names into a one-column dataframe
mall_df = pd.DataFrame({'name': mall_df})

In [51]:
# Look up the coordinates of each mall for the distance calculation
mall_latitude, mall_longitude = coordinate_retrieval(mall_df, "name")
mall_df['latitude'] = mall_latitude
mall_df['longitude'] = mall_longitude

Waiting... (Queensway Shopping Centre)... 0 entries remaining... .. ... .... .

In [52]:
# Show the malls whose lookup returned no coordinates
# (their scraped name differs from the actual name)
mall_df.loc[mall_df['latitude'].isna()]

Unnamed: 0,name,latitude,longitude
0,PointyLips Point,,
10,Clarke Quay Central,,
13,City Gate Mall,,
20,Holland Village Shopping Mall,,
34,Mustafa Shopping Centre,,
45,PoMo,,
49,Shaw House and Centre,,
101,Jubilee Square,,
143,Yew Tee Shopping Centre,,
146,KKH The Retail Mall,,


In [53]:
# Discard the malls for which no coordinates were found
mall_df = mall_df.dropna()

In [54]:
# Export data to CSV: rows are renumbered first and the index itself is
# left out of the file
mall_df.reset_index(drop=True).to_csv('./Dataset/Engineered/Mall.csv', index=False)