In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

- [x] Download suburb with geographic data from https://www.corra.com.au/australian-postcode-location-data/
- [x] Scrape suburb names that have a train station from https://en.wikipedia.org/wiki/List_of_Sydney_Trains_railway_stations
- [ ] Match suburb names and geographic data: Suburb Name - Latitude - Longitude

## Scrape suburbs with a train station

Sydney consists of over 600 suburbs and 33 local government areas. Hence, this report only concerns suburbs that have a train station.

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Sydney_Trains_railway_stations")
suburb_html = BeautifulSoup(r.text)

To visualise the html file, we can use the code `print(suburb.html.prettify())`.

Here is an example of the element we would like to access:

`<b> <a href="/wiki/Allawah_railway_station" title="Allawah railway station"> Allawah </a> </b>`

In [3]:
suburb_list = []

# Extract the name of each station
for b in suburb_html.find_all("b"):
    # Ignore any None results
    if b.a is None:
        continue
    # Ignore results that are not station names
    if "_railway_station" not in b.a.attrs['href']:
        continue
    suburb_list.append(b.a.text)

In [4]:
# Convert the list into a dataframe
suburb_station = pd.DataFrame(suburb_list, columns={'suburb'})

# Replace East Richmond as Richmond as the former is a partial area of the latter
suburb_station['suburb'] = suburb_station['suburb'].str.replace('East Richmond', 'Richmond')

In [5]:
suburb_station

Unnamed: 0,suburb
0,Allawah
1,Arncliffe
2,Artarmon
3,Ashfield
4,Asquith
...,...
165,Wollstonecraft
166,Woolooware
167,Wynyard
168,Yagoona


## Extract geographic data

In [6]:
# Read geographica data from csv file
geo = pd.read_csv('data/Australian_Post_Codes_Lat_Lon.csv')

# Convert suburb names from UPPERCASE to Title Case to match the suburb list
geo['suburb'] = geo['suburb'].str.title()

# Strip extra characters in Suburb, State, and Type columns
for c in geo[['suburb', 'state', 'type']].columns:
    geo[c] = geo[c].str.strip()
    
# Replace 'Mount Kuring-Gai' with 'Mount Kuring-gai' for consistency
geo['suburb'] = geo['suburb'].str.replace('Mount Kuring-Gai', 'Mount Kuring-gai')

geo

Unnamed: 0,postcode,suburb,state,dc,type,lat,lon
0,200,Australian National University,ACT,AUSTRALIAN NATIONAL UNI LPO,Post Office Boxes,-35.277272,149.117136
1,221,Barton,ACT,,LVR,-35.201372,149.095065
2,800,Darwin,NT,DARWIN DELIVERY CENTRE,Delivery Area,-12.801028,130.955789
3,801,Darwin,NT,DARWIN DELIVERY CENTRE,Post Office Boxes,-12.801028,130.955789
4,804,Parap,NT,PARAP,Post Office Boxes,-12.432181,130.843310
...,...,...,...,...,...,...,...
16075,9010,Brisbane,QLD,CITY DC - BRISBANE,LVR,-27.603479,152.823141
16076,9013,Brisbane,QLD,CITY DC - BRISBANE,LVR,-27.603479,152.823141
16077,9015,Brisbane,QLD,CITY DC - BRISBANE,LVR,-27.603479,152.823141
16078,9020,Brisbane,QLD,CITY DC - BRISBANE,LVR,-27.603479,152.823141


In [7]:
# Merge Suburb Dataframe and geographic dataframe 
suburb_geo = suburb_station.merge(geo, on='suburb', how='left')

# Only include suburbs within New South Wales state (Sydney)
suburb_geo = suburb_geo[suburb_geo['state'] == 'NSW']

suburb_geo

Unnamed: 0,suburb,postcode,state,dc,type,lat,lon
0,Allawah,2218.0,NSW,ROCKDALE DC,Delivery Area,-33.970018,151.114517
1,Arncliffe,2205.0,NSW,ROCKDALE DC,Delivery Area,-33.936592,151.146805
2,Artarmon,1570.0,NSW,ARTARMON POST SHOP,Post Office Boxes,-33.808087,151.192733
3,Artarmon,2064.0,NSW,ST LEONARDS DF,Delivery Area,-33.807664,151.189662
4,Ashfield,1800.0,NSW,ASHFIELD POST SHOP,Post Office Boxes,-34.096505,150.778939
...,...,...,...,...,...,...,...
293,Wolli Creek,2205.0,NSW,ROCKDALE DC,Delivery Area,-33.930744,151.155272
294,Wollstonecraft,2065.0,NSW,ST LEONARDS DF,Delivery Area,-33.828158,151.196621
295,Woolooware,2230.0,NSW,TAREN POINT DC,Delivery Area,-34.048276,151.141431
297,Yagoona,2199.0,NSW,LEIGHTONFIELD DF,Delivery Area,-33.907725,151.026108


In [8]:
# Remove any duplicates keeping only the first occurence
suburb_geo.drop_duplicates(subset='suburb', keep='first', inplace=True)
suburb_geo

Unnamed: 0,suburb,postcode,state,dc,type,lat,lon
0,Allawah,2218.0,NSW,ROCKDALE DC,Delivery Area,-33.970018,151.114517
1,Arncliffe,2205.0,NSW,ROCKDALE DC,Delivery Area,-33.936592,151.146805
2,Artarmon,1570.0,NSW,ARTARMON POST SHOP,Post Office Boxes,-33.808087,151.192733
4,Ashfield,1800.0,NSW,ASHFIELD POST SHOP,Post Office Boxes,-34.096505,150.778939
8,Asquith,2077.0,NSW,THORNLEIGH DC,Delivery Area,-33.687484,151.108685
...,...,...,...,...,...,...,...
293,Wolli Creek,2205.0,NSW,ROCKDALE DC,Delivery Area,-33.930744,151.155272
294,Wollstonecraft,2065.0,NSW,ST LEONARDS DF,Delivery Area,-33.828158,151.196621
295,Woolooware,2230.0,NSW,TAREN POINT DC,Delivery Area,-34.048276,151.141431
297,Yagoona,2199.0,NSW,LEIGHTONFIELD DF,Delivery Area,-33.907725,151.026108


In [9]:
# There are 170 stations but only 152 suburb names after matching with geographic data.
# Let's find out what are not included.
set(suburb_station['suburb']) - set(suburb_geo['suburb'])

{'Central',
 'Circular Quay',
 'Domestic Airport',
 'Edmondson Park',
 'Flemington',
 'Green Square',
 'International Airport',
 'Leightonfield',
 'Macarthur',
 'Macdonaldtown',
 'Martin Place',
 'Museum',
 'Olympic Park',
 'St James',
 'Town Hall',
 'Vineyard',
 'Wynyard'}

The excluded stations belong to four groups:

| Group 	|                  Description                 	|                                                  Station                                                 	|
|:-----:	|:--------------------------------------------:	|:--------------------------------------------------------------------------------------------------------:	|
|   1   	| Special landmarks                            	| Circular Quay<br>Domestic Airport<br>International Airport<br>Olympic Park <br>Martin Place<br>Town Hall 	|
|   2   	| Localities that exist <br>within a suburb    	| Flemington (Homebush)<br>Leightonfield (Villawood)<br>St James, Wynyard, Museum (CBD)                    	|
|   3   	| Regions that consists of <br>several suburbs 	| Green Square<br>Macarthur<br>Macdonaldtown                                                               	|
|   4   	| Other                                        	| Central (CBD)<br>Edmondson Park<br>Vineyard                                                              	|

Therefore, for simplicity, we shall proceed with 152 suburbs.

## Finalise the suburb dataset

In [10]:
suburb_df = suburb_geo.drop(columns={'postcode', 'state', 'dc', 'type'})
suburb_df

Unnamed: 0,suburb,lat,lon
0,Allawah,-33.970018,151.114517
1,Arncliffe,-33.936592,151.146805
2,Artarmon,-33.808087,151.192733
3,Ashfield,-34.096505,150.778939
4,Asquith,-33.687484,151.108685
...,...,...,...
147,Wolli Creek,-33.930744,151.155272
148,Wollstonecraft,-33.828158,151.196621
149,Woolooware,-34.048276,151.141431
150,Yagoona,-33.907725,151.026108


In [12]:
suburb_df.to_csv('data/sydney_suburbs.csv', index=False)