In [36]:
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandarallel import pandarallel

In [37]:
# Set up pandarallel once so that DataFrame.parallel_apply can be used later in
# the notebook; progress_bar=True asks it to render a progress widget while the
# parallel work runs.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [38]:
# Download the Wikipedia index page that links to the individual articles.
# A timeout is passed because requests otherwise waits indefinitely on a
# stalled connection.
sydney_suburbs_web = requests.get(
    "https://en.wikipedia.org/wiki/List_of_Sydney_suburbs",
    timeout=30,
)
# Stop here on an HTTP error instead of silently parsing an error page and
# producing an empty or misleading link table further down.
sydney_suburbs_web.raise_for_status()

In [39]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the parsed tree could differ
# from one environment to another.
sydney_suburbs_html = BeautifulSoup(sydney_suburbs_web.text, "html.parser")

In [40]:
# Substrings that disqualify an href: non-article namespaces, list pages and
# the donation link.
excluded_href_parts = (
    "/wiki/Template",
    "/wiki/Category",
    "/wiki/List_of_",
    "/wiki/Wikipedia",
    "/wiki/Help",
    "/wiki/Portal",
    "donate",
)

# Collect the <a> tags that look like links to ordinary articles.
suburb_links = []
for a in sydney_suburbs_html.find_all("a"):
    if "href" not in a.attrs:
        continue
    link_href = a.attrs["href"]

    # Keep anchors with no CSS class, or whose first class is "mw-redirect".
    # An empty class list (class="") is treated like a missing class; indexing
    # it directly would raise IndexError.
    css_classes = a.attrs.get("class")
    if css_classes and css_classes[0] != "mw-redirect":
        continue

    # Only site-internal article paths are of interest.
    if "/wiki/" not in link_href:
        continue
    if any(part in link_href for part in excluded_href_parts):
        continue

    # Anchors carrying an accesskey are skipped (presumably site navigation
    # shortcuts -- confirm), as are anchors without a title attribute.
    if "accesskey" in a.attrs or "title" not in a.attrs:
        continue

    suburb_links.append(a)

In [41]:
# Split the kept anchors into two parallel lists: the link target and the
# visible link text.
href_list = []
title_list = []
for link in suburb_links:
    href_list.append(link.attrs["href"])
    title_list.append(link.text)

In [42]:
# One row per kept link: "href" is the link target, "title" the link text.
df = pd.DataFrame(dict(href=href_list, title=title_list))

In [43]:
def search_p_coordinate(tag):
    """Tell whether a tag's first CSS class is ``latitude`` or ``longitude``.

    Intended as a predicate for ``BeautifulSoup.find_all``.

    Parameters
    ----------
    tag
        An object exposing ``has_attr(name)`` and an ``attrs`` mapping, as
        BeautifulSoup tags do.

    Returns
    -------
    bool
        True when the tag has a non-empty ``class`` attribute whose first
        entry is ``"latitude"`` or ``"longitude"``; False otherwise.
    """
    if not tag.has_attr("class"):
        return False
    classes = tag.attrs["class"]
    # An empty class list (class="") has no first entry; treat it as a
    # non-match instead of raising IndexError.
    if not classes:
        return False
    return classes[0] in ("latitude", "longitude")

In [44]:
def search_postcode(tag):
    """Tell whether a tag links to the 'Postcodes in Australia' article.

    Returns True only when the tag has an ``href`` attribute equal to
    ``/wiki/Postcodes_in_Australia``; tags without an ``href`` yield False.
    """
    if not tag.has_attr("href"):
        return False
    return tag.attrs["href"] == '/wiki/Postcodes_in_Australia'

In [45]:
def get_coordinate(row):
    """Fetch a row's Wikipedia article and attach its coordinate text.

    Parameters
    ----------
    row : pandas.Series
        One row of the link table. Only ``row['href']`` is read; it is
        appended to ``https://en.wikipedia.org`` to build the article URL.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with a ``latitude`` and/or ``longitude`` entry added
        for every tag on the page matched by ``search_p_coordinate``. The
        values are the tags' text as found on the page. If several tags share
        a class, the last one wins. Rows whose page has no such tags, or whose
        request returns an HTTP error status, come back without those entries.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails at the network level or exceeds the timeout.
    """
    # Work on a copy so the object handed in by apply is never mutated.
    row = row.copy()

    href = f"https://en.wikipedia.org{row['href']}"
    # Bound the wait: without a timeout one stalled connection would block a
    # worker indefinitely.
    res = requests.get(href, timeout=30)
    if not res.ok:
        # An error page is not expected to carry coordinates; skip parsing it.
        return row

    # Explicit parser so results do not depend on which parsers are installed.
    bs = BeautifulSoup(res.text, "html.parser")

    # NOTE: postcode extraction (via search_postcode) is currently disabled.
    for c in bs.find_all(search_p_coordinate):
        # The first CSS class ("latitude" / "longitude") doubles as the key.
        row[c.attrs["class"][0]] = c.text
    return row

In [46]:
# Run get_coordinate on every row (axis=1), spread across the pandarallel
# workers. Each call performs an HTTP request, so this is the slow step of the
# notebook.
df_with_coords = df.parallel_apply(get_coordinate, axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=96), Label(value='0 / 96'))), HBox…

In [47]:
#df_with_coords.dropna(subset=['postcode'], axis=0, inplace=True)

In [48]:
#df_fin = df_with_coords[df_with_coords['postcode'].str.isnumeric()]

In [52]:
# Drop the 'Central business district' link and rename 'title' to 'suburb'.
# Chained instead of rename(inplace=True): the boolean filter yields a derived
# frame, and mutating that in place can trigger SettingWithCopyWarning.
df_with_coords = (
    df_with_coords[df_with_coords['title'] != 'Central business district']
    .rename(columns={'title': 'suburb'})
)

In [64]:
# Keep only the first row for each suburb name. Rebinding the result instead of
# using inplace=True avoids mutating a frame that was derived from a filter.
df_with_coords = df_with_coords.drop_duplicates(subset='suburb', keep='first')

In [65]:
# Inspect the result. NOTE(review): the recorded output below still contains
# links that are not suburbs (e.g. "Landsat 7", national parks, council areas)
# and rows with no coordinates, so the link filter is not strict enough yet.
df_with_coords

Unnamed: 0,href,latitude,longitude,suburb
0,/wiki/Landsat_7,,,Landsat 7
1,/wiki/False-color,,,false-color
2,/wiki/Royal_National_Park,,,Royal National Park
3,/wiki/Ku-ring-gai_Chase_National_Park,,,Ku-ring-gai Chase National Park
4,/wiki/Blue_Mountains_National_Park,33°37′S,150°28′E,Blue Mountains National Park
...,...,...,...,...
747,/wiki/City_of_Hawkesbury,33°25′S,150°47′E,Hawkesbury
748,/wiki/The_Hills_Shire,33°46′S,151°00′E,The Hills
750,/wiki/Municipality_of_Hunter%27s_Hill,33°49′S,151°08′E,Hunter's Hill
752,/wiki/Ku-ring-gai_Council,33°45′15″S,151°09′06″E,Ku-ring-gai


In [66]:
# Write the table to disk, creating the output directory first so the cell
# also works in a fresh checkout where data/ does not exist yet.
output_path = Path('data/suburb.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_with_coords.to_csv(output_path, index=False)