# Download historical weather data:
## Inspect the webpage:  
http://www.weather.gov.sg/climate-historical-daily/

Import the following modules:-
 * BeautifulSoup  
 * urllib.request  
 * webdriver from selenium  
 * re  

In [1]:
from bs4 import BeautifulSoup as bs
import urllib.request as r
from selenium import webdriver
import re

Set url:

In [2]:
url = "http://www.weather.gov.sg/climate-historical-daily/"

Set webdriver:

In [3]:
driver = webdriver.PhantomJS(executable_path='../../WebDriver/phantomjs')

Retrieve page:

In [4]:
driver.get(url)

Gather BeautifulSoup object:

In [8]:
bsObj = bs(html, 'html5lib')

In [9]:
html = driver.page_source
bsObj = bs(html,"lxml")

Find all links on page:

In [10]:
bsObj.find_all('a')

[<a href="http://www.weather.gov.sg/home"><img alt="MSS" height="71" src="http://www.weather.gov.sg/wp-content/themes/wiptheme/assets/img/mss-logo.png" width="254"/></a>,
 <a href="http://www.gov.sg"><img alt="Singapore Government" height="30" src="http://www.weather.gov.sg/wp-content/themes/wiptheme/assets/img/sg-gov-logo.jpg" width="220"/></a>,
 <a href="http://www.weather.gov.sg/about-contact-us/">Contact Us</a>,
 <a class="dropdown-toggle" data-toggle="dropdown" href="#">Weather</a>,
 <a href="http://www.weather.gov.sg/weather-forecast-2hrnowcast/">2-hr Nowcast</a>,
 <a href="http://www.weather.gov.sg/weather-forecast-24hrforecast/">24-hr Forecast</a>,
 <a href="http://www.weather.gov.sg/weather-forecast-4dayoutlook/">4-Day Outlook</a>,
 <a href="http://www.weather.gov.sg/weather-fortnightly-outlook/">Fortnightly Outlook</a>,
 <a href="http://www.weather.gov.sg/weather-forecast-monsoon-update/">Monsoon Update</a>,
 <a href="http://www.weather.gov.sg/weather-world-forecast">World Fo

## Download a csv dataset
Set opener:

In [11]:
opener = r.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]

Create output directory:

In [12]:
import os
out_path = "../../data/1_raw/SG_weather/"
if not os.path.exists(out_path):
    os.makedirs(out_path)

Retrieve link from BeautifulSoup object:

In [13]:
link = bsObj.find('a', text="CSV")['href']
link

'http://www.weather.gov.sg/files/dailydata/DAILYDATA_S24_201711.csv'

Get filename:

In [14]:
from urllib.parse import urlsplit
urlsplit(link)

SplitResult(scheme='http', netloc='www.weather.gov.sg', path='/files/dailydata/DAILYDATA_S24_201711.csv', query='', fragment='')

In [15]:
filename = os.path.basename(urlsplit(link).path)
filename

'DAILYDATA_S24_201711.csv'

Download the file from and save it:

In [16]:
with opener.open(link) as response, open(out_path+filename, 'wb') as out_file:
    data = response.read() # a `bytes` object
    out_file.write(data)

## Retrieve all historical data
Examining the website, we will find the best way to retrieve all data:
 * loop through all the weather stations
 * then loop through the years
 * followed by the months
 * finally retrieve the CSV file links for that month of year of the station

Get list of cityname:

In [17]:
list_cityname = bsObj.find('button',id="cityname").find_next_sibling()

Loop to retrieve weather stations:

In [18]:
citynames = []
for item in list_cityname.children:
    if not item.isspace:
        cityname = item.find('a').text
        citynames.append(cityname)

In [19]:
citynames

['Admiralty',
 'Admiralty West',
 'Ang Mo Kio',
 'Boon Lay (East)',
 'Boon Lay (West)',
 'Botanic Garden',
 'Buangkok',
 'Bukit Panjang',
 'Bukit Timah',
 'Buona Vista',
 'Chai Chee',
 'Changi',
 'Choa Chu Kang (Central)',
 'Choa Chu Kang (South)',
 'Choa Chu Kang (West)',
 'Clementi',
 'Dhoby Ghaut',
 'East Coast Parkway',
 'Jurong (East)',
 'Jurong (North)',
 'Jurong (West)',
 'Jurong Island',
 'Jurong Pier',
 'Kampong Bahru',
 'Kent Ridge',
 'Khatib',
 'Kranji Reservoir',
 'Lim Chu Kang',
 'Lower Peirce Reservoir',
 'Macritchie Reservoir',
 'Mandai',
 'Marina Barrage',
 'Marine Parade',
 'Newton',
 'Nicoll Highway',
 'Pasir Panjang',
 'Pasir Ris (Central)',
 'Pasir Ris (West)',
 'Paya Lebar',
 'Pulau Ubin',
 'Punggol',
 'Queenstown',
 'Seletar',
 'Semakau Island',
 'Sembawang',
 'Sentosa Island',
 'Serangoon',
 'Serangoon North',
 'Simei',
 'Somerset (Road)',
 'Tai Seng',
 'Tanjong Katong',
 'Tanjong Pagar',
 'Tengah',
 'Toa Payoh',
 'Tuas',
 'Tuas South',
 'Tuas West',
 'Ulu Pandan

Find button ids for month and year:

In [20]:
# similarly set button id for month and year
id_month = 'month'
id_year = 'year'

import time module to let us wait to allow for page to load.

In [21]:
import time

Set driver:

In [22]:
driver = webdriver.PhantomJS(executable_path='../../WebDriver/phantomjs')
driver.get(url)

Set function to return html page after each click:

In [23]:
def get_html(id_str, link_text):
    driver.find_element_by_id(id_str).click()
    driver.find_element_by_link_text(link_text).click()
    time.sleep(1)
    if id_str==id_month :
        driver.find_element_by_id('display').click()
        time.sleep(2)
    return driver.page_source

Create loop to retrieve links of all historical weather data:

In [24]:
valid_months = ['December',
               'November',
               'October',
               'September',
               'August',
               'July',
               'June',
               'May',
               'April',
               'March',
               'February',
               'January']

In [26]:
file_links = []
for cityname in citynames:
    html_city = get_html('cityname', cityname)
    bs_city = bs(html_city,"lxml")
    list_year = bs_city.find('button',id=id_year).find_next_sibling()
    
    years = []
    for item in list_year.children:
        if not item.isspace:
            year = item.find('a').text
            years.append(year)
    
    for year in years:
        html_year = get_html(id_year, year)
        bs_year = bs(html_year,"lxml")
        list_month = bs_year.find('button',id=id_month).find_next_sibling()
        for item in list_month.children:
            if not item.isspace:
                month = item.find('a').text
                if month in valid_months:
                    html_month = get_html(id_month, month)
                    bs_month = bs(html_month,"lxml")
                    link = bs_month.find('a', text="CSV")['href']
                    if link not in file_links:
                        file_links.append(link)
                        print(link)
    

http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201711.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201710.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201708.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201707.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201705.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201704.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201703.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201702.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201612.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201611.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201609.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201608.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201607.csv
http://www.weather.gov.sg/files/dailydata/DAILYDATA_S104_201606.csv
http://www.weather.gov.sg/files/dailydata/DAILYD

TypeError: 'NoneType' object is not subscriptable

Using links to retrieve all csv files:

In [None]:
for link in file_links:
    filename = os.path.basename(urlsplit(link).path)
    with opener.open(link) as response, open(out_path+filename, 'wb') as out_file:
        data = response.read() # a `bytes` object
        out_file.write(data)