# WebScrape

In this notebook, we will be webscraping the historical daily records of weather data from the [Meteorological Service Singapore](https://www.weather.gov.sg/climate-historical-daily/).

This notebook will download each of the monthly CSV files from 1980 to 2024 (Oct) and save them into a folder where they can easily be accessed and used for an exploratory data analysis.

We will use *Selenium* to execute our webscraping. This method is relatively easy as it imitates how a user would work around a webpage. As a bonus, *Selenium* will produce a pop-up of the webpage and we can see how the code executes within the webpage.

It is important to note that this page is dynamic, so some time must be allowed for the screen to load before certain code (e.g. interacting with the dropdown menus) can be executed.

In [1]:
# import libraries
import time
import requests
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
### CAN CONSIDER TO DELETE ###
#
# Exploratory cell: opens the page, selects the "Changi" location, lists the
# entries of the month and year dropdowns, then tries to click through every
# year/month combination and press the "display" button for each.
# Nothing is downloaded here.
#
# NOTE(review): the element lists used by the nested loops at the bottom are
# located once, before the loops start, and are reused after many clicks.
# The notebook text below this cell reports repeated "stale" errors from
# running it as one long script -- presumably these reused references are the
# cause; confirm before relying on this cell.

# Target URL
url = "https://www.weather.gov.sg/climate-historical-daily/"

# Set up the Selenium WebDriver
driver = webdriver.Chrome()

try:
    # Open the webpage
    driver.get(url)
    # Shared explicit wait: each wait.until(...) below polls for up to 20 seconds
    wait = WebDriverWait(driver, 20)

    ## LOCATIONS
    # First element on the page with class "btn-group"; used as the location dropdown
    locations = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "btn-group")))

    # print(locations.get_attribute('outerHTML')) # debug

    # click on location button  
    locations.click()

    # locations hidden in the dropdown menu
    loc_dropdown_items = locations.find_elements(By.TAG_NAME, "li")

    # append location into a list
    # (location_list is only used by the commented-out debug prints below)
    location_list = []

    for loc in loc_dropdown_items:
        location_list.append(loc.text)
        # print(loc.text) # debug
    
    # print("Available Locations")
    # print(location_list) # debug

    # # Click drop down button
    # location_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "btn-group")))
    # location_button.click()

    # Select Changi from dropdown options
    # (matches an <a class="dropdown-item"> whose text contains "Changi")
    changi_option = wait.until(EC.element_to_be_clickable((By.XPATH, './/a[@class = "dropdown-item" and contains(text(), "Changi")]')))
    changi_option.click()

    
    
    ## MONTHS

    # access the webelement
    months = wait.until(EC.presence_of_element_located((By.ID, "monthDiv")))

    # find the class name that contains the list of months
    # (months_div is never read afterwards)
    months_div = months.find_elements(By.CLASS_NAME, "dropdown-menu")

    # Find the button on the page that states month
    months_button = months.find_element(By.TAG_NAME, 'button')

    # click on the button to show the list
    months_button.click() 

    # the list is now shown
    dropdown_menu = months.find_element(By.CLASS_NAME, "dropdown-menu")  # Target only one class

    # Find all <a> elements within the dropdown menu
    # (dropdown_items is only used by the commented-out block below)
    dropdown_items = dropdown_menu.find_elements(By.TAG_NAME, "a")

    # # Keep the list of available months
    # month_list = []
    # print('Available Months:')
    # for item in dropdown_items:
    #     month_list.append(item.text)
    #     # print(item.text) # debug
    # print(month_list) # debug


    ### Years ###
    years = wait.until(EC.presence_of_element_located((By.ID, "yearDiv")))

    # find the class name that contains the years recorded data
    # (years_div is never read afterwards)
    years_div = years.find_elements(By.CLASS_NAME, "dropdown-menu")
    
    # Find the button on the page that states years
    years_button = years.find_element(By.TAG_NAME, 'button')

    # click on the button to show the list
    years_button.click()

    # the list is now shown
    # (years_dropdown_menu is never read afterwards)
    years_dropdown_menu = years.find_elements(By.CLASS_NAME, "dropdown-menu")

    # Find all <a> elements under the year container; these references are
    # reused by the year loop further down
    years_dropdown_items = years.find_elements(By.TAG_NAME, "a")

    
    #####################



    ### Months ###
    # Second pass over the month dropdown: re-locates the same elements as the
    # "MONTHS" section above and clicks the month button a second time.

    # months clickable box
    months = wait.until(EC.presence_of_element_located((By.ID, "monthDiv")))

    # find the dropdown menu for months
    # (months_dropdown_menu is never read afterwards)
    months_dropdown_menu = months.find_elements(By.CLASS_NAME, "dropdown-menu")

    # the button for months
    months_button = months.find_element(By.TAG_NAME, "button")

    # click on the button
    months_button.click()

    # all <a> elements under the month container; reused by the month loop below
    months_dropdown_items = months.find_elements(By.TAG_NAME, "a")

    months_list = []

    print("Available Months")
    for month in months_dropdown_items:
        months_list.append(month.text)
    
    print(f"\nThe months found in the dropbox:\n{months_list}\n")
    
    #####################
    # keep the list of years in a list
    year_list = []

    # click on years button
    years_button.click()

    print("Available Years:")
    for year in years_dropdown_items:
        year_list.append(year.text)
    
    print(f"The years found the the drop box:\n{year_list}\n")
    print('-'*75)
    
    # Click on Month Then Years
    # Outer loop walks the year entries in reverse list order ([::-1]);
    # inner loop does the same for the month entries.
    
    for year in years_dropdown_items[::-1]:
        
        
        year_text = year.text
        print(f"Clicking on year: {year_text}")
        
        year.click()
        
        
        # NOTE: this only constructs a WebDriverWait object; without a
        # following .until(...) call it does not pause the script.
        WebDriverWait(driver, 5)
        years_button.click()


        # months clickable box
        months = wait.until(EC.presence_of_element_located((By.ID, "monthDiv")))

        # the button for months
        months_button = months.find_element(By.TAG_NAME, "button")

        # click on months button
        months_button.click()

        # NOTE: no-op, same as above (no .until(...) call)
        WebDriverWait(driver, 5)

        # show the dropdown list of months
        # months_dropdown_menu

        ### Test block
        # months = wait.until(EC.presence_of_element_located((By.ID, "monthDiv")))

        # months_button = months.find_element(By.TAG_NAME, "button")

        # # click button
        # months_button.click()

        # # show list of months
        # months_dropdown_menu = months.find_elements(By.CLASS_NAME, "dropdown-menu")

        # # find all list elements within the menu
        # months_dropdown_items = months.find_elements(By.TAG_NAME, "a")
        ###
        
        # months_dropdown_items was located before the year loop and is not
        # refreshed here (the re-query above is commented out)
        for month in months_dropdown_items[::-1]:

            month_text = month.text
            print(f"\nClicking on month: {month_text}")
            month.click()

            # NOTE: no-op, same as above (no .until(...) call)
            WebDriverWait(driver, 5)

            months_button.click()

            

            # Display button
            display_button = wait.until(EC.element_to_be_clickable((By.ID, "display")))
            # bare expression: has no effect at this position in the cell
            display_button
            display_button.click()

   
finally:
    # Close the browser
    # (runs even when the code above raises, so the window never lingers)
    driver.quit()

### Custom functions to navigate the dropdown menu

Running everything as one long script repeatedly caused an error (Stale Request). Thus, we will give the **year** dropdown menu its own function, and combine the downloading of the **CSV** files with the function that handles the **month** dropdown menu.

In [12]:
# Select a year in the page's year dropdown
def get_year(input_year: str) -> bool:
    """Select ``input_year`` in the year dropdown of the open page.

    The function waits for the element with ID ``yearDiv``, clicks the button
    inside it to open the dropdown, and then clicks the first ``<a>`` entry
    whose visible text equals ``input_year``.

    It relies on the notebook-level ``wait`` object (a ``WebDriverWait`` bound
    to the active driver), so the browser must already be open on the page
    before this is called.

    Args:
        input_year: The year to select. It is passed as a string because it is
            compared directly with the text of the dropdown entries.

    Returns:
        ``True`` if a matching entry was found and clicked, ``False`` if no
        entry matched (in which case nothing is selected).
    """
    # Parent element that holds both the button and the list of years
    years = wait.until(EC.presence_of_element_located((By.ID, "yearDiv")))

    # Button that opens the list
    years_button = years.find_element(By.TAG_NAME, "button")

    # The available years are the <a> entries under the parent element
    years_dropdown_items = years.find_elements(By.TAG_NAME, "a")

    # Open the list so the entries can be read and clicked
    years_button.click()

    for item in years_dropdown_items:
        if item.text == input_year:
            item.click()
            # Stop immediately: the remaining references were located before
            # the click, and reading them after the page has reacted to the
            # selection is unnecessary and may fail.
            return True

    return False

In [26]:
# Custom function to download csv files
def download_csv(download_path: str, year: str, month: str, location: str):
    """Download the CSV linked on the currently displayed page.

    The function waits for the element with class ``download-link`` and for a
    link whose text is ``CSV``, fetches that link's ``href`` with ``requests``
    and writes the response body to
    ``<download_path>/<year>_<month>_<location>.csv``.

    ``year``, ``month`` and ``location`` are only used to build the file name;
    the data that is downloaded is whatever the page is showing when the
    function is called.

    It relies on the notebook-level ``wait`` object (a ``WebDriverWait`` bound
    to the active driver).

    Args:
        download_path: Folder to save into (relative or absolute). It is
            created if it does not exist yet.
        year: Year, as a string, used in the file name.
        month: Month, as a string, used in the file name.
        location: Station name used in the file name.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as a ``.csv`` file.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    # Wait until the download links are present before looking for the CSV link
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "download-link")))

    # Link element labelled "CSV"; its href is the direct file URL
    csv_link = wait.until(EC.presence_of_element_located((By.LINK_TEXT, "CSV")))
    csv_url = csv_link.get_attribute("href")

    # Fetch the file directly; fail loudly instead of hanging or saving an
    # error response as if it were data
    response = requests.get(csv_url, timeout=30)
    response.raise_for_status()

    # Make sure the target folder exists ("" means the current directory)
    if download_path:
        os.makedirs(download_path, exist_ok=True)

    file_name = f"{year}_{month}_{location}.csv"
    with open(os.path.join(download_path, file_name), "wb") as f:
        f.write(response.content)

    # show the user that the file has been saved as formatted
    print(f"file has been downloaded as {file_name}")

In [21]:
# Loop over the month dropdown and download the CSV for each month
def get_month(download_path: str, year: str, location: str,
              pause_seconds: float = 2):
    """Click through every month in the dropdown and download its CSV.

    For each ``<a>`` entry under the element with ID ``monthDiv`` (visited in
    reverse list order), the function clicks the entry, clicks the element
    with ID ``display``, calls ``download_csv`` and then clicks the month
    button again so the list is open for the next entry.

    The month part of the file name is the first three letters of the entry's
    text, lower-cased (e.g. ``jan``).

    It relies on the notebook-level ``wait`` object and on ``download_csv``.

    Args:
        download_path: Folder passed through to ``download_csv``.
        year: Year, as a string, passed through to ``download_csv`` for the
            file name. The year itself must already be selected on the page.
        location: Station name passed through to ``download_csv``.
        pause_seconds: Seconds to sleep after each click so the page has time
            to update. Defaults to 2, the value previously hard-coded.
    """
    # Parent element that holds both the button and the list of months
    months = wait.until(EC.presence_of_element_located((By.ID, "monthDiv")))

    # Button that opens the list of months
    months_button = months.find_element(By.TAG_NAME, "button")

    # The available months are the <a> entries under the parent element
    months_dropdown_items = months.find_elements(By.TAG_NAME, "a")

    # Button that makes the page show the selected parameters
    display_button = wait.until(EC.presence_of_element_located((By.ID, "display")))

    # Open the list so the entries can be read and clicked
    months_button.click()

    # Walk the entries in reverse list order
    for month in months_dropdown_items[::-1]:

        # first 3 letters of the month, lower case, for the file name
        name_of_month = month.text[:3].lower()

        # select the month, then give the page time to react
        month.click()
        time.sleep(pause_seconds)

        # show the data for the selection, then give the page time to load
        display_button.click()
        time.sleep(pause_seconds)

        # save the CSV that the page now links to
        download_csv(download_path, year, name_of_month, location)

        # reopen the list for the next iteration
        months_button.click()
        time.sleep(pause_seconds)




In [5]:
# Trial run: save a single CSV file.
# The page is opened but no dropdown is touched, so the file that is saved is
# whatever the page links to by default; `year` and `month` below are only
# used to name the saved file.

download_path = "changi_csv_files"

year = "2024"
month = "Oct"

# Target URL
url = "https://www.weather.gov.sg/climate-historical-daily/"

# Set up the Selenium WebDriver
driver = webdriver.Chrome()

try:

    driver.get(url)
    wait = WebDriverWait(driver, 20)

    # Element that wraps the download links; printed to inspect its HTML
    table_container = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "download-link")))
    print(table_container.get_attribute("outerHTML"))

    # Link labelled "CSV"; its href is the direct file URL
    csv_link = wait.until(EC.presence_of_element_located((By.LINK_TEXT, "CSV")))
    csv_url = csv_link.get_attribute("href")
    print(csv_url)

    # Fail loudly on a hung connection or an HTTP error instead of saving an
    # error response as if it were data
    response = requests.get(csv_url, timeout=30)
    response.raise_for_status()

    # Create the folder on first use so the write below cannot fail on it
    os.makedirs(download_path, exist_ok=True)

    with open(os.path.join(download_path, f"{year}_{month}_Changi.csv"), "wb") as f:
        f.write(response.content)


finally:
    # Always close the browser, even if a step above raised
    driver.quit()

<p class="download-link">Download as:<a class="myload" download="" href="https://www.weather.gov.sg/files/dailydata/DAILYDATA_S24_202410.csv">CSV</a>|<a class="myload" download="" href="https://www.weather.gov.sg/files/dailydata/DAILYDATA_S24_202410.pdf">PDF</a></p>
https://www.weather.gov.sg/files/dailydata/DAILYDATA_S24_202410.csv


# Execute WebScrape

The final cell block opens the URL with *Selenium* and calls on the needed functions to navigate the years, months, and download the csv files into a saved folder.

In [32]:
# Putting it all together: open the page once, then for every year select it
# with get_year() and let get_month() walk the months and save each CSV.

# declare save folder
download_path = "changi_csv_files"

# Target URL
url = "https://www.weather.gov.sg/climate-historical-daily/"

# declare location (only used in the names of the saved files)
location = "changi"

# first and last year to download (both inclusive)
first_year = 1980
last_year = 2024

# Create the save folder up front so a fresh run does not fail on the first
# file write after the browser has already been started
os.makedirs(download_path, exist_ok=True)

# Set up the Selenium WebDriver
driver = webdriver.Chrome() # set Chrome as browser

try:

    # open url
    driver.get(url) # webpage pops up

    # timeout of 20 seconds; get_year/get_month/download_csv read this
    # notebook-level `wait`, so the name must stay as it is
    wait = WebDriverWait(driver, 20)

    for year in range(first_year, last_year + 1):

        # the dropdown entries are compared as text, so pass a string
        year_string = str(year)

        # function call of year
        get_year(year_string)

        # stop the script to allow page to load
        time.sleep(5)

        # function call to navigate months and download csv files
        get_month(download_path=download_path,
                  year=year_string,
                  location=location)

finally:
    time.sleep(2) # freeze webpage for 2 seconds
    driver.quit() # close pop-up


Clicking on month: jan
file has been downloaded as 1980_jan_changi.csv
Clicking on month: feb
file has been downloaded as 1980_feb_changi.csv
Clicking on month: mar
file has been downloaded as 1980_mar_changi.csv
Clicking on month: apr
file has been downloaded as 1980_apr_changi.csv
Clicking on month: may
file has been downloaded as 1980_may_changi.csv
Clicking on month: jun
file has been downloaded as 1980_jun_changi.csv
Clicking on month: jul
file has been downloaded as 1980_jul_changi.csv
Clicking on month: aug
file has been downloaded as 1980_aug_changi.csv
Clicking on month: sep
file has been downloaded as 1980_sep_changi.csv
Clicking on month: oct
file has been downloaded as 1980_oct_changi.csv
Clicking on month: nov
file has been downloaded as 1980_nov_changi.csv
Clicking on month: dec
file has been downloaded as 1980_dec_changi.csv

Clicking on month: jan
file has been downloaded as 1981_jan_changi.csv
Clicking on month: feb
file has been downloaded as 1981_feb_changi.csv
Clic