## Goal 1 (Optional): Scrape actual and forecasted wind generation from 2024-03-16 onwards

This notebook scrapes the website and downloads the CSV file. 
I used Selenium (https://www.selenium.dev/), an open-source project for browser automation. 
The package can be installed with pip; see https://pypi.org/project/selenium/.

In [6]:
import zipfile
import os, time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [7]:
# Directory the browser downloads into. Override with the CAISO_DOWNLOAD_DIR
# environment variable; otherwise the original default is used.
# Made absolute because it is handed to Chrome as its download directory.
download_dir = os.path.abspath(
    os.environ.get("CAISO_DOWNLOAD_DIR", "/Users/lok/5DE/data")
)
os.makedirs(download_dir, exist_ok=True)

# the website is unsecured so we had to use some chrome options to bypass everything, including the pop up to download
options = webdriver.ChromeOptions()

# Chrome preferences. The boolean settings are real booleans here (the strings
# "true"/"false" are not boolean values). Entries that were not preferences at
# all ("disable-popup-blocking", which is a command-line switch, and the
# placeholder "default_content_settings"/"download" strings) are dropped;
# "download" in particular shared a key prefix with the "download.*" settings.
prefs = {
    "download.default_directory": download_dir,
    "download.directory_upgrade": True,
    "download.prompt_for_download": False,
    "safebrowsing.enabled": False,
}
options.add_experimental_option("prefs", prefs)

# Every command-line switch is registered BEFORE any driver is created, so each
# browser started from `options` sees all of them.
options.add_argument("--headless")
options.add_argument("--disable-notifications")
options.add_argument("--disable-popup-blocking")
options.add_argument('--disable-gpu')
options.add_argument('--disable-software-rasterizer')
options.add_argument('--safebrowsing-disable-download-protection')

# NOTE: no browser is started in this cell. The scraping cell creates (and
# quits) its own driver; starting one here left an orphaned Chrome process
# once `driver` was rebound.

The following portion of the code can take a few minutes to run. 
The website connection is unstable at times, so I added a buffer between steps to ensure each page has finished loading before continuing.

In [8]:
# Settings for this scrape.
START_DATE = '03/16/2024'   # typed into the "date from" field (MM/DD/YYYY)
DOWNLOAD_TIMEOUT_S = 120    # give up if no new zip has landed after this many seconds

# Remember which zips already exist so the fresh download can be told apart
# from archives left over from earlier runs.
zips_before = {f for f in os.listdir(download_dir) if f.endswith('.zip')}

# Initialize selenium webdriver with the Chrome options
driver = webdriver.Chrome(options=options)
try:
    # Navigate to the website
    driver.get('http://oasis.caiso.com/mrioasis/logon.do')

    # Hover over the SYSTEM DEMAND menu
    action = ActionChains(driver)
    system_demand_menu = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.ID, 'LowerMenuItem.oasisMenu.SysDemand')))
    action.move_to_element(system_demand_menu).perform()

    # wind/solar entry of the dropdown
    wind_solar_item = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'DropdownMenu.oasisMenu.SysDemand.windSolar')))
    wind_solar_item.click()
    time.sleep(2)

    # date from input
    date_from_div = WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.ID, 'PFC_date_from')))
    date_from_input = date_from_div.find_element(By.TAG_NAME, 'input')
    date_from_input.clear()
    date_from_input.send_keys(START_DATE, Keys.ENTER)
    time.sleep(2)

    # calendar button of the "date to" field
    calendar_icon = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'PFC_date_to_CAL_ICON')))
    calendar_icon.click()
    time.sleep(5)

    # today button (the last one in the document is taken)
    today_button_to = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "(//button[@title='Today'])[last()]")))
    today_button_to.click()
    time.sleep(2)

    # download button
    download_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Download CSV')]")))
    download_button.click()

    # Wait for the download to actually complete instead of sleeping a fixed
    # time: a new .zip must exist and Chrome's in-progress marker files
    # (*.crdownload) must be gone.
    deadline = time.monotonic() + DOWNLOAD_TIMEOUT_S
    while time.monotonic() < deadline:
        names = os.listdir(download_dir)
        new_zips = {f for f in names if f.endswith('.zip')} - zips_before
        still_downloading = any(f.endswith('.crdownload') for f in names)
        if new_zips and not still_downloading:
            break
        time.sleep(1)
    else:
        # Loop ran out without `break`: do not claim success, otherwise the
        # unzip step would silently pick up a stale archive.
        raise TimeoutError(
            f"No new zip file appeared in '{download_dir}' within {DOWNLOAD_TIMEOUT_S} seconds.")

    print("Download process finished. The zip file is in the data folder. Please Unzip the file with the function below.")
finally:
    # Close the browser even when a wait above timed out, so no Chrome
    # process is left behind.
    driver.quit()

Download process finished. The zip file is in the data folder. Please Unzip the file with the function below.


## Wait for the above to finish first before proceeding.

In [9]:
def find_most_recent_zip(folder_path):
    """Return the name of the most recently modified ``.zip`` file in a folder.

    Only regular files directly inside ``folder_path`` whose name ends with
    ``.zip`` are considered.

    Args:
        folder_path: Directory to look in.

    Returns:
        The bare file name (not the full path) of the newest zip by
        modification time, or ``None`` when the folder holds no zip file.
    """
    candidates = []
    for entry in os.listdir(folder_path):
        full_path = os.path.join(folder_path, entry)
        # Skip directories and anything that is not a .zip archive.
        if os.path.isfile(full_path) and entry.endswith('.zip'):
            candidates.append(entry)

    if not candidates:
        return None

    # Newest modification time wins; on a tie the earliest-listed entry is kept.
    return max(
        candidates,
        key=lambda name: os.path.getmtime(os.path.join(folder_path, name)),
    )

def unzip_most_recent_file(folder_path, extract_to):
    """Extract the newest zip in ``folder_path`` and rename its CSV.

    The archive is chosen with ``find_most_recent_zip``. All of its members
    are extracted into ``extract_to``; the first member whose name ends with
    ``.csv`` is then moved to ``extract_to/most_recent_data.csv``, replacing
    any file of that name left over from a previous run.

    Failures are reported with ``print`` rather than raised, so the calling
    cell keeps running.

    Args:
        folder_path: Directory containing the downloaded zip file(s).
        extract_to: Directory to extract into.

    Returns:
        Path of the renamed CSV on success, otherwise ``None`` (no zip found,
        no CSV inside the zip, or an error occurred).
    """
    try:
        # Find the most recent zip file
        recent_zip = find_most_recent_zip(folder_path)

        if recent_zip is None:
            print("No ZIP files found in the directory.")
            return None

        # Full path of the zip file
        zip_path = os.path.join(folder_path, recent_zip)

        # Unzipping the file
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            extracted_files = zip_ref.namelist()
            zip_ref.extractall(extract_to)

        # Find the CSV file in the extracted files
        csv_file = next((f for f in extracted_files if f.endswith('.csv')), None)
        if csv_file is None:
            print("No CSV file found in the ZIP.")
            return None

        # Rename the CSV file to 'most_recent_data.csv'.
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename raises on Windows when the target exists -- which it does
        # on every re-run of this notebook.
        target_path = os.path.join(extract_to, 'most_recent_data.csv')
        os.replace(os.path.join(extract_to, csv_file), target_path)
        print(f"Unzipped '{recent_zip}' to '{extract_to}'")
        return target_path
    except Exception as e:
        # Deliberately best-effort: report the problem instead of raising.
        print(f"Error occurred: {e}")
        return None

# Extract the newest archive into the same folder it was downloaded to.
unzip_most_recent_file(folder_path=download_dir, extract_to=download_dir)
print('Default file name: most_recent_data.csv')

Unzipped '20240316_20240409_SLD_REN_FCST_N_20240408_16_06_44_v1.zip' to '/Users/lok/5DE/data'
Default file name: most_recent_data.csv
