In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import time
from rich.progress import track

# Define the date range for scraping (modify these dates as needed)
start_date = datetime(2019, 10, 24)
end_date = datetime(2019, 10, 31)

# Seconds to wait on the server before abandoning a request. Without a
# timeout, a single stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30

# Pause (in seconds) after every request, to be respectful to the server
REQUEST_DELAY = 0.5

# Generate a list of dates for the given range (both endpoints included)
total_days = (end_date - start_date).days + 1
date_list = [start_date + timedelta(days=i) for i in range(total_days)]

all_data = []
headers = None  # Will be set on the first successful extraction


class ReportFormatError(Exception):
    """Raised when a report page does not contain the expected table."""


def build_report_url(report_date):
    """Return the archive-report URL for ``report_date``.

    The location (ZIP 94102, San Francisco, CA) is fixed in the query
    string; only the zero-padded month, zero-padded day and the year vary.

    Args:
        report_date: A ``datetime`` (or ``date``) identifying the report day.

    Returns:
        The full request URL as a string.
    """
    return (
        "https://www.weatherforyou.com/reports/index.php?"
        "forecast=pass&pass=archivenws&zipcode=94102&pands=&place=san+francisco&"
        "state=ca&icao=&country=us&month={}&day={}&year={}&dosubmit=Go"
    ).format(report_date.strftime('%m'), report_date.strftime('%d'), report_date.year)


def parse_report(html):
    """Extract the column names and data rows from one report page.

    Args:
        html: Text of the report page.

    Returns:
        A ``(column_names, data_rows)`` tuple. ``column_names`` is the text
        of every ``th.IntWxHeader`` cell on the page. ``data_rows`` holds one
        list of cell texts for each row (after the first) of the table that
        encloses the first header cell; rows without any data cell are
        omitted.

    Raises:
        ReportFormatError: If the header cell, its parent table, or the data
            rows are missing. The message names what was not found.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Locate a header cell to identify the target nested table
    header_cell = soup.find("th", class_="IntWxHeader")
    if not header_cell:
        raise ReportFormatError("No header found")

    # Find the parent table that contains this header cell
    target_table = header_cell.find_parent("table")
    if not target_table:
        raise ReportFormatError("No parent table found")

    rows = target_table.find_all("tr")
    if len(rows) < 2:
        raise ReportFormatError("No data rows")

    column_names = [
        cell.get_text(strip=True)
        for cell in soup.find_all("th", class_="IntWxHeader")
    ]

    # Process all data rows (skip the header row, which is the first <tr>)
    data_rows = []
    for row in rows[1:]:
        cells = row.find_all("span", style=lambda s: s and "font-size: 16px;" in s)
        # A row without data cells would otherwise become a record holding
        # only the date, which then lands in the first data column.
        if cells:
            data_rows.append([cell.get_text(strip=True) for cell in cells])
    return column_names, data_rows


# The guard is true when run as a script or inside a notebook cell, so the
# names assigned below are still module-level there; it only prevents the
# network scrape from running when this file is imported.
if __name__ == "__main__":
    # Use Rich's track to display a progress bar
    for current_date in track(date_list, description="Processing Dates..."):
        date_label = current_date.strftime("%Y-%m-%d")

        try:
            response = requests.get(
                build_report_url(current_date), timeout=REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            # One unreachable day must not discard the rows already collected
            print(f"\nFailed to retrieve data for {date_label}. Error: {exc}")
            continue
        finally:
            # Pause after every request, including failed ones, so error
            # paths do not hammer the server.
            time.sleep(REQUEST_DELAY)

        if response.status_code != 200:
            print(f"\nFailed to retrieve data for {date_label}. Status code: {response.status_code}")
            continue

        try:
            column_names, data_rows = parse_report(response.text)
        except ReportFormatError as exc:
            print(f"\n{exc} for {date_label}")
            continue

        # On the first successful extraction, keep the headers and add an
        # extra "Date" column
        if headers is None:
            headers = column_names + ["Date"]

        # Append the current date to each row
        all_data.extend(row + [date_label] for row in data_rows)

    # Create a DataFrame from the collected data with the proper headers
    df = pd.DataFrame(all_data, columns=headers)

    print("Final DataFrame:")
    print(df.head())

    # Uncomment the line below to export your data locally
    # df.to_csv(r'pathname')