In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import re

In [3]:
def parse_planet_info(text):
    """Extract basic planet details from the text of a schedule cell.

    Each field is looked up independently with a regular expression; a field
    whose pattern does not match is left out of the result.

    Args:
        text: Cell text such as
            'HATS-6b (LOW) Min. aperture: 11.99" Total Observations (Recent): 36 (3) O-C: 3.3 ± 0.33 minutes'.
            ``None`` or an empty string yields an empty dict.

    Returns:
        dict containing any of the keys ``name`` and ``status`` (str),
        ``min_aperture`` (float), ``total_observations`` and
        ``recent_observations`` (int), and ``oc`` (str, for example
        "3.3 ± 0.33 minutes").
    """
    planet = {}
    if not text:
        return planet

    # Planet name and status (e.g., "HATS-6b (LOW)").
    m = re.search(r"^(\S+)\s+\(([^)]+)\)", text)
    if m:
        planet["name"] = m.group(1)
        # Text produced with get_text(" ", strip=True) renders the status link
        # as "( LOW )", so the captured group carries padding; strip it.
        planet["status"] = m.group(2).strip()

    # Minimum aperture, terminated by the inch mark (e.g., 'Min. aperture: 11.99"').
    m = re.search(r"Min\. aperture:\s*([\d\.]+)\"", text)
    if m:
        planet["min_aperture"] = float(m.group(1))

    # Total and recent observation counts
    # (e.g., "Total Observations (Recent): 36 (3)").
    m = re.search(r"Total Observations \(Recent\):\s*(\d+)\s+\((\d+)\)", text)
    if m:
        planet["total_observations"] = int(m.group(1))
        planet["recent_observations"] = int(m.group(2))

    # O-C timing, kept as the raw "value ± error minutes" string
    # (e.g., "O-C: 3.3 ± 0.33 minutes").
    m = re.search(r"O-C:\s*([-\d\.]+\s*±\s*[\d\.]+\s*minutes)", text)
    if m:
        planet["oc"] = m.group(1)

    return planet

In [7]:
def parse_planet_row_fixed(row, observatory):
    """Extract all information about a planet from a schedule <tr> row.

    Fields are matched independently, so any field missing from the row is
    simply absent from the result (``transits`` is always present and is
    ``None`` when no transit entry was parsed).

    Args:
        row: Row element exposing ``find_all`` (a BeautifulSoup tag at the
            call sites in this notebook).
        observatory: Observatory name stored under the ``observatory`` key.

    Returns:
        dict of parsed fields, or ``None`` when the row has fewer than two
        cells styled ``border-right: solid 1px``.
    """
    planet_info = {"observatory": observatory}

    # Only cells carrying the right-border style are considered.
    cells = row.find_all("td", style="border-right: solid 1px")
    if len(cells) < 2:
        return None

    # --- First cell: name, status, aperture, observation counts, O-C ---
    planet_text = cells[0].get_text(" ", strip=True)

    m = re.search(r"^(\S+)\s+\(([^)]+)\)", planet_text)
    if m:
        planet_info["name"] = m.group(1)
        # The status is a nested link, so the joined text reads "( LOW )";
        # strip the padding the " " separator introduces.
        planet_info["status"] = m.group(2).strip()

    m = re.search(r"Min\. aperture:\s*([\d\.]+)\"", planet_text)
    if m:
        planet_info["min_aperture"] = float(m.group(1))

    m = re.search(r"Total Observations \(Recent\):\s*(\d+)\s+\((\d+)\)", planet_text)
    if m:
        planet_info["total_observations"] = int(m.group(1))
        planet_info["recent_observations"] = int(m.group(2))

    m = re.search(r"O-C:\s*([-\d\.]+\s*±\s*[\d\.]+\s*minutes)", planet_text)
    if m:
        planet_info["oc"] = m.group(1)

    # --- Second cell: position and brightness details ---
    details_text = cells[1].get_text(" ", strip=True)

    # The page writes "Mag<sub>R</sub>:" and "Depth<sub>R</sub>:"; joining the
    # text nodes with " " yields "Mag R :" / "Depth R :", so whitespace must be
    # allowed inside those labels or the fields are never found.
    detail_fields = (
        ("RA", r"RA:\s*([\d:\.]+)\s*hours", str),
        ("DEC", r"DEC:\s*([-\d:\.]+)\s*degrees", str),
        ("magnitude", r"Mag\s*R\s*:\s*([\d\.]+)\s*mag", float),
        ("depth", r"Depth\s*R\s*:\s*([\d\.]+)\s*mmag", float),
        ("duration", r"Duration:\s*([\d\.]+)\s*hours", float),
    )
    for key, pattern, convert in detail_fields:
        m = re.search(pattern, details_text)
        if m:
            planet_info[key] = convert(m.group(1))

    # --- Optional third styled cell: transit entries and moon/meridian data ---
    # NOTE(review): in the sample row printed in this notebook the <td> holding
    # the transit table has no border-right style, so it may never be selected
    # here - confirm against the full page markup.
    extra_cell = cells[2] if len(cells) > 2 else None

    transit_times = []
    if extra_cell is not None:
        for transit_td in extra_cell.find_all("td"):
            transit_text = transit_td.get_text(" ", strip=True)
            m = re.search(
                r"(\d{4}/\d{2}/\d{2})\s*(\d{2}:\d{2})\s*Alt:\s*(\d+)°\s*Azi:\s*(\d+)°\s*\((\w+)\)\s*HA:\s*([\d\.]+)h",
                transit_text,
            )
            if m:
                transit_times.append({
                    "date": m.group(1),
                    "time": m.group(2),
                    "altitude": int(m.group(3)),
                    "azimuth": int(m.group(4)),
                    "direction": m.group(5),
                    "hour_angle": float(m.group(6)),
                })

    # An empty list is reported as None.
    planet_info["transits"] = transit_times if transit_times else None

    if extra_cell is not None:
        meridian_text = extra_cell.get_text(" ", strip=True)

        m = re.search(r"Meridian crossing at:\s*(\d{4}/\d{2}/\d{2} \d{2}:\d{2})", meridian_text)
        if m:
            planet_info["meridian_crossing"] = m.group(1)

        m = re.search(r"Moon illumination:\s*([\d\.]+)%", meridian_text)
        if m:
            planet_info["moon_illumination"] = float(m.group(1))

        m = re.search(r"Moon distance:\s*([\d\.]+)°", meridian_text)
        if m:
            planet_info["moon_distance"] = float(m.group(1))

    return planet_info



In [14]:
# Debug preview of the extended row parser: for each observatory, parse only
# the first schedule row and print what was extracted.
# Load the HTML file
file_path = "ExoClock - My Schedule - Small.html"
with open(file_path, "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# Each <h4> heading is treated as an observatory; its planets are read from
# the next <table> in the document.
extended_data = []  # left empty here: this cell only previews parsed rows
observatories = soup.find_all("h4")

for observatory in observatories:
    obs_name = observatory.text.strip()
    table = observatory.find_next("table")
    if not table:
        continue

    # Only the first matching row of each table is inspected.
    row = table.find("tr", style="border-bottom: dotted 1px")
    if row is None:
        continue

    print(observatory, row)
    parsed = parse_planet_row_fixed(row, obs_name)
    # parse_planet_row_fixed returns None for rows with fewer than two styled
    # cells; iterating over None would raise TypeError.
    if parsed is not None:
        for key in parsed:
            print(key)
    # 0 is the index of the previewed row within its table.
    print(0, parsed)


<h4>Haleakala Observatory - PlaneWave Delta Rho 350 - QHY600 CMOS</h4> <tr style="border-bottom: dotted 1px"><td style="border-right: solid 1px" width="200px"><a href="https://www.exoclock.space/database/planets/HATS-6b" target="_blank"><font style="color:#94420c; font-size:1.5em">HATS-6b</font></a> (<a data-target="#priorities" data-toggle="modal" href="https://www.exoclock.space/schedule/100#priorities"><font color="green">LOW</font></a>)<br/>Min. aperture: 11.99"<br/>Total Observations (Recent): 36 (3)<br/>O-C: 3.3 ± 0.33 minutes</td><td style="border-right: solid 1px" width="300px"> RA: 05:52:35.2366 hours (J2000)<br/>DEC: -19:01:53.970 degrees (J2000)<br/>Mag<sub>R</sub>: 14.139 mag<br/>Depth<sub>R</sub>: 41.78 mmag<br/>Duration: 2.05 hours</td><td><table cellpadding="5" class="text-center" width="100%"><tbody><tr><td><font color="black">2025/02/17<br/>06:04<br/>Alt: 50°<br/>Azi: 172° (S)<br/>HA: 23.62h</font></td><td><font color="black">2025/02/17<br/>07:04<br/>Alt: 49°<br/>Azi: 

In [5]:
# Build a table of basic planet details for every observatory in the schedule.
file_path = "ExoClock - My Schedule - Small.html"
with open(file_path, "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# Each <h4> heading is treated as an observatory; its planets are read from
# the next <table> in the document.
extended_data = []
observatories = soup.find_all("h4")

for observatory in observatories:
    obs_name = observatory.text.strip()
    table = observatory.find_next("table")
    if not table:
        continue

    for row in table.find_all("tr", style="border-bottom: dotted 1px"):
        # The first right-bordered cell of a row carries the planet summary.
        planet_cell = row.find("td", style="border-right: solid 1px")
        if not planet_cell:
            continue

        planet_info = parse_planet_info(planet_cell.get_text(" ", strip=True))
        if not planet_info:
            # Nothing recognisable was parsed from this cell; skip it.
            continue

        planet_info["observatory"] = obs_name  # tag the record with its observatory
        extended_data.append(planet_info)

# One DataFrame row per parsed planet
df_extended = pd.DataFrame(extended_data)


In [6]:
# Show which columns ended up in df_extended
df_extended.columns

Index(['name', 'status', 'min_aperture', 'total_observations',
       'recent_observations', 'oc', 'observatory'],
      dtype='object')