# Set up libraries and helper functions

In [1]:
import pandas as pd
import json
import re
import datetime
from bs4 import BeautifulSoup

In [2]:
def info_from_cell0(cell, verbose = False):
    """Extract planet name, ExoClock status and the three <br> lines of cell 0.

    Parameters
    ----------
    cell : tag-like object
        The left-most ``<td>`` of a schedule row. It must provide
        ``find_all(name)`` and ``sourceline``.
    verbose : bool, optional
        When True, print the source line of the cell being processed.

    Returns
    -------
    dict
        Always contains 'min_aperture', 'recent_observations' and 'oc'; a
        field is 'MISSING' when the cell holds fewer than three <br> texts.
        'planet' and 'status' are present only when a link whose href contains
        '/planets' / '#priorities' was found.
    """

    if verbose:
        print(f' ... Extracting info from cell0, line {cell.sourceline}')

    info_cell0 = {}

    # EXTRACT >> Planet and ExoClock Status
    for a in cell.find_all("a"):
        # Anchors without an href must not abort the whole row with KeyError.
        href = a.attrs.get('href') or ''
        if '/planets' in href:
            info_cell0['planet'] = a.text
        elif '#priorities' in href:
            info_cell0['status'] = a.text

    # EXTRACT >> Min. aperture, Recent observations, O-C
    # Each value is the node that directly follows a <br> tag.
    br_texts = []
    for br in cell.find_all("br"):
        sibling = br.next_sibling
        if not sibling:
            continue
        if isinstance(sibling, str):
            # plain text node (NavigableString is a str subclass)
            br_texts.append(sibling.strip())
        else:
            # the <br> is followed by another tag: use its text instead of
            # calling .strip() on a tag, which fails
            br_texts.append(sibling.get_text().strip())

    if len(br_texts) != 3:
        print(f' ... CELL0 : WARNING !! Found {len(br_texts)} br_texts, expecting 3, source line {cell.sourceline}')

    # Positional mapping; pad with 'MISSING' so a short cell only triggers the
    # warning above instead of an IndexError.
    fields = ('min_aperture', 'recent_observations', 'oc')
    for position, field in enumerate(fields):
        info_cell0[field] = br_texts[position] if position < len(br_texts) else 'MISSING'

    return info_cell0


In [3]:
def info_from_cell1(cell, verbose = False):
    """Slice the text of cell 1 into the fields delimited by known labels.

    The cell text is searched for each consecutive pair of markers; the text
    found between a marker and the next one is stored (stripped) under the
    first marker's name. A field that cannot be located is set to 'MISSING'.

    Parameters
    ----------
    cell : tag-like object
        Must provide ``text`` (and ``sourceline`` when ``verbose`` is True).
    verbose : bool, optional
        When True, print the source line and the extracted dictionary.

    Returns
    -------
    dict
        Keys 'RA', 'DEC', 'MagR', 'magDepthR' and 'mmagDuration'.
    """

    if verbose:
        print(f' ... Extracting info from cell1, line {cell.sourceline}')

    # eg RA: 05:52:35.2366 hours (J2000)DEC: -19:01:53.970 degrees (J2000)MagR: 14.139 magDepthR: 41.78 mmagDuration: 2.05 hours
    # 'END' is a sentinel appended so the last field also has a terminator.
    haystack = cell.text + 'END'
    markers = ['RA', 'DEC', 'MagR', 'magDepthR', 'mmagDuration','END']

    info_cell1 = {}
    for opening, closing in zip(markers, markers[1:]):
        # non-greedy capture of whatever sits between the two markers
        between = re.escape(opening) + r"(.*?)" + re.escape(closing)
        found = re.search(between, haystack)
        if found:
            info_cell1[opening] = found.group(1).strip()
        else:
            info_cell1[opening] = 'MISSING'

    if verbose:
        print(info_cell1)

    return info_cell1


In [4]:
def info_from_cell_times(cell, cellno, verbose = False):
    """Extract the date, time, alt, azi and ha entries of a timing cell.

    The five values are the ``<br/>``-separated pieces of the cell's ``<font>``
    element, taken in that order.

    Parameters
    ----------
    cell : tag-like object
        Must provide ``font`` (whose ``decode_contents()`` returns the inner
        HTML), ``sourceline`` and, when ``verbose`` is True, ``contents``.
    cellno : int
        Column index of the cell within the row; selects the key prefix:
        3 -> 't_before' (1h before ingress), 4 -> 't_start' (Transit Start),
        5 -> 't_mid' (Mid Transit), 6 -> 't_end' (Transit End),
        7 -> 't_after' (1h after egress). Indices 0-2 and 8 give an empty
        prefix.
    verbose : bool, optional
        When True, print the source line and raw contents of the cell.

    Returns
    -------
    dict
        Keys '<prefix>_date', '<prefix>_time', '<prefix>_alt', '<prefix>_azi'
        and '<prefix>_ha'. A value is 'MISSING' when the cell has no <font>
        element or fewer than five pieces, so every row has the same keys.
    """

    if verbose:
        print(f' ... Extracting info from cell, line {cell.sourceline}')
        print(cell.contents)

    cell_header = ['','','','t_before','t_start','t_mid','t_end','t_after','']
    header = cell_header[cellno]
    fields = ('date', 'time', 'alt', 'azi', 'ha')

    font_tag = cell.font
    if font_tag is None:
        # Without this guard a cell lacking <font> raises AttributeError.
        print(f' ... CELL TIMES : WARNING !! no <font> element : source line {cell.sourceline}')
        elements = []
    else:
        # Inner HTML of <font>, split on <br/> to separate the entries
        elements = font_tag.decode_contents().split("<br/>")
        if len(elements) != 5:
            print(f' ... CELL TIMES : WARNING !! {len(elements)} elements, expecting 5 : source line {cell.sourceline}')
            print(elements)

    # # EXTRACT >> date, time, alt, azi, ha
    # Start from 'MISSING' and overwrite with whatever was found; pieces
    # beyond the fifth are ignored.
    info_cell = {f'{header}_{field}': 'MISSING' for field in fields}
    for field, element in zip(fields, elements):
        info_cell[f'{header}_{field}'] = element

    return info_cell


In [5]:
def info_from_cell8(cell, verbose = False, warning = False):
    """Extract meridian crossing, max-count increase and Moon data from cell 8.

    The inner HTML of the cell is split on ``<br/>`` into up to three parts:
    the meridian-crossing line (not always present), the max-counts line and
    the Moon line. Each part is matched with a regular expression; anything
    that cannot be matched is reported as 'MISSING'.

    Parameters
    ----------
    cell : tag-like object
        Must provide ``decode_contents()``, ``contents`` and ``sourceline``.
    verbose : bool, optional
        When True, print the cell and the split elements.
    warning : bool, optional
        When True, print a warning for cells without a meridian crossing.

    Returns
    -------
    dict
        Keys 'meridian_date', 'meridian_time', 'max_counts_R', 'max_counts_V',
        'moon_illumination' and 'moon_distance'.
    """
    if verbose:
        print(f' ... Extracting info from cell8, line {cell.sourceline}')
        print(cell.contents)

    # Get the inner HTML, keeping the structure
    html_with_tags = cell.decode_contents()

    # EXTRACT >> Meridian, Max count, Moon illumination, Moon distance
    # eg ['Meridian crossing at: 2025/02/17 06:30',
    #     'Max counts increase during observation:R:0%V:0%',
    #     'Moon illumination:80.1%, Moon distance:106.0°']
    # Split the text based on <br/> tags
    elements = html_with_tags.split("<br/>")

    # there isn't always a meridian crossing
    if 'Meridian crossing' not in elements[0]:
        if warning:
            print(f' ... CELL8 : WARNING !! No Meridian crossing : source line {cell.sourceline}')
        # Insert an empty meridian slot so the other two parts keep their
        # usual positions; slicing (not indexing) tolerates a one-part cell.
        elements = [''] + elements[:2]

    if len(elements) != 3:
        print(f' ... CELL8 : WARNING !! {len(elements)} elements, expecting 3 : source line {cell.sourceline}')
        print(cell.contents)
        print(elements)

    # Pad short cells so the lookups below yield 'MISSING' instead of raising
    # IndexError right after the warning.
    elements = elements + [''] * (3 - len(elements))

    # Keep only what follows the link label on the max-counts line; leave the
    # line untouched when the separator is absent.
    parts = elements[1].split("</a>: ")
    if len(parts) > 1:
        elements[1] = parts[1]

    if verbose:
        for ie, element in enumerate(elements):
            print(ie, ':', element)

    # Element 0 : meridian crossing date and time ---------------
    pattern = r"(\d{4}/\d{2}/\d{2}) (\d{2}:\d{2})"
    match = re.search(pattern, elements[0])
    if match:
        date, time = match.group(1), match.group(2)
    else:
        date = time = 'MISSING'

    # Element 1 : max counts increase in R and V ----------------
    pattern = r"R:(\d+%)</b>.*?V:(\d+%)</b>"
    match = re.search(pattern, elements[1])
    if match:
        r_value, v_value = match.group(1), match.group(2)
    else:
        r_value = v_value = 'MISSING'

    # Element 2 : Moon illumination and distance ----------------
    pattern = r"Moon illumination: <b>(\d+\.\d+%)</b>, Moon distance: <b>(\d+\.\d+°)</b>"
    match = re.search(pattern, elements[2])
    if match:
        illumination, distance = match.group(1), match.group(2)
    else:
        illumination = distance = 'MISSING'

    info_cell8 = {
        'meridian_date': date,
        'meridian_time': time,
        'max_counts_R': r_value,
        'max_counts_V': v_value,
        'moon_illumination': illumination,
        'moon_distance': distance,
    }
    return info_cell8

# Parse the HTML

In [10]:
# Select the input file: any value other than 'small' reads the full schedule.
mode = 'bigjobbie'
# mode = 'small'

if mode == 'small':
    file_path = "ExoClock - My Schedule - Small.html"
else:
    file_path = "ExoClock - My Schedule.html"

# one dictionary per planet row, turned into a DataFrame at the end
list_planets = []

# texts that identify an optional extra note cell at index 3 of a row
cell3_notes = ['OBSERVATION HAS STARTED!',
               'OPTIMAL TARGET NOTE: this planet cannot be observed by a smaller telescope. Please give priority!',
               'DRIFTING EPHEMERIS NOTE',
               'HIGH EFFICIENCY NOTE']

with open(file_path, "r", encoding="utf-8") as infile:

    soup = BeautifulSoup(infile, "html.parser")

    # collect the sequence of observatories
    h4_tags = soup.find_all("h4")
    print(f' ... Found {len(h4_tags)} <h4> tags')

    observatories = 0
    for observatory in h4_tags:
        if 'Observatory' not in observatory.text:
            continue

        observatories += 1

        # details of the observatory: "<name> - <telescope> - <camera>".
        # Pad so a heading with fewer parts does not raise IndexError.
        details_observatory = observatory.text.split(" - ")
        padded_details = (details_observatory + ['', ''])[:3]
        name_observatory = padded_details[0]
        print(f' ... Processing {name_observatory}')
        # telescope and camera are parsed but currently not exported
        name_telescope = padded_details[1]
        name_camera = padded_details[2]

        # Find the next table, which should contain the planets for this observatory
        # assuming the html is regular and organised and tags follow the pattern <h4>, followed by <table>
        table = observatory.find_next("table")
        if not table:
            continue

        # Iterate through each planet entry
        planets = table.find_all("tr", style="border-bottom: dotted 1px")
        print(' ---------------------------------------------------------------')
        print(f' ... Found {len(planets)} Planets for {name_observatory}')

        for ip, planet in enumerate(planets):
            cells = planet.find_all("td")

            # sometimes there's a note in cell3; record it and drop that cell
            # so the remaining nine cells line up with the regular layout.
            # The note text is read once, BEFORE the list is rebuilt, so a
            # second matching note can no longer index past the end of the
            # shortened list.
            planet_note = ''
            if len(cells) == 10:
                note_text = cells[3].text
                if any(note in note_text for note in cell3_notes):
                    planet_note = note_text
                    cells = cells[:3] + cells[4:]

            # there must be 9 cells for this to work properly
            if len(cells) != 9:
                print(f' ... MAIN : WARNING !! Found {len(cells)} cells expecting 9 line {planet.sourceline}')
                continue

            if ip % 100 == 0:
                current_time = datetime.datetime.now().strftime("%H:%M:%S")
                print(f' ... {current_time} : Processing planet {ip} observatory {name_observatory}')

            dict_planet = {'observatory': name_observatory, 'note': planet_note}

            # cell0 (left most cell) and cell1
            dict_planet.update(info_from_cell0(cells[0]))
            dict_planet.update(info_from_cell1(cells[1]))

            # cell2 is a repeat of the following cells and is skipped

            # cells 3-7: 1h before ingress, transit start, mid transit,
            # transit end, 1h after egress
            for ic in range(3, 8):
                dict_planet.update(info_from_cell_times(cells[ic], ic))

            # cell8: meridian crossing, max counts, Moon
            dict_planet.update(info_from_cell8(cells[8]))

            # append this planet to the list of planets
            list_planets.append(dict_planet)

# create a pandas dataframe from the list of planets
df = pd.DataFrame(list_planets)


 ... Found 15 <h4> tags
 ... Processing Haleakala Observatory
 ---------------------------------------------------------------
 ... Found 604 Planets for Haleakala Observatory
 ... 19:52:40 : Processing planet 0 observatory Haleakala Observatory
 ... 19:52:40 : Processing planet 100 observatory Haleakala Observatory
 ... 19:52:40 : Processing planet 200 observatory Haleakala Observatory
 ... 19:52:40 : Processing planet 300 observatory Haleakala Observatory
 ... 19:52:40 : Processing planet 400 observatory Haleakala Observatory
 ... 19:52:40 : Processing planet 500 observatory Haleakala Observatory
 ... 19:52:40 : Processing planet 600 observatory Haleakala Observatory
 ... Processing McDonald Observatory
 ---------------------------------------------------------------
 ... Found 567 Planets for McDonald Observatory
 ... 19:52:40 : Processing planet 0 observatory McDonald Observatory
 ... 19:52:40 : Processing planet 100 observatory McDonald Observatory
 ... 19:52:40 : Processing plane

In [11]:
# Number of parsed rows per observatory (displayed as the cell output)
df.value_counts('observatory')

observatory
Cerro Tololo Observatory     822
Sutherland Observatory       807
Siding Spring Observatory    776
Haleakala Observatory        604
Teide Observatory            588
McDonald Observatory         567
Nostos Observatory           472
Name: count, dtype: int64

In [13]:
# Write the parsed schedule to CSV, leaving out the DataFrame index column
df.to_csv('exoclock_schedule.csv', index=False)