In [7]:
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Login credentials.
# Read from the HEREFISH_USERNAME / HEREFISH_PASSWORD environment variables so
# real secrets never have to be typed into (or saved with) the notebook.
# The original placeholders remain as fallbacks when the variables are unset.
USERNAME = os.environ.get("HEREFISH_USERNAME", "YOUR_USERNAME_HERE")
PASSWORD = os.environ.get("HEREFISH_PASSWORD", "YOUR_PASSWORD_HERE")

In [9]:
# Start the browser session used by every later cell and maximize its window.
# Chrome is used here; webdriver.Firefox() can be substituted.
driver = webdriver.Chrome()
driver.maximize_window()

In [10]:
# Log in to Herefish: open the app and fill in the login form.
driver.get("https://app.herefish.com/")

# Enter email.
# Wait (up to 30 s) for the field to become visible instead of sleeping a
# fixed 10 s: a slow page load no longer breaks the lookup and a fast one
# no longer wastes time.
email_input = WebDriverWait(driver, 30).until(
    EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/div[1]/div/form/div[2]/input")
    )
)
email_input.send_keys(USERNAME)
time.sleep(2)

# Enter password (same explicit-wait approach as the email field).
password_input = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/div[1]/div/form/div[3]/input")
    )
)
password_input.send_keys(PASSWORD)
time.sleep(2)

In [11]:
# Click the login button.
# Wait until the button is clickable (same explicit-wait pattern the overview
# cell uses for its click target) rather than assuming it is already there,
# then keep the original 2-second pause after the click.
login_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.XPATH, "/html/body/div/div/div[1]/div/form/div[5]/button")
    )
)
login_button.click()
time.sleep(2)

In [None]:
# Scrape the detail page of every automation ID in `automation_ids`.
# Results are collected as one record per automation, saved to `output_file`
# every `save_interval` successful scrapes, and finally exposed as the
# DataFrame `automation_data` for the cells below.
automation_ids = range(10105, 10116)
throttle_success = 7   # seconds to wait after a successful scrape
throttle_fail = 1      # seconds to wait after a skipped / failed scrape
retry_wait = 300       # seconds to wait before retrying after a WebDriver (network) error
save_interval = 2      # save the progress every 2 successful scrapes
output_file = "herefish_automations.xlsx"

# Fixed column set so that even a run with zero successful scrapes produces a
# frame that still has the "Name" column the later cells rely on.
record_columns = ["ID", "Name", "Type", "Status", "Content"]

automation_records = []  # list of dicts; converted to a DataFrame at the end
scrapes_done = 0

for automation_id in automation_ids:
    url = f"https://app.herefish.com/Automations/Automation/{automation_id}"

    # Retry loop: only a WebDriver-level error (handled at the bottom) repeats
    # the iteration; every other outcome breaks out to the next ID.
    while True:
        try:
            driver.get(url)

            # Check for popup: if the popup container shows up within 2 s the
            # automation is skipped. Only the wait's own timeout means
            # "no popup"; any other error is allowed to propagate.
            try:
                WebDriverWait(driver, 2).until(
                    EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/div/div"))
                )
            except TimeoutException:
                pass
            else:
                print(f"Automation {automation_id} skipped: popup detected.")
                time.sleep(throttle_fail)
                break

            # TimeoutException and NoSuchElementException are subclasses of
            # WebDriverException. They are handled here, as a per-automation
            # skip, so a page that simply lacks an element is not mistaken for
            # a network failure and retried forever.
            try:
                # Main content container
                automation_xpath = "/html/body/div[2]/div[1]/div/div[6]"
                element = WebDriverWait(driver, 5).until(
                    EC.visibility_of_element_located((By.XPATH, automation_xpath))
                )
                # get_attribute may return None; treat that like empty content.
                automation_text = (element.get_attribute("innerText") or "").strip()

                if not automation_text:
                    print(
                        f"Automation {automation_id} returned empty content, "
                        f"waiting {throttle_fail} seconds and skipping."
                    )
                    time.sleep(throttle_fail)
                    break

                # --------------------------------------------
                # Extract Automation Name / Type / Status
                # --------------------------------------------

                # Name
                # NOTE(review): the status and type elements below are children
                # of this span, and Selenium's .text includes descendant text,
                # so Name may also contain the status/type strings. That would
                # break the later name match against the hibernated table;
                # confirm against the live page.
                name_elem = driver.find_element(
                    By.XPATH,
                    "/html/body/div[2]/div[1]/div/div[3]/div/div[1]/h4/span"
                )
                automation_name = name_elem.text.strip()

                # Type
                type_elem = driver.find_element(
                    By.XPATH,
                    "/html/body/div[2]/div[1]/div/div[3]/div/div[1]/h4/span/span[5]/small"
                )
                automation_type = type_elem.text.strip()

                # Status
                status_elem = driver.find_element(
                    By.XPATH,
                    "/html/body/div[2]/div[1]/div/div[3]/div/div[1]/h4/span/span[1]"
                )
                automation_status = status_elem.text.strip()

            except (TimeoutException, NoSuchElementException) as exc:
                print(
                    f"Automation {automation_id} skipped: expected page element "
                    f"not found ({type(exc).__name__})."
                )
                time.sleep(throttle_fail)
                break

            # --------------------------------------------
            # Save all extracted data
            # --------------------------------------------
            automation_records.append({
                "ID": automation_id,
                "Name": automation_name,
                "Type": automation_type,
                "Status": automation_status,
                "Content": automation_text
            })

            scrapes_done += 1
            print(f"Automation {automation_id} scraped successfully (Total: {scrapes_done})")

            if scrapes_done % save_interval == 0:
                pd.DataFrame(automation_records, columns=record_columns).to_excel(
                    output_file, index=False
                )
                print(f"Progress saved after {scrapes_done} successful scrapes.")

            time.sleep(throttle_success)
            break

        except WebDriverException:
            # Remaining WebDriver errors (e.g. navigation failures) are treated
            # as transient: wait, then retry the same automation.
            print(
                f"Network error on automation {automation_id}, "
                f"waiting {retry_wait} seconds before retrying..."
            )
            time.sleep(retry_wait)

        except Exception as e:
            print(f"Automation {automation_id} failed: {e}")
            time.sleep(throttle_fail)
            break

# Transform to DataFrame (this is the name the later cells use) and do the
# final scrape save.
automation_data = pd.DataFrame(automation_records, columns=record_columns)
automation_data.to_excel(output_file, index=False)
print("Final save complete.")


In [None]:
# Build `hibernated_automations`: a DataFrame parsed from the table on the
# Automations overview page, with title-cased column names, the
# "Date Hibernated" column formatted as dd/mm/yyyy and the "Actions" column
# removed (both only when present).

# Go to the Automations overview page
driver.get("https://app.herefish.com/Automations/Automations")
time.sleep(2)

# 1. Click the element that sits just before the table container.
#    (presumably the control that reveals the hibernated list -- TODO confirm)
click_xpath = "/html/body/div[2]/div[1]/div/div[3]/div/div[3]/div[1]/span[1]"

click_elem = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, click_xpath))
)
click_elem.click()
time.sleep(2)

# 2. Wait for the specific table to appear
table_xpath = "/html/body/div[2]/div[1]/div/div[3]/div/div[3]/div[2]/div[1]/table"

table_element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, table_xpath))
)

# 3. Parse the table: the first row supplies the headers when it contains
#    <th> cells; every other non-empty row becomes a data record.
headers = []
hibernated_rows = []  # raw cell text, one list per table row

for i, row in enumerate(table_element.find_elements(By.TAG_NAME, "tr")):
    # Look up <th> once and reuse the result for header detection.
    header_cells = row.find_elements(By.TAG_NAME, "th")
    cells = header_cells or row.find_elements(By.TAG_NAME, "td")
    values = [c.text.strip() for c in cells]

    if i == 0 and header_cells:
        headers = values
    elif values:
        # Rows without any cells carry no data and are skipped.
        hibernated_rows.append(values)

# Fallback header creation if needed
if not headers:
    max_len = max((len(r) for r in hibernated_rows), default=0)
    headers = [f"Column_{n + 1}" for n in range(max_len)]

# 4. Create the dataframe
hibernated_automations = pd.DataFrame(hibernated_rows, columns=headers)

# Convert all column names to Title Case
hibernated_automations.columns = [col.title() for col in hibernated_automations.columns]

# Normalize the date column. The guard keeps the fallback-header path (where
# no such column exists) from raising KeyError.
date_col = "Date Hibernated"
if date_col in hibernated_automations.columns:
    # Replace non-breaking spaces and trim before parsing.
    cleaned_dates = (
        hibernated_automations[date_col]
        .str.replace("\xa0", " ", regex=False)
        .str.strip()
    )
    # Let pandas infer the format; unparseable values become NaT.
    parsed_dates = pd.to_datetime(cleaned_dates, errors="coerce", dayfirst=False)
    # Format as dd/mm/yyyy
    hibernated_automations[date_col] = parsed_dates.dt.strftime("%d/%m/%Y")

# Drop the "Actions" column (ignored when the table has no such column)
hibernated_automations = hibernated_automations.drop(columns=["Actions"], errors="ignore")

hibernated_automations

In [14]:
# # Convert the column to datetime
# pd.to_datetime(hibernated_automations['Date Hibernated'], format='%B %d, %Y', errors='coerce')

# # Format as dd/mm/yyyy
# hibernated_automations['Date Hibernated'] = hibernated_automations['Date Hibernated'].dt.strftime('%d/%m/%Y')


In [15]:
# Flag every scraped automation whose name also appears in the hibernated
# table: True when "Name" matches an "Automation Name" entry, False otherwise.
is_hibernated = automation_data['Name'].isin(
    hibernated_automations['Automation Name']
)
automation_data['IsHibernated'] = is_hibernated


In [16]:
# Final save: write both result tables into a single workbook, one sheet each
# (scraped automations first, then the hibernated list).
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    scraped_frame = pd.DataFrame(automation_data)
    scraped_frame.to_excel(writer, sheet_name="Automation Data", index=False)

    hibernated_automations.to_excel(
        writer,
        sheet_name="Hibernated Automations",
        index=False,
    )

print("Final save complete.")

Final save complete.
