# Bullhorn Automations Scraper

In [None]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException


In [None]:
# Login credentials.
# Read from the HEREFISH_USERNAME / HEREFISH_PASSWORD environment variables so
# real secrets never have to be committed with the notebook. The original
# placeholder strings remain as the fallback when the variables are not set.
import os

USERNAME = os.environ.get("HEREFISH_USERNAME", "YOUR_USERNAME_HERE")
PASSWORD = os.environ.get("HEREFISH_PASSWORD", "YOUR_PASSWORD_HERE")


In [3]:
# Start the browser session that every later cell drives.
# Chrome is used here; webdriver.Firefox() can be substituted instead.
driver = webdriver.Chrome()
# Maximize the browser window before any page is loaded.
driver.maximize_window()


In [4]:
# Log in to Herefish: open the login page and fill in the credentials.
driver.get("https://app.herefish.com/")

# Wait explicitly for the form fields instead of sleeping a fixed 10 seconds:
# this continues as soon as the page is ready and tolerates slower loads
# (up to 30 seconds) before raising TimeoutException.
login_wait = WebDriverWait(driver, 30)

# Enter email
email_input = login_wait.until(
    EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/div[1]/div/form/div[2]/input")
    )
)
email_input.send_keys(USERNAME)
time.sleep(2)  # short pause between form interactions

# Enter password
password_input = login_wait.until(
    EC.visibility_of_element_located(
        (By.XPATH, "/html/body/div/div/div[1]/div/form/div[3]/input")
    )
)
password_input.send_keys(PASSWORD)
time.sleep(2)  # short pause before the login button is clicked



In [5]:
# Submit the login form by clicking its button.
login_button_xpath = "/html/body/div/div/div[1]/div/form/div[5]/button"
login_button = driver.find_element(By.XPATH, login_button_xpath)
login_button.click()
# Pause briefly after the click before the next cell runs.
time.sleep(2)

In [None]:


# Scrape the text content of every automation page in an ID range and save the
# results to an Excel file.
#
# TimeoutException is imported here (cell-local) because it is a subclass of
# WebDriverException: it must be handled separately, otherwise a page that
# simply lacks the content container would be treated as a network failure
# and retried forever.
from selenium.common.exceptions import TimeoutException

automation_ids = range(10000, 25000)            # Change depending on how long the company has been using Automations
automation_data = []   # one {"ID": ..., "Content": ...} dict per scraped automation
throttle_success = 10  # wait after successful scrape
throttle_fail = 2      # wait after failed scrape
retry_wait = 300       # wait 5 minutes if network fails
max_retries = 12       # give up on an ID after this many retries (avoids an endless loop, e.g. on a dead browser session)
save_interval = 2      # write the Excel file after every N successful scrapes
output_file = "herefish_automations.xlsx"

scrapes_done = 0

try:
    for automation_id in automation_ids:
        retries = 0
        while True:  # retry loop
            try:
                url = f"https://app.herefish.com/Automations/Automation/{automation_id}"
                driver.get(url)

                # Check for popup: if the popup element shows up within
                # 2 seconds the automation is skipped.
                try:
                    WebDriverWait(driver, 2).until(
                        EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/div/div"))
                    )
                except TimeoutException:
                    # No popup appeared; continue with the scrape. Any other
                    # driver error propagates to the retry handler below.
                    pass
                else:
                    print(f"Automation {automation_id} skipped: popup detected.")
                    time.sleep(throttle_fail)
                    break

                # Wait for the container to appear
                automation_xpath = "/html/body/div[2]/div[1]/div/div[6]"
                try:
                    element = WebDriverWait(driver, 5).until(
                        EC.visibility_of_element_located((By.XPATH, automation_xpath))
                    )
                except TimeoutException:
                    # The container never became visible: skip this ID rather
                    # than treating it as a network failure.
                    print(f"Automation {automation_id} skipped: content container not found.")
                    time.sleep(throttle_fail)
                    break

                # get innerText and strip whitespace (guard against a missing attribute)
                automation_text = (element.get_attribute("innerText") or "").strip()

                # Check if anything was returned
                if not automation_text:
                    print(f"Automation {automation_id} returned empty content, waiting 2 seconds and skipping.")
                    time.sleep(throttle_fail)
                    break  # skip this automation

                # Save content
                automation_data.append({"ID": automation_id, "Content": automation_text})
                scrapes_done += 1
                print(f"Automation {automation_id} scraped successfully (Total: {scrapes_done})")

                # Periodic save
                if scrapes_done % save_interval == 0:
                    pd.DataFrame(automation_data).to_excel(output_file, index=False)
                    print(f"Progress saved after {scrapes_done} successful scrapes.")

                time.sleep(throttle_success)
                break

            except WebDriverException:
                # Remaining driver errors (including page-load timeouts from
                # driver.get) are treated as network problems and retried.
                retries += 1
                if retries > max_retries:
                    print(f"Automation {automation_id} skipped: still failing after {max_retries} retries.")
                    break
                print(f"Network error on automation {automation_id}, waiting 5 minutes before retrying...")
                time.sleep(retry_wait)
            except Exception as e:
                print(f"Automation {automation_id} failed: {e}")
                time.sleep(throttle_fail)
                break
finally:
    # Final save: runs even if the loop is interrupted (e.g. Ctrl+C or an
    # unexpected error), so rows scraped since the last periodic save are kept.
    pd.DataFrame(automation_data).to_excel(output_file, index=False)
    print("Final save complete.")
