In [2]:
# Telegram bot token taken from the environment; a missing variable raises KeyError right here.
tg_sec = os.environ["YT_SECURE_VAULT_TG_SEC"]

In [3]:
# Root URL for every Telegram Bot API call; the bot token is embedded in the path.
BASE_URL = f"https://api.telegram.org/bot{tg_sec}"
# Filter string handed to yt.list_operations to locate the training operation.
OPERATION_TITLE = "pleias 1b"
# Pause, in seconds, between two iterations of the monitoring loop.
ITER_TIMEOUT = 30

In [4]:
def get_chat_ids(timeout: float = 10) -> set:
    """Collect the ids of chats that have sent a message to the bot.

    Calls the Telegram ``getUpdates`` method and gathers the chat id of every
    update that carries a ``message``. Updates of any other kind (edited
    messages, channel posts, membership changes, ...) have no ``message`` key
    and are skipped, so one such update cannot abort the whole poll with a
    ``KeyError``.

    Args:
        timeout: Seconds to wait for Telegram before giving up, so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        A set of chat ids; empty when no message updates are pending.

    Raises:
        requests.HTTPError: If Telegram answers with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(f"{BASE_URL}/getUpdates", timeout=timeout)
    response.raise_for_status()
    chat_ids = set()
    for update in response.json()["result"]:
        message = update.get("message")
        if message is None:
            # Not a plain incoming message; nothing to subscribe from it.
            continue
        chat_ids.add(message["chat"]["id"])
    return chat_ids


def is_op_valid(op_id):
    """Report whether the operation has not finished unsuccessfully.

    Fetches the current state of ``op_id`` through ``yt`` and returns
    ``False`` only when that state reports an unsuccessful finish; any other
    state yields ``True``.
    """
    return not yt.get_operation_state(op_id).is_unsuccessfully_finished()


def send_uvaga_message(chat_ids: set, message: str, timeout: float = 10):
    """Send ``message`` to every chat in ``chat_ids`` via Telegram ``sendMessage``.

    Delivery is attempted for every chat even if an earlier one fails (for
    example because that chat rejects the bot), so one bad recipient cannot
    keep the alert from reaching the others. The first failure is re-raised
    after all chats have been tried.

    Args:
        chat_ids: Chat ids to notify.
        message: Text of the notification.
        timeout: Seconds to wait for Telegram on each request.

    Raises:
        requests.RequestException: The first error met while sending
            (an HTTP error status, a connection problem or a timeout).
    """
    url = f"{BASE_URL}/sendMessage"
    first_error = None
    for chat_id in chat_ids:
        payload = {
            "chat_id": chat_id,
            "text": message,
        }
        try:
            response = requests.post(url, data=payload, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as error:
            # Remember the failure but keep notifying the remaining chats.
            if first_error is None:
                first_error = error
    if first_error is not None:
        raise first_error


def save_chat_ids(chat_ids: set):
    """Overwrite the ``chat_ids`` file with the given ids, comma-separated.

    The file is created in the current working directory; an empty set
    produces an empty file.
    """
    with open("chat_ids", "w") as out:
        serialized = ",".join(str(chat_id) for chat_id in chat_ids)
        out.write(serialized)


def save_error(e):
    """Append a timestamped record of ``e`` to the ``errors`` file.

    Each record is one line: the epoch timestamp from ``time.time()``, a
    space, and the string form of ``e``. The file is opened in append mode so
    earlier errors are kept instead of being overwritten by the latest one.

    Args:
        e: The exception (or any object) to record.
    """
    with open("errors", "a") as f:
        f.write(f"{time.time()} {e}\n")


def get_training_operations():
    """Return the ids of running operations that match ``OPERATION_TITLE``.

    Asks ``yt.list_operations`` for operations in the ``running`` state using
    ``OPERATION_TITLE`` as the filter, and returns their ``id`` fields as a
    list in the order they were listed.
    """
    listing = yt.list_operations(state="running", filter=OPERATION_TITLE)
    operation_ids = []
    for operation in listing["operations"]:
        operation_ids.append(operation["id"])
    return operation_ids

In [5]:
def main():
    """Watch the running training operation and alert Telegram chats when it fails.

    At start exactly one running training operation must exist (asserted);
    its id is printed and then monitored in an endless loop. Each iteration:

    * merges the chat ids currently reported by Telegram into the known set
      and saves that set to disk;
    * if the monitored operation has finished unsuccessfully, sends a failure
      notice to every known chat and looks for running training operations
      again, switching to the new operation when exactly one is found and
      sending a second notice otherwise;
    * records any ``Exception`` raised along the way instead of stopping;
    * sleeps ``ITER_TIMEOUT`` seconds, whatever the outcome.

    The function never returns on its own.
    """
    known_chat_ids = set()
    running = get_training_operations()
    assert len(running) == 1

    operation_id = running[0]
    print(operation_id)

    while True:
        try:
            known_chat_ids.update(get_chat_ids())
            save_chat_ids(known_chat_ids)

            if is_op_valid(operation_id):
                # Nothing to report this round; the finally clause still sleeps.
                continue

            failure_text = f"Training https://charlie.yt.nebius.yt/charlie/operations/{operation_id}/details has been failed"
            send_uvaga_message(known_chat_ids, failure_text)

            running = get_training_operations()
            if len(running) == 1:
                # A single replacement is running: monitor it from now on.
                operation_id = running[0]
                print(operation_id)
            else:
                send_uvaga_message(known_chat_ids, f"There should be exactly 1 training operation, found {running}")
        except Exception as error:
            print("there is an error", error)
            save_error(error)
        finally:
            time.sleep(ITER_TIMEOUT)

In [6]:
# Start monitoring. main() loops without a break, so this cell keeps running
# until the kernel is interrupted (or the startup assertion inside main fails).
main()

2d2d9571-df145fc1-27ea03e8-96b27f2e
