In [1]:
import pandas as pd
from dotenv import load_dotenv

import os
import json
import requests
from datetime import datetime
import warnings


# Never truncate DataFrame previews: lift the column and row display limits.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')


In [2]:
# Load environment variables from .env file
load_dotenv()

True

In [48]:
# --- Buildkite API configuration ---
# Read from the process environment; os.getenv returns None when the variable is unset.
BUILDKITE_API_TOKEN = os.getenv('BUILDKITE_API_TOKEN')
ORGANIZATION_SLUG = 'vllm'
PIPELINE_SLUG = 'ci-aws'

# Lower bound for the builds query: yesterday at 22:00 UTC, which is 00:00 today
# in Finnish winter time (UTC+2); during summer time (UTC+3) it is 01:00 local.
# A timezone-aware "now" is used because datetime.utcnow() is deprecated.
TODAY = (pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=1)).strftime('%Y-%m-%dT22:00:00Z')

# --- Alert thresholds ---
WAITING_TIME_ALERT_THR = 4 * 60 * 60  # seconds (4 hours)
AGENT_FAILED_BUILDS_THR = 3  # agents are declared unhealthy if they have failed jobs from >=3 unique builds


In [49]:
TODAY

'2024-11-07T22:00:00Z'

In [62]:
# Query-string parameters for the builds request: only builds created
# since TODAY, at most 100 per page.
params = dict(created_from=TODAY, per_page=100)

In [63]:
def fetch_data(params, token=BUILDKITE_API_TOKEN, org_slug=ORGANIZATION_SLUG, pipe_slug=PIPELINE_SLUG, timeout=30):
    """Fetch builds of a Buildkite pipeline and return them as a flat DataFrame.

    All result pages are retrieved by following the ``Link: rel="next"``
    response header, so builds beyond the first ``per_page`` items are not
    silently dropped.

    Parameters
    ----------
    params : dict
        Query-string parameters for the first request
        (e.g. ``created_from``, ``per_page``).
    token : str
        API access token, sent as a ``Bearer`` authorization header.
        NOTE: the default is captured when this function is defined, so
        re-run the definition after changing ``BUILDKITE_API_TOKEN``.
    org_slug, pipe_slug : str
        Organization and pipeline slugs used to build the endpoint URL.
    timeout : float
        Seconds to wait for each HTTP request before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per build (``pd.json_normalize`` of the JSON payload), or an
        empty DataFrame if any request returns a non-200 status code.
    """
    url = f"https://api.buildkite.com/v2/organizations/{org_slug}/pipelines/{pipe_slug}/builds"
    headers = {
        'Authorization': f'Bearer {token}'
    }

    builds = []
    next_url, query = url, params
    while next_url:
        response = requests.get(next_url, headers=headers, params=query, timeout=timeout)

        if response.status_code != 200:
            # Discard partial results rather than return an incomplete data set.
            print(f"Request failed with status code {response.status_code}")
            return pd.DataFrame()

        builds.extend(response.json())

        # `response.links` is the parsed Link header; the "next" URL already
        # carries the full query string, so params must not be sent again.
        next_url = response.links.get('next', {}).get('url')
        query = None

    return pd.json_normalize(builds)

In [64]:
df = fetch_data(params)

In [65]:
df.created_at.min()

'2024-11-07T22:38:40.426Z'

In [66]:
# Parse the build timestamp columns; values that cannot be parsed become NaT.
for _ts_column in ('created_at', 'scheduled_at', 'started_at', 'finished_at'):
    df[_ts_column] = pd.to_datetime(df[_ts_column], errors='coerce')

# Build numbers are integers.
df['number'] = df['number'].astype('int')

In [67]:
df.created_at.min(), df.created_at.max(), df.number.min(), df.number.max(), df.shape

(Timestamp('2024-11-07 22:38:40.426000+0000', tz='UTC'),
 Timestamp('2024-11-08 07:58:14.211000+0000', tz='UTC'),
 10935,
 10973,
 (39, 105))

In [68]:
df.shape

(39, 105)

In [69]:
df.state.value_counts()

state
passed      14
canceled    11
failed       9
running      4
failing      1
Name: count, dtype: int64

In [70]:
# Build-level columns kept for the analysis ('jobs' holds the nested job records).
useful_columns = [
    'id', 'web_url', 'url', 'number', 'state',
    'cancel_reason', 'blocked', 'blocked_state',
    'jobs',
]

In [71]:
d = df[useful_columns]

In [72]:
# One entry per job: explode each build's `jobs` list, then flatten the
# nested job records into columns.
job_records = df['jobs'].explode()
jobs_df = pd.json_normalize(job_records)
jobs_df.head()

Unnamed: 0,id,graphql_id,type,name,step_key,agent_query_rules,state,build_url,web_url,log_url,raw_log_url,artifacts_url,command,soft_failed,exit_status,artifact_paths,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,retried_in_job_id,retries_count,retry_source,retry_type,parallel_group_index,parallel_group_total,matrix,cluster_id,cluster_url,cluster_queue_id,cluster_queue_url,step.id,step.signature,priority.number,agent.id,agent.url,agent.web_url,agent.name,agent.connection_state,agent.ip_address,agent.hostname,agent.user_agent,agent.version,agent.creator,agent.created_at,agent.job,agent.last_job_finished_at,agent.priority,agent.meta_data,agent.cluster_url,agent.cluster_queue_url,label,unblocked_by,unblocked_at,unblockable,unblock_url,agent,retry_source.job_id,retry_source.retry_type
0,01930ac7-eabf-4417-9b4a-c32b16ebbf93,Sm9iLS0tMDE5MzBhYzctZWFiZi00NDE3LTliNGEtYzMyYj...,script,bootstrap,bootstrap,[queue=small_cpu_queue],passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,"if [[ -n """" ]]; then VLLM_CI_BRANCH= curl -sSL...",False,0.0,,2024-11-08T07:58:14.140Z,2024-11-08T07:58:14.140Z,2024-11-08T07:58:14.262Z,2024-11-08T07:58:24.406Z,2024-11-08T07:58:30.897Z,,False,,,,,,,,,,,,01930ac7-ea79-4e35-af9b-0bfd9cdc9b8f,,0.0,01930ac2-5a82-46d1-bd40-e00662898dd0,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-small-cpu-queue-i-0afc5fac4d3998bab-1,disconnected,54.188.80.223,ip-10-0-86-255.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-08T07:52:09.602Z,,2024-11-08T08:02:04.575Z,0.0,"[queue=small_cpu_queue, docker=25.0.3, stack=b...",,,,,,,,,,
1,01930ac8-2710-46d8-b71f-19499ac4f6c0,Sm9iLS0tMDE5MzBhYzgtMjcxMC00NmQ4LWI3MWYtMTk0OT...,script,:docker: build image,image-build,[queue=cpu_queue],passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,aws ecr-public get-login-password --region us-...,False,0.0,,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,2024-11-08T07:58:30.985Z,2024-11-08T07:59:06.840Z,2024-11-08T07:59:14.173Z,,False,,,,,,,,,,,,01930ac8-2687-41f6-893f-c80bf500d113,,0.0,01930ac8-b54e-4bf9-bc96-d468d03a5bd8,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-cpu-queue-i-016a36d034d4371f0-1,disconnected,35.88.131.233,ip-10-0-56-89.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-08T07:59:06.062Z,,2024-11-08T07:59:14.185Z,0.0,"[queue=cpu_queue, docker=25.0.3, stack=bk-cpu-...",,,,,,,,,,
2,01930ac8-2711-40bb-9095-8d0200ac99ce,Sm9iLS0tMDE5MzBhYzgtMjcxMS00MGJiLTkwOTUtOGQwMj...,script,Documentation Build,,[queue=small_cpu_queue],passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,0.0,,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,2024-11-08T07:59:14.284Z,2024-11-08T07:59:21.460Z,2024-11-08T08:02:04.563Z,,False,,,,,,,,,,,,01930ac8-2688-4ce8-8e7e-5b78443a9edf,,0.0,01930ac2-5a82-46d1-bd40-e00662898dd0,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-small-cpu-queue-i-0afc5fac4d3998bab-1,disconnected,54.188.80.223,ip-10-0-86-255.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-08T07:52:09.602Z,,2024-11-08T08:02:04.575Z,0.0,"[queue=small_cpu_queue, docker=25.0.3, stack=b...",,,,,,,,,,
3,01930ac8-2713-4c6b-9c63-0ff84ea5fd88,Sm9iLS0tMDE5MzBhYzgtMjcxMy00YzZiLTljNjMtMGZmOD...,script,"Async Engine, Inputs, Utils, Worker Test",,[queue=gpu_1_queue],running,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,,,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,2024-11-08T07:59:14.284Z,2024-11-08T07:59:19.928Z,,,False,,,,,,,,,,,,01930ac8-268a-40e1-9862-9f8bc59fc8ef,,0.0,01930ac3-3869-4e61-9a16-8e1868017585,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-gpu-1-queue-i-09ba328b2b599fe24-1,connected,18.246.69.93,ip-10-0-154-22.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-08T07:53:06.409Z,,2024-11-08T07:55:39.392Z,0.0,"[queue=gpu_1_queue, docker=25.0.3, stack=bk-gp...",,,,,,,,,,
4,01930ac8-2715-4dc5-89f7-94c655ca096f,Sm9iLS0tMDE5MzBhYzgtMjcxNS00ZGM1LTg5ZjctOTRjNj...,script,Basic Correctness Test,,[queue=gpu_1_queue],passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,0.0,,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,2024-11-08T07:59:14.284Z,2024-11-08T07:59:22.958Z,2024-11-08T08:08:03.625Z,,False,,,,,,,,,,,,01930ac8-268c-4406-b8ae-2470e09f0ab5,,0.0,01930a2f-5085-4b88-8483-b27f25dd84db,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-gpu-1-queue-i-038c116415811468b-1,connected,54.203.145.121,ip-10-0-180-198.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-08T05:11:33.253Z,,2024-11-08T08:19:57.900Z,0.0,"[queue=gpu_1_queue, docker=25.0.3, stack=bk-gp...",,,,,,,,,,


In [61]:
jobs_df.build_url.nunique(), jobs_df.id.nunique(), jobs_df.shape, jobs_df.state.value_counts()

(9,
 812,
 (812, 62),
 state
 blocked           334
 canceled          253
 passed            174
 waiting_failed     20
 broken             18
 failed             12
 timed_out           1
 Name: count, dtype: int64)

In [43]:
jobs_df[jobs_df.state=='broken'].head()

Unnamed: 0,id,graphql_id,type,name,step_key,agent_query_rules,state,build_url,web_url,log_url,raw_log_url,artifacts_url,command,soft_failed,exit_status,artifact_paths,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,retried_in_job_id,retries_count,retry_source,retry_type,parallel_group_index,parallel_group_total,matrix,cluster_id,cluster_url,cluster_queue_id,cluster_queue_url,step.id,step.signature,priority.number,agent.id,agent.url,agent.web_url,agent.name,agent.connection_state,agent.ip_address,agent.hostname,agent.user_agent,agent.version,agent.creator,agent.created_at,agent.job,agent.last_job_finished_at,agent.priority,agent.meta_data,agent.cluster_url,agent.cluster_queue_url,label,unblocked_by,unblocked_at,unblockable,unblock_url,agent
45,01930ac8-2749-4701-ab29-3785a5c3d264,Sm9iLS0tMDE5MzBhYzgtMjc0OS00NzAxLWFiMjktMzc4NW...,script,Decoder-only Language Models Test (Extended),,[queue=gpu_1_queue],broken,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,,,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,,,,,False,,,,,,,,,,,,01930ac8-26ae-4dac-a740-7a7a7625c3c1,,0.0,,,,,,,,,,,,,,,,,,,,,,,
47,01930ac8-274c-403e-a574-858da218a3a1,Sm9iLS0tMDE5MzBhYzgtMjc0Yy00MDNlLWE1NzQtODU4ZG...,script,Decoder-only Multi-Modal Models Test (Extended),,[queue=gpu_1_queue],broken,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,,,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,,,,,False,,,,,,,,,,,,01930ac8-26b2-4cdd-84c4-10bc9d6deaee,,0.0,,,,,,,,,,,,,,,,,,,,,,,
129,01930ac5-7448-4915-9649-50e40b6462dc,Sm9iLS0tMDE5MzBhYzUtNzQ0OC00OTE1LTk2NDktNTBlND...,script,Decoder-only Language Models Test (Extended),,[queue=gpu_1_queue],broken,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10972...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,,,2024-11-08T07:55:32.725Z,2024-11-08T07:55:18.986Z,,,,,False,,,,,,,,,,,,01930ac5-73c2-4b44-a30a-d389227e1c88,,0.0,,,,,,,,,,,,,,,,,,,,,,,
131,01930ac5-744b-43e7-b0a0-a2d6fc8f3d83,Sm9iLS0tMDE5MzBhYzUtNzQ0Yi00M2U3LWIwYTAtYTJkNm...,script,Decoder-only Multi-Modal Models Test (Extended),,[queue=gpu_1_queue],broken,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10972...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,,,2024-11-08T07:55:32.725Z,2024-11-08T07:55:18.986Z,,,,,False,,,,,,,,,,,,01930ac5-73c4-48be-b953-544bff33abfd,,0.0,,,,,,,,,,,,,,,,,,,,,,,
217,01930aae-1c86-442a-9a8a-c7dfec366c7e,Sm9iLS0tMDE5MzBhYWUtMWM4Ni00NDJhLTlhOGEtYzdkZm...,script,Decoder-only Language Models Test (Extended),,[queue=gpu_1_queue],broken,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10971...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,,,2024-11-08T07:30:02.720Z,2024-11-08T07:29:48.239Z,,,,,False,,,,,,,,,,,,01930aae-19ed-4200-8508-b155c5a409ea,,0.0,,,,,,,,,,,,,,,,,,,,,,,


In [243]:
jobs_df[jobs_df.state=='timed_out'].web_url.values

array(['https://buildkite.com/vllm/ci-aws/builds/10737#0192fb23-9832-44a0-a0f3-0b3e86bb560c'],
      dtype=object)

In [73]:
# Job-level columns kept for the analysis: identity, state, timing, and the
# agent that ran the job.
jobs_useful_columns = [
    'id', 'name', 'state', 'build_url', 'web_url', 'soft_failed',
    'created_at', 'scheduled_at', 'runnable_at', 'started_at',
    'finished_at', 'expired_at', 'retried',
    'agent.id', 'agent.name', 'agent.web_url',
    'agent.connection_state', 'agent.meta_data',
]

In [74]:
j = jobs_df[jobs_useful_columns]
j.head()

Unnamed: 0,id,name,state,build_url,web_url,soft_failed,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,agent.id,agent.name,agent.web_url,agent.connection_state,agent.meta_data
0,01930ac7-eabf-4417-9b4a-c32b16ebbf93,bootstrap,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,False,2024-11-08T07:58:14.140Z,2024-11-08T07:58:14.140Z,2024-11-08T07:58:14.262Z,2024-11-08T07:58:24.406Z,2024-11-08T07:58:30.897Z,,False,01930ac2-5a82-46d1-bd40-e00662898dd0,bk-small-cpu-queue-i-0afc5fac4d3998bab-1,https://buildkite.com/organizations/vllm/unclu...,disconnected,"[queue=small_cpu_queue, docker=25.0.3, stack=b..."
1,01930ac8-2710-46d8-b71f-19499ac4f6c0,:docker: build image,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,False,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,2024-11-08T07:58:30.985Z,2024-11-08T07:59:06.840Z,2024-11-08T07:59:14.173Z,,False,01930ac8-b54e-4bf9-bc96-d468d03a5bd8,bk-cpu-queue-i-016a36d034d4371f0-1,https://buildkite.com/organizations/vllm/unclu...,disconnected,"[queue=cpu_queue, docker=25.0.3, stack=bk-cpu-..."
2,01930ac8-2711-40bb-9095-8d0200ac99ce,Documentation Build,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,False,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,2024-11-08T07:59:14.284Z,2024-11-08T07:59:21.460Z,2024-11-08T08:02:04.563Z,,False,01930ac2-5a82-46d1-bd40-e00662898dd0,bk-small-cpu-queue-i-0afc5fac4d3998bab-1,https://buildkite.com/organizations/vllm/unclu...,disconnected,"[queue=small_cpu_queue, docker=25.0.3, stack=b..."
3,01930ac8-2713-4c6b-9c63-0ff84ea5fd88,"Async Engine, Inputs, Utils, Worker Test",running,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,False,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,2024-11-08T07:59:14.284Z,2024-11-08T07:59:19.928Z,,,False,01930ac3-3869-4e61-9a16-8e1868017585,bk-gpu-1-queue-i-09ba328b2b599fe24-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=gpu_1_queue, docker=25.0.3, stack=bk-gp..."
4,01930ac8-2715-4dc5-89f7-94c655ca096f,Basic Correctness Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10973...,False,2024-11-08T07:58:29.627Z,2024-11-08T07:58:14.140Z,2024-11-08T07:59:14.284Z,2024-11-08T07:59:22.958Z,2024-11-08T08:08:03.625Z,,False,01930a2f-5085-4b88-8483-b27f25dd84db,bk-gpu-1-queue-i-038c116415811468b-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=gpu_1_queue, docker=25.0.3, stack=bk-gp..."


In [75]:
# Join builds to their jobs: a build's `url` matches its jobs' `build_url`.
# Column names present on both sides get a `_build` / `_job` suffix, and the
# outer join keeps rows that have no match on the other side.
builds_without_jobs = d.drop(columns=['jobs'])
jobs_fresh_index = j.reset_index(drop=True)
result_df = builds_without_jobs.merge(
    jobs_fresh_index,
    how='outer',
    left_on='url',
    right_on='build_url',
    suffixes=['_build', '_job'],
)
result_df.shape

(3518, 26)

In [76]:
result_df[result_df.name.isna()].shape # rows without a job name carry little beyond id_job and state_job; these jobs appear to be either blocked or canceled (the row count varies between runs)

(943, 26)

In [78]:
#result_df[(result_df.name.isna())]# & (result_df.state_job=='unblocked')]#.state_job.value_counts()

In [79]:
# Keep only AMD jobs. `na=False` treats rows without a job name as non-matches,
# and `.copy()` makes this an independent frame, so later column assignments do
# not write into a slice of `result_df` (SettingWithCopyWarning).
result_df_amd = result_df[result_df.name.str.contains('AMD', na=False)].copy()
result_df_amd.shape

(429, 26)

In [80]:
result_df_amd.info()

<class 'pandas.core.frame.DataFrame'>
Index: 429 entries, 50 to 2569
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id_build                429 non-null    object 
 1   web_url_build           429 non-null    object 
 2   url                     429 non-null    object 
 3   number                  429 non-null    float64
 4   state_build             429 non-null    object 
 5   cancel_reason           121 non-null    object 
 6   blocked                 429 non-null    object 
 7   blocked_state           429 non-null    object 
 8   id_job                  429 non-null    object 
 9   name                    429 non-null    object 
 10  state_job               429 non-null    object 
 11  build_url               429 non-null    object 
 12  web_url_job             429 non-null    object 
 13  soft_failed             429 non-null    object 
 14  created_at              429 non-null    objec

In [81]:
# Rebind instead of assigning a column into a filtered frame: this avoids
# pandas' SettingWithCopy pitfall, and `number` keeps its column position.
result_df_amd = result_df_amd.assign(number=result_df_amd['number'].astype('int'))

In [83]:
result_df_amd[result_df_amd.started_at.isna()].head()

Unnamed: 0,id_build,web_url_build,url,number,state_build,cancel_reason,blocked,blocked_state,id_job,name,state_job,build_url,web_url_job,soft_failed,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,agent.id,agent.name,agent.web_url,agent.connection_state,agent.meta_data
183,01930948-20c2-4add-9c07-854b81152aae,https://buildkite.com/vllm/ci-aws/builds/10937,https://api.buildkite.com/v2/organizations/vll...,10937,canceled,build_skipping,False,,01930948-5329-4f29-8174-7ee097f16b4e,AMD: Core Test,canceled,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10937...,False,2024-11-08T00:59:14.986Z,2024-11-08T00:59:02.420Z,,,2024-11-08T00:59:53.498Z,,False,,,,,
184,01930948-20c2-4add-9c07-854b81152aae,https://buildkite.com/vllm/ci-aws/builds/10937,https://api.buildkite.com/v2/organizations/vll...,10937,canceled,build_skipping,False,,01930948-532b-4e4e-82cc-b22967f65a04,AMD: Entrypoints Test,canceled,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10937...,True,2024-11-08T00:59:14.986Z,2024-11-08T00:59:02.420Z,,,2024-11-08T00:59:53.498Z,,False,,,,,
185,01930948-20c2-4add-9c07-854b81152aae,https://buildkite.com/vllm/ci-aws/builds/10937,https://api.buildkite.com/v2/organizations/vll...,10937,canceled,build_skipping,False,,01930948-532c-4427-999c-68bf68bd5ac7,AMD: Regression Test,canceled,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10937...,False,2024-11-08T00:59:14.986Z,2024-11-08T00:59:02.420Z,,,2024-11-08T00:59:53.498Z,,False,,,,,
186,01930948-20c2-4add-9c07-854b81152aae,https://buildkite.com/vllm/ci-aws/builds/10937,https://api.buildkite.com/v2/organizations/vll...,10937,canceled,build_skipping,False,,01930948-532d-427c-ae56-5d66572f0b07,AMD: Engine Test,canceled,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10937...,False,2024-11-08T00:59:14.986Z,2024-11-08T00:59:02.420Z,,,2024-11-08T00:59:53.498Z,,False,,,,,
187,01930948-20c2-4add-9c07-854b81152aae,https://buildkite.com/vllm/ci-aws/builds/10937,https://api.buildkite.com/v2/organizations/vll...,10937,canceled,build_skipping,False,,01930948-532f-4112-8fe2-431be1e9c33f,AMD: LogitsProcessor Test,canceled,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10937...,False,2024-11-08T00:59:14.986Z,2024-11-08T00:59:02.420Z,,,2024-11-08T00:59:53.498Z,,False,,,,,


In [105]:
tmp = pd.DataFrame({'runnable_at':['2024-11-08T01:00:00.986Z', '2024-11-08T01:00:00.420Z'],'started_at':['2024-11-08T01:05:00.986Z', None]})
tmp

Unnamed: 0,runnable_at,started_at
0,2024-11-08T01:00:00.986Z,2024-11-08T01:05:00.986Z
1,2024-11-08T01:00:00.420Z,


In [109]:
def calculate_wait_time(df, now=None):
    """Add a ``waited_seconds`` column with the gap between runnable and started.

    For each row:
      * both timestamps present -> ``started_at - runnable_at``
      * only ``runnable_at`` present (job not started) -> ``now - runnable_at``
      * ``runnable_at`` missing -> NaN

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``started_at`` and ``runnable_at`` columns (timestamp
        strings or datetimes). The input frame is not modified.
    now : timestamp-like, optional
        Reference time for jobs that have not started. Defaults to the current
        UTC time; a timezone-naive value is interpreted as UTC.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with both timestamp columns parsed as UTC datetimes
        (unparseable values become NaT) plus the ``waited_seconds`` column.
    """
    if now is None:
        now_utc = pd.Timestamp.now(tz='UTC')
    else:
        now_utc = pd.Timestamp(now)
        if now_utc.tzinfo is None:
            now_utc = now_utc.tz_localize('UTC')
        else:
            now_utc = now_utc.tz_convert('UTC')

    # Work on a copy so the caller's frame (possibly a slice) is left untouched.
    out = df.copy()

    # utc=True keeps both columns timezone-aware even when a column is entirely
    # missing, so they can always be subtracted from each other and from `now`.
    out['started_at'] = pd.to_datetime(out['started_at'], errors='coerce', utc=True)
    out['runnable_at'] = pd.to_datetime(out['runnable_at'], errors='coerce', utc=True)

    # Jobs that have not started yet are still waiting, so measure up to `now`.
    wait_end = out['started_at'].fillna(now_utc)

    # A missing runnable_at propagates NaT through the subtraction, giving NaN.
    out['waited_seconds'] = (wait_end - out['runnable_at']).dt.total_seconds()

    return out





In [107]:
calculate_wait_time(tmp)

2024-11-08 08:45:36.417009+00:00


Unnamed: 0,runnable_at,started_at,waited_seconds
0,2024-11-08 01:00:00.986000+00:00,2024-11-08 01:05:00.986000+00:00,300.0
1,2024-11-08 01:00:00.420000+00:00,NaT,27935.997009


In [110]:
result_df_amd = calculate_wait_time(result_df_amd)

In [111]:
result_df_amd.head()

Unnamed: 0,id_build,web_url_build,url,number,state_build,cancel_reason,blocked,blocked_state,id_job,name,state_job,build_url,web_url_job,soft_failed,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,agent.id,agent.name,agent.web_url,agent.connection_state,agent.meta_data,waited_seconds
50,019308c7-9eda-4459-ab74-28a57a4a6af8,https://buildkite.com/vllm/ci-aws/builds/10935,https://api.buildkite.com/v2/organizations/vll...,10935,failed,,False,,019308c8-52a4-4f75-885d-a276c11c9241,AMD: :docker: build image,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10935...,False,2024-11-07T22:39:26.274Z,2024-11-07T22:38:40.389Z,2024-11-07 22:39:26.620000+00:00,2024-11-07 22:39:34.374000+00:00,2024-11-07T22:59:22.666Z,,False,0190a38f-f51d-4532-a853-7c8afb5dc1e1,rocm-framework-build-04-8,https://buildkite.com/organizations/vllm/unclu...,connected,[queue=amd-cpu],7.754
51,019308c7-9eda-4459-ab74-28a57a4a6af8,https://buildkite.com/vllm/ci-aws/builds/10935,https://api.buildkite.com/v2/organizations/vll...,10935,failed,,False,,019308c8-52a5-4729-a7dc-472b15205b36,AMD: Core Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10935...,False,2024-11-07T22:39:26.274Z,2024-11-07T22:38:40.389Z,2024-11-07 22:59:22.850000+00:00,2024-11-07 22:59:31.037000+00:00,2024-11-07T23:13:46.489Z,,False,01922256-33b6-47ce-bf67-35d0e1bbd089,gpu6931.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",8.187
52,019308c7-9eda-4459-ab74-28a57a4a6af8,https://buildkite.com/vllm/ci-aws/builds/10935,https://api.buildkite.com/v2/organizations/vll...,10935,failed,,False,,019308c8-52a7-439d-b81d-9f61dc78c8ce,AMD: Entrypoints Test,failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10935...,True,2024-11-07T22:39:26.274Z,2024-11-07T22:38:40.389Z,2024-11-07 22:59:22.850000+00:00,2024-11-07 22:59:24.418000+00:00,2024-11-07T23:02:44.565Z,,False,019222ee-371e-400a-b771-597dff74cbff,gpu942a.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,lost,"[queue=amd, docker=true, mi250=true]",1.568
53,019308c7-9eda-4459-ab74-28a57a4a6af8,https://buildkite.com/vllm/ci-aws/builds/10935,https://api.buildkite.com/v2/organizations/vll...,10935,failed,,False,,019308c8-52a8-4fce-aff9-f95c730f2f2d,AMD: Regression Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10935...,False,2024-11-07T22:39:26.274Z,2024-11-07T22:38:40.389Z,2024-11-07 22:59:22.850000+00:00,2024-11-07 22:59:27.277000+00:00,2024-11-07T23:05:06.504Z,,False,0192df24-0cae-4861-af82-7b8941d35e62,gpud525.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",4.427
54,019308c7-9eda-4459-ab74-28a57a4a6af8,https://buildkite.com/vllm/ci-aws/builds/10935,https://api.buildkite.com/v2/organizations/vll...,10935,failed,,False,,019308c8-52a9-4076-8400-b432a287c1c9,AMD: Engine Test,failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/10935...,False,2024-11-07T22:39:26.274Z,2024-11-07T22:38:40.389Z,2024-11-07 22:59:22.850000+00:00,2024-11-07 22:59:26.016000+00:00,2024-11-07T23:02:46.762Z,,False,0192df1f-4752-4421-9969-066f3352ed25,gpud518.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",3.166


In [112]:
result_df_amd['agent.name'].value_counts()

agent.name
gpu6931.jax.cs.cpe.ice.amd.com-1    34
gpu942a.jax.cs.cpe.ice.amd.com-1    34
gpud573.jax.cs.cpe.ice.amd.com-1    33
gpud518.jax.cs.cpe.ice.amd.com-1    31
gpu1100.jax.cs.cpe.ice.amd.com-1    31
gpud525.jax.cs.cpe.ice.amd.com-1    30
gpud523.jax.cs.cpe.ice.amd.com-1    30
gpu9496.jax.cs.cpe.ice.amd.com-1    27
rocm-framework-build-04-8           15
rocm-framework-build-04-4           11
rocm-framework-build-04-3            8
rocm-framework-build-04-5            5
Name: count, dtype: int64

In [256]:
result_df_amd['agent.meta_data'].value_counts()

agent.meta_data
[queue=amd, docker=true, mi250=true]    240
[queue=amd-cpu]                          29
Name: count, dtype: int64

In [257]:
result_df_amd.shape, result_df_amd.id_job.nunique()

((319, 27), 319)

In [258]:
result_df_amd['agent.id'].value_counts()

agent.id
019222ee-371e-400a-b771-597dff74cbff    31
0192df1f-4752-4421-9969-066f3352ed25    31
0192224e-348f-4799-b625-268260226ef9    30
01922256-33b6-47ce-bf67-35d0e1bbd089    30
0192b565-f0f7-43d1-8ce1-6f5e2cf6546e    29
019222e8-aa03-4c67-90a5-2ae2355c5d4e    26
0192df24-0cae-4861-af82-7b8941d35e62    25
0192df22-3df6-449a-a16e-8b362bdf0866    25
0190a38f-f36c-4168-8350-f5f09c6ab9fd    13
0192e85e-63f9-49d3-bb43-52e9f61b361f    13
0190a38f-f28c-47f6-90ef-45f51f2007b6     7
0190a38f-f440-4ea5-a6cd-6e312bdb8739     7
0190a38f-f3d8-4fa6-84e7-7cff9408907c     2
Name: count, dtype: int64

In [113]:

# Per agent, count the distinct builds in which it had a hard (not soft-failed) job failure.
hard_failed_mask = (result_df_amd.state_job == 'failed') & (result_df_amd.soft_failed == False)
agent_keys = ['agent.id', 'agent.name', 'agent.web_url']  # 'agent.connection_state', 'agent.meta_data'
failed_jobs_from_diff_builds = (
    result_df_amd[hard_failed_mask]
    .groupby(agent_keys, as_index=False)
    .id_build.nunique()
)

In [114]:
failed_jobs_from_diff_builds

Unnamed: 0,agent.id,agent.name,agent.web_url,id_build
0,0190a38f-f2ea-4a5f-8589-69ba1d6a7d4a,rocm-framework-build-04-3,https://buildkite.com/organizations/vllm/unclu...,1
1,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,2
2,0190a38f-f51d-4532-a853-7c8afb5dc1e1,rocm-framework-build-04-8,https://buildkite.com/organizations/vllm/unclu...,2
3,019222e8-aa03-4c67-90a5-2ae2355c5d4e,gpu1100.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1
4,019222ee-371e-400a-b771-597dff74cbff,gpu942a.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1
5,0192df1f-4752-4421-9969-066f3352ed25,gpud518.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1


In [125]:
tmp = pd.DataFrame({'runnable_at':['2024-11-08T01:00:00.986Z', '2024-11-08T01:00:00.420Z'],'started_at':['2024-11-08T01:05:00.986Z', None], 'waited_seconds':[14401, 300], 'number':[1,2], 'web_url_job':['sdsd', 'dsf'], 'name':['name1', 'name2'], 'agent.name':['agent1', 'agent2']})
tmp

Unnamed: 0,runnable_at,started_at,waited_seconds,number,web_url_job,name,agent.name
0,2024-11-08T01:00:00.986Z,2024-11-08T01:05:00.986Z,14401,1,sdsd,name1,agent1
1,2024-11-08T01:00:00.420Z,,300,2,dsf,name2,agent2


In [None]:
def alert(df, wait_time_thr=WAITING_TIME_ALERT_THR, agent_failed_builds_thr=AGENT_FAILED_BUILDS_THR):
    """Build the list of alert messages for long-waiting jobs and unhealthy agents.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per job. Waiting-time alerts read ``waited_seconds``, ``name``,
        ``number``, ``web_url_job`` and ``agent.name``. Agent-health alerts
        additionally read ``state_job``, ``soft_failed``, ``agent.id``,
        ``agent.web_url`` and ``id_build``; when any of these columns is
        missing, the agent-health section is skipped.
    wait_time_thr : int or float
        Jobs that waited strictly longer than this many seconds are reported.
    agent_failed_builds_thr : int
        An agent is reported when it has hard-failed jobs in at least this
        many distinct builds.

    Returns
    -------
    list of str
        Waiting-time messages first, then (if any agent is unhealthy) a single
        ``'Agent alerts:'`` header followed by one message per agent.
    """
    alerts = []

    # Waiting-time alerts. NaN waits compare as False and are never reported.
    long_waits = df[df['waited_seconds'] > wait_time_thr]
    for _, row in long_waits.iterrows():
        alert_message = f"Job {row['name']} from build number {row['number']} waited for {row['waited_seconds']} seconds (more than {wait_time_thr} seconds or {wait_time_thr/3600} hours). More info at {row['web_url_job']}, agent {row['agent.name']}"
        alerts.append(alert_message)

    # Agent-health alerts need job/agent columns; skip them for frames that
    # only carry waiting-time information.
    agent_columns = {'state_job', 'soft_failed', 'agent.id', 'agent.name', 'agent.web_url', 'id_build'}
    if not agent_columns.issubset(df.columns):
        return alerts

    # `== False` (not `~`) is deliberate: the comparison is element-wise and
    # stays False for missing values in an object-dtype column.
    hard_failures = df[(df['state_job'] == 'failed') & (df['soft_failed'] == False)]
    failed_jobs_from_diff_builds = hard_failures.groupby(
        ['agent.id', 'agent.name', 'agent.web_url'], as_index=False
    ).agg(unique_builds=('id_build', 'nunique'))
    unhealthy_agents = failed_jobs_from_diff_builds[
        failed_jobs_from_diff_builds['unique_builds'] >= agent_failed_builds_thr
    ]

    # Emit the section header once, not once per unhealthy agent.
    if not unhealthy_agents.empty:
        alerts.append('Agent alerts:')
    for _, row in unhealthy_agents.iterrows():
        alert_message = f"Agent {row['agent.name']} has failed jobs from {row['unique_builds']} unique builds. More info at {row['agent.web_url']}"
        alerts.append(alert_message)

    return alerts

In [None]:

#alerts = alert(tmp)
#alerts

['Job name1 from build number 1 waited for 14401 seconds (more than 14400 seconds or 4.0 hours). More info at sdsd, agent agent1']

In [129]:

alerts = alert(result_df_amd)
alerts

[]

In [118]:
#result_df_amd[result_df_amd.started_at.isna()][['number', 'name', 'state_job','scheduled_at','runnable_at', 'started_at', 'waited_seconds']] # doesn't seem to have nan values in only one of runnable_at or started_at
