In [327]:
import pandas as pd
from dotenv import load_dotenv
import numpy as np

import os
import json
import requests
from datetime import datetime
import zoneinfo
import warnings


pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
warnings.filterwarnings('ignore')


In [328]:
import mysql.connector
from mysql.connector import errorcode


In [329]:
# Load environment variables from .env file
load_dotenv()

True

In [330]:
BUILDKITE_API_TOKEN = os.getenv('BUILDKITE_API_TOKEN')
ORGANIZATION_SLUG='vllm'
PIPELINE_SLUG = 'ci-aws'
TODAY = (datetime.utcnow() - pd.Timedelta(days=1)).strftime('%Y-%m-%dT22:00:00Z') # it is UTC, so -2 hours from Finnish local time
WAITING_TIME_ALERT_THR = 14400 # 4 hours
AGENT_FAILED_BUILDS_THR = 3 # agents declaired unhealthy if they have failed jobs from >=3 unique builds

MYSQL_USER = os.getenv('MYSQL_USER')
MYSQL_PSSWD = os.getenv('MYSQL_PSSWD')


#TODAY_cutoff = (datetime.utcnow()).strftime('%Y-%m-%dT07:00:00Z')

In [305]:
#connection = mysql.connector.connect(user=MYSQL_USER,password=MYSQL_PSSWD, database='bk_monitor')
#cursor = connection.cursor()


In [306]:
TODAY

'2024-11-10T22:00:00Z'

In [331]:
params = {
    'created_from': TODAY,
    'per_page': 100,
    #'created_to': TODAY_cutoff,

}

In [332]:
def fetch_data(params, token=BUILDKITE_API_TOKEN, org_slug=ORGANIZATION_SLUG, pipe_slug=PIPELINE_SLUG):
    # Set the URL
    url = f"https://api.buildkite.com/v2/organizations/{org_slug}/pipelines/{pipe_slug}/builds"


    # Set the headers
    headers = {
        'Authorization': f'Bearer {token}'
    }
    
    # Make the GET request
    response = requests.get(url, headers=headers, params=params)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()
        return pd.json_normalize(data)
    else:
        print(f"Request failed with status code {response.status_code}")
        return pd.DataFrame()

In [333]:
df = fetch_data(params)

In [358]:
def write_fetch_log(df, path='/mnt/home/buildkite_logs/'):
    df.to_csv(path + 'fetch_' + datetime.now(zoneinfo.ZoneInfo('Europe/Helsinki')).isoformat() + '.csv')
    return None

In [335]:
write_fetch_log(df)

ArrowNotImplementedError: Cannot write struct type 'env' with no child field to Parquet. Consider adding a dummy child field.

In [336]:
df.created_at.min()

'2024-11-10T23:08:59.850Z'

In [337]:
def types_fix(df, jobs=False):
    df['number'] = df['number'].astype('int')
    df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
    df['scheduled_at'] = pd.to_datetime(df['scheduled_at'], errors='coerce')
    df['started_at'] = pd.to_datetime(df['started_at'], errors='coerce')   
    df['finished_at'] = pd.to_datetime(df['finished_at'], errors='coerce')
    if jobs:
        df['soft_failed'] = df['soft_failed'].astype('bool')
        df['runnable_at'] = pd.to_datetime(df['runnable_at'], errors='coerce')
        df['expired_at'] = pd.to_datetime(df['expired_at'], errors='coerce')
        df.columns = df.columns.str.replace('.', '_')


    return df    


In [338]:

df = types_fix(df, jobs=False)
df.head()

Unnamed: 0,id,graphql_id,url,web_url,number,state,cancel_reason,blocked,blocked_state,message,commit,branch,tag,source,creator,created_at,scheduled_at,started_at,finished_at,rebuilt_from,jobs,cluster_id,cluster_url,author.name,author.username,author.email,meta_data.buildkite:git:commit,pull_request.id,pull_request.base,pull_request.repository,pipeline.id,pipeline.graphql_id,pipeline.url,pipeline.web_url,pipeline.name,pipeline.description,pipeline.slug,pipeline.repository,pipeline.cluster_id,pipeline.pipeline_template_uuid,pipeline.branch_configuration,pipeline.default_branch,pipeline.skip_queued_branch_builds,pipeline.skip_queued_branch_builds_filter,pipeline.cancel_running_branch_builds,pipeline.cancel_running_branch_builds_filter,pipeline.allow_rebuilds,pipeline.provider.id,pipeline.provider.settings.build_branches,pipeline.provider.settings.build_merge_group_checks_requested,pipeline.provider.settings.build_pull_request_base_branch_changed,pipeline.provider.settings.build_pull_request_forks,pipeline.provider.settings.build_pull_request_labels_changed,pipeline.provider.settings.build_pull_request_ready_for_review,pipeline.provider.settings.build_pull_requests,pipeline.provider.settings.build_tags,pipeline.provider.settings.cancel_deleted_branch_builds,pipeline.provider.settings.filter_enabled,pipeline.provider.settings.prefix_pull_request_fork_branch_names,pipeline.provider.settings.publish_blocked_as_pending,pipeline.provider.settings.publish_commit_status_per_step,pipeline.provider.settings.publish_commit_status,pipeline.provider.settings.pull_request_branch_filter_enabled,pipeline.provider.settings.separate_pull_request_statuses,pipeline.provider.settings.skip_builds_for_existing_commits,pipeline.provider.settings.skip_pull_request_builds_for_existing_commits,pipeline.provider.settings.trigger_mode,pipeline.provider.settings.use_step_key_as_commit_status,pipeline.provider.settings.repository,pipeline.provider.settings.pull_request_branch_filter_configuration,pipeline.provider.settings.filter_condition,pipeline.provider.webhook_url,pipeline.builds_url,pipeline.badge_url,pipeline.created_by.id,pipeline.created_by.graphql_id,pipeline.created_by.name,pipeline.created_by.email,pipeline.created_by.avatar_url,pipeline.created_by.created_at,pipeline.created_at,pipeline.archived_at,pipeline.env,pipeline.scheduled_builds_count,pipeline.running_builds_count,pipeline.scheduled_jobs_count,pipeline.running_jobs_count,pipeline.waiting_jobs_count,pipeline.visibility,pipeline.tags,pipeline.emoji,pipeline.color,pipeline.configuration,pipeline.steps,pipeline.cluster_url,pull_request,creator.id,creator.graphql_id,creator.name,creator.email,creator.avatar_url,creator.created_at
0,01931af6-9aa0-4295-a78b-fd8c41daf820,QnVpbGQtLS0wMTkzMWFmNi05YWEwLTQyOTUtYTc4Yi1mZD...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072,11072,running,,False,,remove intentional failed test\n\nSigned-off-b...,bb152d7a9ca0f380bd90d6b92ef8029c79e478e3,Isotr0py:fix-cpu-enc-dec,,webhook,,2024-11-11 11:23:09.420000+00:00,2024-11-11 11:23:09.330000+00:00,2024-11-11 11:23:39.389000+00:00,NaT,,[{'id': '01931af6-9ae8-43bc-8953-c97d54db1d65'...,,,Isotr0py,Isotr0py,2037008807@qq.com,commit bb152d7a9ca0f380bd90d6b92ef8029c79e478e...,10218.0,main,https://github.com/Isotr0py/vllm.git,018fac60-6fa2-4681-9714-0ffcf8e619e9,UGlwZWxpbmUtLS0wMThmYWM2MC02ZmEyLTQ2ODEtOTcxNC...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws,CI AWS,CI testing of vLLM repo running on AWS,ci-aws,https://github.com/vllm-project/vllm.git,,,main,main,True,!main,True,!main,True,github,True,False,False,True,True,False,True,False,False,True,True,False,True,True,False,True,False,True,code,False,vllm-project/vllm,,"(build.pull_request.labels includes ""ready"" &&...",https://webhook.buildkite.com/deliver/3e7ddff5...,https://api.buildkite.com/v2/organizations/vll...,https://badge.buildkite.com/0f7dfe72085fa6fc9e...,018d3825-2689-4a67-9591-48c416f0fb8d,VXNlci0tLTAxOGQzODI1LTI2ODktNGE2Ny05NTkxLTQ4Yz...,Kevin Luu,kevin@anyscale.com,https://www.gravatar.com/avatar/ac89bf812fbcfd...,2024-01-23T21:06:15.304Z,2024-05-24T20:52:32.290Z,,,0,1,0,28,0,public,,:aws:,,"steps:\n - label: ""bootstrap""\n key: boots...","[{'type': 'script', 'name': 'bootstrap', 'comm...",,,,,,,,
1,01931aef-3d9e-47ee-a471-f9ef3bc95e9c,QnVpbGQtLS0wMTkzMWFlZi0zZDllLTQ3ZWUtYTQ3MS1mOW...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11071,11071,canceled,build_skipping,False,,change intentional failed test place\n\nSigned...,7cf4c36bd0d42c7cc3c8546cc56d1734fe010ab8,Isotr0py:fix-cpu-enc-dec,,webhook,,2024-11-11 11:15:06.832000+00:00,2024-11-11 11:15:06.772000+00:00,2024-11-11 11:15:43.377000+00:00,2024-11-11 11:23:09.714000+00:00,,[{'id': '01931aef-3dce-4d52-9cc6-399420787dd1'...,,,Isotr0py,Isotr0py,2037008807@qq.com,commit 7cf4c36bd0d42c7cc3c8546cc56d1734fe010ab...,10218.0,main,https://github.com/Isotr0py/vllm.git,018fac60-6fa2-4681-9714-0ffcf8e619e9,UGlwZWxpbmUtLS0wMThmYWM2MC02ZmEyLTQ2ODEtOTcxNC...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws,CI AWS,CI testing of vLLM repo running on AWS,ci-aws,https://github.com/vllm-project/vllm.git,,,main,main,True,!main,True,!main,True,github,True,False,False,True,True,False,True,False,False,True,True,False,True,True,False,True,False,True,code,False,vllm-project/vllm,,"(build.pull_request.labels includes ""ready"" &&...",https://webhook.buildkite.com/deliver/3e7ddff5...,https://api.buildkite.com/v2/organizations/vll...,https://badge.buildkite.com/0f7dfe72085fa6fc9e...,018d3825-2689-4a67-9591-48c416f0fb8d,VXNlci0tLTAxOGQzODI1LTI2ODktNGE2Ny05NTkxLTQ4Yz...,Kevin Luu,kevin@anyscale.com,https://www.gravatar.com/avatar/ac89bf812fbcfd...,2024-01-23T21:06:15.304Z,2024-05-24T20:52:32.290Z,,,0,1,0,28,0,public,,:aws:,,"steps:\n - label: ""bootstrap""\n key: boots...","[{'type': 'script', 'name': 'bootstrap', 'comm...",,,,,,,,
2,01931ae9-ff5a-4609-aff7-b1df62876773,QnVpbGQtLS0wMTkzMWFlOS1mZjVhLTQ2MDktYWZmNy1iMW...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11070,11070,canceled,build_skipping,False,,code format\n\nSigned-off-by: Isotr0py <203700...,009c0086a38c65d5f1decd83b2a98d1f9bc0b953,Isotr0py:fix-cpu-enc-dec,,webhook,,2024-11-11 11:09:23.227000+00:00,2024-11-11 11:09:23.152000+00:00,2024-11-11 11:09:32.691000+00:00,2024-11-11 11:15:07.046000+00:00,,[{'id': '01931ae9-ff95-43dd-964a-30ca8ba5b343'...,,,DarkLight1337,DarkLight1337,tlleungac@connect.ust.hk,commit 009c0086a38c65d5f1decd83b2a98d1f9bc0b95...,10218.0,main,https://github.com/Isotr0py/vllm.git,018fac60-6fa2-4681-9714-0ffcf8e619e9,UGlwZWxpbmUtLS0wMThmYWM2MC02ZmEyLTQ2ODEtOTcxNC...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws,CI AWS,CI testing of vLLM repo running on AWS,ci-aws,https://github.com/vllm-project/vllm.git,,,main,main,True,!main,True,!main,True,github,True,False,False,True,True,False,True,False,False,True,True,False,True,True,False,True,False,True,code,False,vllm-project/vllm,,"(build.pull_request.labels includes ""ready"" &&...",https://webhook.buildkite.com/deliver/3e7ddff5...,https://api.buildkite.com/v2/organizations/vll...,https://badge.buildkite.com/0f7dfe72085fa6fc9e...,018d3825-2689-4a67-9591-48c416f0fb8d,VXNlci0tLTAxOGQzODI1LTI2ODktNGE2Ny05NTkxLTQ4Yz...,Kevin Luu,kevin@anyscale.com,https://www.gravatar.com/avatar/ac89bf812fbcfd...,2024-01-23T21:06:15.304Z,2024-05-24T20:52:32.290Z,,,0,1,0,28,0,public,,:aws:,,"steps:\n - label: ""bootstrap""\n key: boots...","[{'type': 'script', 'name': 'bootstrap', 'comm...",,,,,,,,
3,01931aab-b30e-4c69-8f4d-045120bfc2b5,QnVpbGQtLS0wMTkzMWFhYi1iMzBlLTRjNjktOGY0ZC0wND...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11069,11069,passed,,True,,[V1] Allow `tokenizer_mode` and `trust_remote_...,5fb1f935b04c29c5c379952681a8a49ad533355d,main,,webhook,,2024-11-11 10:01:20.474000+00:00,2024-11-11 10:01:20.380000+00:00,2024-11-11 10:02:04.094000+00:00,2024-11-11 11:24:08.624000+00:00,,[{'id': '01931aab-b358-4201-8479-a7226308bb95'...,,,Roger Wang,ywang96,136131678+ywang96@users.noreply.github.com,commit 5fb1f935b04c29c5c379952681a8a49ad533355...,,,,018fac60-6fa2-4681-9714-0ffcf8e619e9,UGlwZWxpbmUtLS0wMThmYWM2MC02ZmEyLTQ2ODEtOTcxNC...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws,CI AWS,CI testing of vLLM repo running on AWS,ci-aws,https://github.com/vllm-project/vllm.git,,,main,main,True,!main,True,!main,True,github,True,False,False,True,True,False,True,False,False,True,True,False,True,True,False,True,False,True,code,False,vllm-project/vllm,,"(build.pull_request.labels includes ""ready"" &&...",https://webhook.buildkite.com/deliver/3e7ddff5...,https://api.buildkite.com/v2/organizations/vll...,https://badge.buildkite.com/0f7dfe72085fa6fc9e...,018d3825-2689-4a67-9591-48c416f0fb8d,VXNlci0tLTAxOGQzODI1LTI2ODktNGE2Ny05NTkxLTQ4Yz...,Kevin Luu,kevin@anyscale.com,https://www.gravatar.com/avatar/ac89bf812fbcfd...,2024-01-23T21:06:15.304Z,2024-05-24T20:52:32.290Z,,,0,1,0,28,0,public,,:aws:,,"steps:\n - label: ""bootstrap""\n key: boots...","[{'type': 'script', 'name': 'bootstrap', 'comm...",,,,,,,,
4,01931a9b-4e5c-4c8c-bbf9-38704b947ff1,QnVpbGQtLS0wMTkzMWE5Yi00ZTVjLTRjOGMtYmJmOS0zOD...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11068,11068,failed,,False,,[LoRA][Kernel] Remove the unused libentry modu...,36e4acd02a955f71ebb7b220cbfae4a4379bc57b,main,,webhook,,2024-11-11 09:43:26.108000+00:00,2024-11-11 09:43:26.032000+00:00,2024-11-11 09:43:30.122000+00:00,2024-11-11 10:47:26.842000+00:00,,[{'id': '01931a9b-4e9b-4c93-8e4d-b5287f77b8e5'...,,,Jee Jee Li,jeejeelee,pandaleefree@gmail.com,commit 36e4acd02a955f71ebb7b220cbfae4a4379bc57...,,,,018fac60-6fa2-4681-9714-0ffcf8e619e9,UGlwZWxpbmUtLS0wMThmYWM2MC02ZmEyLTQ2ODEtOTcxNC...,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws,CI AWS,CI testing of vLLM repo running on AWS,ci-aws,https://github.com/vllm-project/vllm.git,,,main,main,True,!main,True,!main,True,github,True,False,False,True,True,False,True,False,False,True,True,False,True,True,False,True,False,True,code,False,vllm-project/vllm,,"(build.pull_request.labels includes ""ready"" &&...",https://webhook.buildkite.com/deliver/3e7ddff5...,https://api.buildkite.com/v2/organizations/vll...,https://badge.buildkite.com/0f7dfe72085fa6fc9e...,018d3825-2689-4a67-9591-48c416f0fb8d,VXNlci0tLTAxOGQzODI1LTI2ODktNGE2Ny05NTkxLTQ4Yz...,Kevin Luu,kevin@anyscale.com,https://www.gravatar.com/avatar/ac89bf812fbcfd...,2024-01-23T21:06:15.304Z,2024-05-24T20:52:32.290Z,,,0,1,0,28,0,public,,:aws:,,"steps:\n - label: ""bootstrap""\n key: boots...","[{'type': 'script', 'name': 'bootstrap', 'comm...",,,,,,,,


In [339]:
df.created_at.min(), df.created_at.max(), df.number.min(), df.number.max(), df.shape

(Timestamp('2024-11-10 23:08:59.850000+0000', tz='UTC'),
 Timestamp('2024-11-11 11:23:09.420000+0000', tz='UTC'),
 11049,
 11072,
 (24, 102))

In [340]:
df.shape

(24, 102)

In [254]:
df.state.value_counts()

state
passed      10
failed       4
running      3
canceled     2
Name: count, dtype: int64

In [341]:
useful_columns = ['id', 'web_url', 'url', 'number', 'state', 'cancel_reason', 'blocked', 'blocked_state', 'jobs']

In [342]:
d = df[useful_columns]

In [343]:
jobs_df = pd.json_normalize(df['jobs'].explode())
jobs_df.head()

Unnamed: 0,id,graphql_id,type,name,step_key,agent_query_rules,state,build_url,web_url,log_url,raw_log_url,artifacts_url,command,soft_failed,exit_status,artifact_paths,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,retried_in_job_id,retries_count,retry_source,retry_type,parallel_group_index,parallel_group_total,matrix,cluster_id,cluster_url,cluster_queue_id,cluster_queue_url,step.id,step.signature,priority.number,agent.id,agent.url,agent.web_url,agent.name,agent.connection_state,agent.ip_address,agent.hostname,agent.user_agent,agent.version,agent.creator,agent.created_at,agent.job,agent.last_job_finished_at,agent.priority,agent.meta_data,agent.cluster_url,agent.cluster_queue_url,label,unblocked_by,unblocked_at,unblockable,unblock_url,agent,retry_source.job_id,retry_source.retry_type
0,01931af6-9ae8-43bc-8953-c97d54db1d65,Sm9iLS0tMDE5MzFhZjYtOWFlOC00M2JjLTg5NTMtYzk3ZD...,script,bootstrap,bootstrap,[queue=small_cpu_queue],passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,"if [[ -n """" ]]; then VLLM_CI_BRANCH= curl -sSL...",False,0.0,,2024-11-11T11:23:09.392Z,2024-11-11T11:23:09.392Z,2024-11-11T11:23:09.525Z,2024-11-11T11:23:39.389Z,2024-11-11T11:23:52.051Z,,False,,,,,,,,,,,,01931af6-9aca-4ad2-95e2-80ee0256ddab,,0.0,01931af7-0d4e-40c5-80d0-009d92f9b1d1,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-small-cpu-queue-i-07d0e5274247e1ed7-1,disconnected,54.202.220.5,ip-10-0-29-102.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-11T11:23:38.702Z,,2024-11-11T11:30:28.399Z,0.0,"[queue=small_cpu_queue, docker=25.0.3, stack=b...",,,,,,,,,,
1,01931af7-3c52-444d-b5ad-02c0cfdebe9d,Sm9iLS0tMDE5MzFhZjctM2M1Mi00NDRkLWI1YWQtMDJjMG...,script,:docker: build image,image-build,[queue=cpu_queue],passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,aws ecr-public get-login-password --region us-...,False,0.0,,2024-11-11T11:23:50.717Z,2024-11-11T11:23:09.392Z,2024-11-11T11:23:52.135Z,2024-11-11T11:23:54.216Z,2024-11-11T11:38:14.497Z,,False,,,,,,,,,,,,01931af7-3bd8-4b36-9c62-1fe92c1048e9,,0.0,01931aea-f28f-4725-99ef-561dc6b0f750,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-cpu-queue-i-0d9bbe33673d32b47-1,disconnected,18.237.80.71,ip-10-0-95-59.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-11T11:10:25.423Z,,2024-11-11T11:38:14.508Z,0.0,"[queue=cpu_queue, docker=25.0.3, stack=bk-cpu-...",,,,,,,,,,
2,01931af7-3c53-47a6-9d64-35ba8f97dea6,Sm9iLS0tMDE5MzFhZjctM2M1My00N2E2LTlkNjQtMzViYT...,script,Documentation Build,,[queue=small_cpu_queue],passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,0.0,,2024-11-11T11:23:50.717Z,2024-11-11T11:23:09.392Z,2024-11-11T11:38:14.588Z,2024-11-11T11:38:59.577Z,2024-11-11T11:41:48.794Z,,False,,,,,,,,,,,,01931af7-3bd9-405b-bdee-78eeba464909,,0.0,01931b05-17b0-4eca-b71a-c6f0e2164040,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-small-cpu-queue-i-0f3a4c90bf933bfe9-1,connected,18.236.230.99,ip-10-0-136-5.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-11T11:38:58.864Z,,2024-11-11T11:41:48.807Z,0.0,"[queue=small_cpu_queue, docker=25.0.3, stack=b...",,,,,,,,,,
3,01931af7-3c55-43b3-9a67-00736f7df514,Sm9iLS0tMDE5MzFhZjctM2M1NS00M2IzLTlhNjctMDA3Mz...,script,"Async Engine, Inputs, Utils, Worker Test",,[queue=gpu_1_queue],running,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,,,2024-11-11T11:23:50.717Z,2024-11-11T11:23:09.392Z,2024-11-11T11:38:14.588Z,2024-11-11T11:38:23.170Z,,,False,,,,,,,,,,,,01931af7-3bda-4a11-8044-53950859c291,,0.0,01931afb-ca6c-48c0-88bf-fbf013739644,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-gpu-1-queue-i-0993b07063b6598e6-1,connected,35.89.67.44,ip-10-0-170-191.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-11T11:28:49.260Z,,2024-11-11T11:37:42.560Z,0.0,"[queue=gpu_1_queue, docker=25.0.3, stack=bk-gp...",,,,,,,,,,
4,01931af7-3c56-40de-8761-f233b1b33978,Sm9iLS0tMDE5MzFhZjctM2M1Ni00MGRlLTg3NjEtZjIzM2...,script,Basic Correctness Test,,[queue=gpu_1_queue],running,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,,False,,,2024-11-11T11:23:50.717Z,2024-11-11T11:23:09.392Z,2024-11-11T11:38:14.588Z,2024-11-11T11:38:22.093Z,,,False,,,,,,,,,,,,01931af7-3bda-4faa-99e5-4c926f1b4c95,,0.0,01931afb-888c-40b2-8074-47bc138a898e,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,bk-gpu-1-queue-i-09242f67ccabcb9b2-1,connected,44.244.31.185,ip-10-0-92-92.us-west-2.compute.internal,buildkite-agent/3.73.1.8506 (linux; amd64),3.73.1,,2024-11-11T11:28:32.396Z,,2024-11-11T11:37:11.565Z,0.0,"[queue=gpu_1_queue, docker=25.0.3, stack=bk-gp...",,,,,,,,,,


In [344]:
jobs_df.build_url.nunique(), jobs_df.id.nunique(), jobs_df.shape, jobs_df.state.value_counts()

(24,
 2095,
 (2095, 62),
 state
 blocked           934
 passed            702
 canceled          271
 waiting_failed     56
 failed             53
 broken             48
 running            26
 blocked_failed      4
 timed_out           1
 Name: count, dtype: int64)

In [259]:
jobs_df[jobs_df.id=='019319d4-ee12-47fd-a621-aa7d87e22b70']

Unnamed: 0,id,graphql_id,type,name,step_key,agent_query_rules,state,build_url,web_url,log_url,raw_log_url,artifacts_url,command,soft_failed,exit_status,artifact_paths,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,retried_in_job_id,retries_count,retry_source,retry_type,parallel_group_index,parallel_group_total,matrix,cluster_id,cluster_url,cluster_queue_id,cluster_queue_url,step.id,step.signature,priority.number,agent.id,agent.url,agent.web_url,agent.name,agent.connection_state,agent.ip_address,agent.hostname,agent.user_agent,agent.version,agent.creator,agent.created_at,agent.job,agent.last_job_finished_at,agent.priority,agent.meta_data,agent.cluster_url,agent.cluster_queue_url,label,unblocked_by,unblocked_at,unblockable,unblock_url,agent,retry_source.job_id,retry_source.retry_type
1375,019319d4-ee12-47fd-a621-aa7d87e22b70,Sm9iLS0tMDE5MzE5ZDQtZWUxMi00N2ZkLWE2MjEtYWE3ZD...,script,AMD: :docker: build image,amd-build,[queue=amd-cpu],passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,https://api.buildkite.com/v2/organizations/vll...,docker build --build-arg max_jobs=16 --tag roc...,False,0.0,,2024-11-11T06:06:45.247Z,2024-11-11T00:10:55.637Z,2024-11-11T06:06:45.434Z,2024-11-11T06:06:46.638Z,2024-11-11T06:26:18.983Z,,False,,1.0,,,,,,,,,,0193188f-edba-405b-bbfe-42ba6b8eac70,,0.0,0190a38f-f36c-4168-8350-f5f09c6ab9fd,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/organizations/vllm/unclu...,rocm-framework-build-04-4,connected,165.204.78.2,rocm-framework-build-04,buildkite-agent/3.74.1.8727 (linux; amd64),3.74.1,,2024-07-11T20:50:38.572Z,,2024-11-11T07:40:43.857Z,0.0,[queue=amd-cpu],,,,,,,,,0193188f-ee60-48ee-b65b-ee4c54b2565f,manual


In [260]:
jobs_df[jobs_df.id=='0193188f-ee60-48ee-b65b-ee4c54b2565f']

Unnamed: 0,id,graphql_id,type,name,step_key,agent_query_rules,state,build_url,web_url,log_url,raw_log_url,artifacts_url,command,soft_failed,exit_status,artifact_paths,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,retried_in_job_id,retries_count,retry_source,retry_type,parallel_group_index,parallel_group_total,matrix,cluster_id,cluster_url,cluster_queue_id,cluster_queue_url,step.id,step.signature,priority.number,agent.id,agent.url,agent.web_url,agent.name,agent.connection_state,agent.ip_address,agent.hostname,agent.user_agent,agent.version,agent.creator,agent.created_at,agent.job,agent.last_job_finished_at,agent.priority,agent.meta_data,agent.cluster_url,agent.cluster_queue_url,label,unblocked_by,unblocked_at,unblockable,unblock_url,agent,retry_source.job_id,retry_source.retry_type


In [345]:
jobs_useful_columns = ['id', 'name', 'state', 'build_url', 'web_url', 'soft_failed', 'created_at', 'scheduled_at', 'runnable_at', 'started_at',	'finished_at', 'expired_at', 'retried', 'agent.id', 'agent.name', 'agent.web_url', 'agent.connection_state', 'agent.meta_data']

In [346]:
j = jobs_df[jobs_useful_columns]
j.head()

Unnamed: 0,id,name,state,build_url,web_url,soft_failed,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,agent.id,agent.name,agent.web_url,agent.connection_state,agent.meta_data
0,01931af6-9ae8-43bc-8953-c97d54db1d65,bootstrap,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,False,2024-11-11T11:23:09.392Z,2024-11-11T11:23:09.392Z,2024-11-11T11:23:09.525Z,2024-11-11T11:23:39.389Z,2024-11-11T11:23:52.051Z,,False,01931af7-0d4e-40c5-80d0-009d92f9b1d1,bk-small-cpu-queue-i-07d0e5274247e1ed7-1,https://buildkite.com/organizations/vllm/unclu...,disconnected,"[queue=small_cpu_queue, docker=25.0.3, stack=b..."
1,01931af7-3c52-444d-b5ad-02c0cfdebe9d,:docker: build image,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,False,2024-11-11T11:23:50.717Z,2024-11-11T11:23:09.392Z,2024-11-11T11:23:52.135Z,2024-11-11T11:23:54.216Z,2024-11-11T11:38:14.497Z,,False,01931aea-f28f-4725-99ef-561dc6b0f750,bk-cpu-queue-i-0d9bbe33673d32b47-1,https://buildkite.com/organizations/vllm/unclu...,disconnected,"[queue=cpu_queue, docker=25.0.3, stack=bk-cpu-..."
2,01931af7-3c53-47a6-9d64-35ba8f97dea6,Documentation Build,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,False,2024-11-11T11:23:50.717Z,2024-11-11T11:23:09.392Z,2024-11-11T11:38:14.588Z,2024-11-11T11:38:59.577Z,2024-11-11T11:41:48.794Z,,False,01931b05-17b0-4eca-b71a-c6f0e2164040,bk-small-cpu-queue-i-0f3a4c90bf933bfe9-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=small_cpu_queue, docker=25.0.3, stack=b..."
3,01931af7-3c55-43b3-9a67-00736f7df514,"Async Engine, Inputs, Utils, Worker Test",running,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,False,2024-11-11T11:23:50.717Z,2024-11-11T11:23:09.392Z,2024-11-11T11:38:14.588Z,2024-11-11T11:38:23.170Z,,,False,01931afb-ca6c-48c0-88bf-fbf013739644,bk-gpu-1-queue-i-0993b07063b6598e6-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=gpu_1_queue, docker=25.0.3, stack=bk-gp..."
4,01931af7-3c56-40de-8761-f233b1b33978,Basic Correctness Test,running,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11072...,False,2024-11-11T11:23:50.717Z,2024-11-11T11:23:09.392Z,2024-11-11T11:38:14.588Z,2024-11-11T11:38:22.093Z,,,False,01931afb-888c-40b2-8074-47bc138a898e,bk-gpu-1-queue-i-09242f67ccabcb9b2-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=gpu_1_queue, docker=25.0.3, stack=bk-gp..."


In [347]:
result_df = d.drop(columns=['jobs']).merge(j.reset_index(drop=True), left_on='url', right_on='build_url', suffixes=['_build', '_job'], how='outer')
result_df.shape 

(2095, 26)

In [264]:
result_df[result_df.name.isna()].shape # many jobs have no name or nothing really except for id_job, state_job, these jobs are either blocked or canceled

(398, 26)

In [265]:
#result_df[(result_df.name.isna())]# & (result_df.state_job=='unblocked')]#.state_job.value_counts()

In [348]:
result_df_amd = result_df[(result_df.name.notna()) & (result_df.name.str.contains('AMD'))]
result_df_amd.shape

(264, 26)

In [349]:
result_df_amd.info()

<class 'pandas.core.frame.DataFrame'>
Index: 264 entries, 50 to 1578
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id_build                264 non-null    object 
 1   web_url_build           264 non-null    object 
 2   url                     264 non-null    object 
 3   number                  264 non-null    float64
 4   state_build             264 non-null    object 
 5   cancel_reason           44 non-null     object 
 6   blocked                 264 non-null    object 
 7   blocked_state           264 non-null    object 
 8   id_job                  264 non-null    object 
 9   name                    264 non-null    object 
 10  state_job               264 non-null    object 
 11  build_url               264 non-null    object 
 12  web_url_job             264 non-null    object 
 13  soft_failed             264 non-null    object 
 14  created_at              264 non-null    objec

In [350]:
result_df_amd = types_fix(result_df_amd, jobs=True)

In [351]:
result_df_amd.info()

<class 'pandas.core.frame.DataFrame'>
Index: 264 entries, 50 to 1578
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   id_build                264 non-null    object             
 1   web_url_build           264 non-null    object             
 2   url                     264 non-null    object             
 3   number                  264 non-null    int64              
 4   state_build             264 non-null    object             
 5   cancel_reason           44 non-null     object             
 6   blocked                 264 non-null    object             
 7   blocked_state           264 non-null    object             
 8   id_job                  264 non-null    object             
 9   name                    264 non-null    object             
 10  state_job               264 non-null    object             
 11  build_url               264 non-null    object  

In [353]:
result_df_amd.shape, result_df_amd.id_job.nunique(), result_df_amd.id_build.nunique()

((264, 26), 264, 24)

In [271]:
#tmp = pd.DataFrame({'runnable_at':['2024-11-08T01:00:00.986Z', '2024-11-08T01:00:00.420Z'],'started_at':['2024-11-08T01:05:00.986Z', None]})
#tmp

In [354]:
def calculate_wait_time(df):
    now_utc = pd.Timestamp.now(tz='UTC')
    
    # Calculate the difference in seconds
    df['waited_seconds'] = df.apply(
        lambda row: (row['started_at'] - row['runnable_at']).total_seconds() if pd.notna(row['started_at']) and pd.notna(row['runnable_at']) \
              else (now_utc - row['runnable_at']).total_seconds() if pd.isna(row['started_at']) and pd.notna(row['runnable_at']) \
                else None,
        axis=1
    )
    
    return df





In [273]:
#calculate_wait_time(tmp)

In [355]:
result_df_amd = calculate_wait_time(result_df_amd)

In [356]:
result_df_amd.head()

Unnamed: 0,id_build,web_url_build,url,number,state_build,cancel_reason,blocked,blocked_state,id_job,name,state_job,build_url,web_url_job,soft_failed,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,agent_id,agent_name,agent_web_url,agent_connection_state,agent_meta_data,waited_seconds
50,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad78-4733-b460-9225a32a57e6,AMD: :docker: build image,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:09:14.258000+00:00,2024-11-10 23:09:22.577000+00:00,2024-11-10 23:29:00.454000+00:00,NaT,False,0190a38f-f3d8-4fa6-84e7-7cff9408907c,rocm-framework-build-04-5,https://buildkite.com/organizations/vllm/unclu...,connected,[queue=amd-cpu],8.319
51,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad79-4a1f-b46f-859ca9a141fe,AMD: Core Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:05.057000+00:00,2024-11-10 23:43:35.035000+00:00,NaT,False,01930c9a-87ce-4435-a1c9-0b09b333158d,gpu5985.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",4.471
52,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad7a-415a-b728-c3b72c01bf2e,AMD: Entrypoints Test,failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,True,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:02.929000+00:00,2024-11-10 23:36:44.736000+00:00,NaT,False,0192df1f-4752-4421-9969-066f3352ed25,gpud518.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",2.343
53,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad7b-456a-906c-f00cfb53ca24,AMD: Regression Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:04.269000+00:00,2024-11-10 23:34:58.760000+00:00,NaT,False,019222e8-aa03-4c67-90a5-2ae2355c5d4e,gpu1100.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",3.683
54,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad7c-4e60-840e-0bb1a56b2e0e,AMD: Engine Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:02.790000+00:00,2024-11-10 23:43:47.222000+00:00,NaT,False,0192b565-f0f7-43d1-8ce1-6f5e2cf6546e,gpud573.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",2.204


In [359]:
write_fetch_log(result_df_amd)

In [360]:
tmp = pd.read_csv('/mnt/home/buildkite_logs/fetch_2024-11-11T13:51:59.869775+02:00.csv')

In [361]:
tmp.head()

Unnamed: 0.1,Unnamed: 0,id_build,web_url_build,url,number,state_build,cancel_reason,blocked,blocked_state,id_job,name,state_job,build_url,web_url_job,soft_failed,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,agent_id,agent_name,agent_web_url,agent_connection_state,agent_meta_data,waited_seconds
0,50,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad78-4733-b460-9225a32a57e6,AMD: :docker: build image,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:09:14.258000+00:00,2024-11-10 23:09:22.577000+00:00,2024-11-10 23:29:00.454000+00:00,,False,0190a38f-f3d8-4fa6-84e7-7cff9408907c,rocm-framework-build-04-5,https://buildkite.com/organizations/vllm/unclu...,connected,['queue=amd-cpu'],8.319
1,51,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad79-4a1f-b46f-859ca9a141fe,AMD: Core Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:05.057000+00:00,2024-11-10 23:43:35.035000+00:00,,False,01930c9a-87ce-4435-a1c9-0b09b333158d,gpu5985.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"['queue=amd', 'docker=true', 'mi250=true']",4.471
2,52,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad7a-415a-b728-c3b72c01bf2e,AMD: Entrypoints Test,failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,True,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:02.929000+00:00,2024-11-10 23:36:44.736000+00:00,,False,0192df1f-4752-4421-9969-066f3352ed25,gpud518.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"['queue=amd', 'docker=true', 'mi250=true']",2.343
3,53,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad7b-456a-906c-f00cfb53ca24,AMD: Regression Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:04.269000+00:00,2024-11-10 23:34:58.760000+00:00,,False,019222e8-aa03-4c67-90a5-2ae2355c5d4e,gpu1100.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"['queue=amd', 'docker=true', 'mi250=true']",3.683
4,54,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad7c-4e60-840e-0bb1a56b2e0e,AMD: Engine Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:02.790000+00:00,2024-11-10 23:43:47.222000+00:00,,False,0192b565-f0f7-43d1-8ce1-6f5e2cf6546e,gpud573.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"['queue=amd', 'docker=true', 'mi250=true']",2.204


In [362]:
tmp = types_fix(tmp, jobs=True)
tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   Unnamed: 0              264 non-null    int64              
 1   id_build                264 non-null    object             
 2   web_url_build           264 non-null    object             
 3   url                     264 non-null    object             
 4   number                  264 non-null    int64              
 5   state_build             264 non-null    object             
 6   cancel_reason           44 non-null     object             
 7   blocked                 264 non-null    bool               
 8   blocked_state           0 non-null      float64            
 9   id_job                  264 non-null    object             
 10  name                    264 non-null    object             
 11  state_job               264 non-null    objec

In [363]:
tmp.head()

Unnamed: 0.1,Unnamed: 0,id_build,web_url_build,url,number,state_build,cancel_reason,blocked,blocked_state,id_job,name,state_job,build_url,web_url_job,soft_failed,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,agent_id,agent_name,agent_web_url,agent_connection_state,agent_meta_data,waited_seconds
0,50,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad78-4733-b460-9225a32a57e6,AMD: :docker: build image,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:09:14.258000+00:00,2024-11-10 23:09:22.577000+00:00,2024-11-10 23:29:00.454000+00:00,NaT,False,0190a38f-f3d8-4fa6-84e7-7cff9408907c,rocm-framework-build-04-5,https://buildkite.com/organizations/vllm/unclu...,connected,['queue=amd-cpu'],8.319
1,51,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad79-4a1f-b46f-859ca9a141fe,AMD: Core Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:05.057000+00:00,2024-11-10 23:43:35.035000+00:00,NaT,False,01930c9a-87ce-4435-a1c9-0b09b333158d,gpu5985.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"['queue=amd', 'docker=true', 'mi250=true']",4.471
2,52,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad7a-415a-b728-c3b72c01bf2e,AMD: Entrypoints Test,failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,True,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:02.929000+00:00,2024-11-10 23:36:44.736000+00:00,NaT,False,0192df1f-4752-4421-9969-066f3352ed25,gpud518.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"['queue=amd', 'docker=true', 'mi250=true']",2.343
3,53,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad7b-456a-906c-f00cfb53ca24,AMD: Regression Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:04.269000+00:00,2024-11-10 23:34:58.760000+00:00,NaT,False,019222e8-aa03-4c67-90a5-2ae2355c5d4e,gpu1100.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"['queue=amd', 'docker=true', 'mi250=true']",3.683
4,54,01931856-760d-4514-a5de-e0f9319a0ec5,https://buildkite.com/vllm/ci-aws/builds/11049,https://api.buildkite.com/v2/organizations/vll...,11049,passed,,True,,01931856-ad7c-4e60-840e-0bb1a56b2e0e,AMD: Engine Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11049...,False,2024-11-10 23:09:13.881000+00:00,2024-11-10 23:08:59.822000+00:00,2024-11-10 23:29:00.586000+00:00,2024-11-10 23:29:02.790000+00:00,2024-11-10 23:43:47.222000+00:00,NaT,False,0192b565-f0f7-43d1-8ce1-6f5e2cf6546e,gpud573.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"['queue=amd', 'docker=true', 'mi250=true']",2.204


In [276]:
#result_df_amd['alert_sent'] = False

In [277]:
#jobs_to_sql = result_df_amd[['id_job', 'name', 'state_job', 'web_url_job', 'soft_failed', 'created_at', 'scheduled_at',	'runnable_at',	'started_at',	'finished_at',	'expired_at',	'retried', 'agent_id', 'agent_name', 'agent_web_url', 'waited_seconds', 'alert_sent', 'id_build']]

In [278]:
#jobs_to_sql.info()

In [279]:
#jobs_to_sql = jobs_to_sql.apply(lambda x: x.fillna() if x.dtype == 'object' else x)
#jobs_to_sql[jobs_to_sql.agent_name.isna()]

In [280]:
def create_table_from_df(df, table_name, cursor, connection):
    query = f"CREATE OR REPLACE TABLE {table_name} ("
    for column, dtype in zip(df.columns, df.dtypes):
        if dtype == 'int64':
            sql_type = 'INT'
        elif dtype == 'float64':
            sql_type = 'FLOAT'
        elif dtype == 'object':
            sql_type = 'VARCHAR(255)'
        elif dtype == 'datetime64[ns, UTC]' or dtype == 'datetime64[ns]':
            sql_type = 'DATETIME'
        elif dtype == 'bool':
            sql_type = 'BOOLEAN'    
        else:
            sql_type = 'VARCHAR(255)'
        query += f"{column} {sql_type}, "
    query = query.rstrip(', ') + ')'
    print(query)
    cursor.execute(query)
    connection.commit()


In [281]:
#create_table_from_df(jobs_to_sql, 'jobs', cursor, connection)



In [282]:
def insert_data_from_df(df, table_name, cursor, connection):
    placeholders = ', '.join(['%s'] * len(df.columns))
    insert_query = f"INSERT INTO {table_name} ({', '.join(df.columns)}) VALUES ({placeholders})"
    for row in df.itertuples(index=False, name=None):
        print(row)
        cursor.execute(insert_query, row)
    connection.commit()

In [283]:
#jobs_to_sql[jobs_to_sql.agent_name.isna()]

In [284]:
#result_df_amd['agent.name'].value_counts()

In [285]:
#jobs_to_sql.to_csv('jobs.csv')

In [364]:

failed_jobs_from_diff_builds = result_df_amd[(result_df_amd.state_job=='failed') & (result_df_amd.soft_failed==False)].groupby(['agent_id', 'agent_name', 'agent_web_url'], as_index=False).agg(nunique_failed_builds=('id_build', 'nunique'), failed_builds=('id_build', 'unique'), failed_numbers=('number','unique'))
#'agent.connection_state', 'agent.meta_data'

In [365]:
failed_jobs_from_diff_builds

Unnamed: 0,agent_id,agent_name,agent_web_url,nunique_failed_builds,failed_builds,failed_numbers
0,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,2,"[019319eb-d727-4907-b356-9e126b283090, 01931ae...","[11059, 11070]"
1,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1,[019319f5-04e3-4459-ac05-ec733a787df5],[11060]


In [288]:
result_df_amd[result_df_amd.number==11052]

Unnamed: 0,id_build,web_url_build,url,number,state_build,cancel_reason,blocked,blocked_state,id_job,name,state_job,build_url,web_url_job,soft_failed,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,agent_id,agent_name,agent_web_url,agent_connection_state,agent_meta_data,waited_seconds
248,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,019319d4-ee12-47fd-a621-aa7d87e22b70,AMD: :docker: build image,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,False,2024-11-11 06:06:45.247000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:06:45.434000+00:00,2024-11-11 06:06:46.638000+00:00,2024-11-11 06:26:18.983000+00:00,NaT,False,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,connected,[queue=amd-cpu],1.204
249,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,0193188f-ee62-4907-8451-c71e335b2413,AMD: Core Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,False,2024-11-11 00:11:46.028000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:26:19.089000+00:00,2024-11-11 06:26:25.826000+00:00,2024-11-11 06:40:50.580000+00:00,NaT,False,019222e8-aa03-4c67-90a5-2ae2355c5d4e,gpu1100.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",6.737
250,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,0193188f-ee63-40f1-befb-f5acedc3b0d2,AMD: Entrypoints Test,failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,True,2024-11-11 00:11:46.028000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:26:19.089000+00:00,2024-11-11 06:26:24.156000+00:00,2024-11-11 06:34:14.789000+00:00,NaT,False,0192fd90-6bd7-443b-af3a-f3c2edd0ff8b,gpud523.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",5.067
251,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,0193188f-ee64-4550-a8e4-c4757ed7382a,AMD: Regression Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,False,2024-11-11 00:11:46.028000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:26:19.089000+00:00,2024-11-11 06:26:21.219000+00:00,2024-11-11 06:32:04.395000+00:00,NaT,False,0192b565-f0f7-43d1-8ce1-6f5e2cf6546e,gpud573.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",2.13
252,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,0193188f-ee66-45fa-b93e-c75cefc3e958,AMD: Engine Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,False,2024-11-11 00:11:46.028000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:26:19.089000+00:00,2024-11-11 06:26:20.042000+00:00,2024-11-11 06:40:31.583000+00:00,NaT,False,019309e0-c42e-4411-b400-0d516f72d799,gpu942a.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",0.953
253,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,0193188f-ee67-49ec-9504-e0164a05e46e,AMD: LogitsProcessor Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,False,2024-11-11 00:11:46.028000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:26:19.089000+00:00,2024-11-11 06:26:24.358000+00:00,2024-11-11 06:30:18.014000+00:00,NaT,False,01930c9a-87ce-4435-a1c9-0b09b333158d,gpu5985.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",5.269
254,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,0193188f-ee6c-442b-8aee-0914f555b107,AMD: LoRA Test %N,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,False,2024-11-11 00:11:46.028000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:26:19.089000+00:00,2024-11-11 06:26:24.397000+00:00,2024-11-11 06:49:57.074000+00:00,NaT,False,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",5.308
255,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,0193188f-ee6e-40cc-bf84-b5d01be30f51,AMD: Kernels Test %N,failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,True,2024-11-11 00:11:46.028000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:26:19.089000+00:00,2024-11-11 06:26:23.423000+00:00,2024-11-11 07:00:16.386000+00:00,NaT,False,0192df24-0cae-4861-af82-7b8941d35e62,gpud525.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",4.334
256,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,0193188f-ee70-4856-beda-772665f8d93b,AMD: Tensorizer Test,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,False,2024-11-11 00:11:46.028000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:26:19.089000+00:00,2024-11-11 06:26:25.842000+00:00,2024-11-11 06:36:20.916000+00:00,NaT,False,0192df1f-4752-4421-9969-066f3352ed25,gpud518.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",6.753
257,0193188f-28e7-420f-8f89-2550115e6d7a,https://buildkite.com/vllm/ci-aws/builds/11052,https://api.buildkite.com/v2/organizations/vll...,11052,passed,,True,,0193188f-ee72-4528-bd1d-b8762dbd40f1,AMD: Benchmarks,passed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11052...,False,2024-11-11 00:11:46.028000+00:00,2024-11-11 00:10:55.637000+00:00,2024-11-11 06:26:19.089000+00:00,2024-11-11 06:26:20.378000+00:00,2024-11-11 06:34:03.284000+00:00,NaT,False,0192df22-3df6-449a-a16e-8b362bdf0866,gpu9496.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,connected,"[queue=amd, docker=true, mi250=true]",1.289


In [368]:
result_df_amd[result_df_amd.started_at.isna()]

Unnamed: 0,id_build,web_url_build,url,number,state_build,cancel_reason,blocked,blocked_state,id_job,name,state_job,build_url,web_url_job,soft_failed,created_at,scheduled_at,runnable_at,started_at,finished_at,expired_at,retried,agent_id,agent_name,agent_web_url,agent_connection_state,agent_meta_data,waited_seconds
711,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f890-4340-b9cb-c8584c143d86,AMD: Core Test,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,
712,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f891-466f-997d-12e94404fdc6,AMD: Entrypoints Test,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,
713,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f892-4c4f-8561-d153f8fe4d75,AMD: Regression Test,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,
714,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f893-4ac7-a988-f5f57a023a19,AMD: Engine Test,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,
715,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f895-4ff8-8048-10766ed3da20,AMD: LogitsProcessor Test,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,
716,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f896-4079-bdc2-7bb429d4d58e,AMD: LoRA Test %N,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,
717,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f897-40a1-8d50-2bae0b5e003c,AMD: Kernels Test %N,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,
718,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f898-4d60-93de-d236e158fd77,AMD: Tensorizer Test,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,
719,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f899-4f24-8fdd-53ff5aad580f,AMD: Benchmarks,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,
720,019319eb-d727-4907-b356-9e126b283090,https://buildkite.com/vllm/ci-aws/builds/11059,https://api.buildkite.com/v2/organizations/vll...,11059,failed,,False,,019319eb-f89a-448f-8953-4ca4667f93d6,AMD: OpenAI-Compatible Tool Use,waiting_failed,https://api.buildkite.com/v2/organizations/vll...,https://buildkite.com/vllm/ci-aws/builds/11059...,False,2024-11-11 06:31:55.179000+00:00,2024-11-11 06:31:46.843000+00:00,NaT,NaT,NaT,NaT,False,,,,,,


In [366]:
failed_jobs_from_diff_builds.agent_web_url.values

array(['https://buildkite.com/organizations/vllm/unclustered/agents/0190a38f-f36c-4168-8350-f5f09c6ab9fd',
       'https://buildkite.com/organizations/vllm/unclustered/agents/01930c9a-5726-4226-b6e8-d51eb466c550'],
      dtype=object)

In [290]:
#tmp = pd.DataFrame({'runnable_at':['2024-11-08T01:00:00.986Z', '2024-11-08T01:00:00.420Z'],'started_at':['2024-11-08T01:05:00.986Z', None], 'waited_seconds':[14401, 300], 'number':[1,2], 'web_url_job':['sdsd', 'dsf'], 'name':['name1', 'name2'], 'agent.name':['agent1', 'agent2']})
#tmp

In [291]:
#failed_jobs_from_diff_builds = jobs_to_sql[(jobs_to_sql.state_job=='failed') & (jobs_to_sql.soft_failed==False)].groupby(['agent_id', 'agent_name', 'agent_web_url'], as_index=False).agg(unique_builds=('id_build', 'nunique'))

In [292]:
#failed_jobs_from_diff_builds['alert_sent'] = False
#failed_jobs_from_diff_builds

In [293]:
#jobs_to_sql.columns

In [294]:
#failed_jobs_from_diff_builds.to_csv('agents')

In [295]:
try:
    alerts_sent = pd.read_csv('alerts_sent.csv')
except:
    alerts_sent = pd.DataFrame()

In [296]:
alerts_sent

In [None]:
def alert(df, alerts_sent=alerts_sent, wait_time_thr=WAITING_TIME_ALERT_THR, agent_failed_builds_thr=AGENT_FAILED_BUILDS_THR):
    #!!!!!!!!!!
    wait_time_thr = 1
    agent_failed_builds_thr = 1
    #!!!!!!!!!!
    alerts = []
    now = datetime.now().isoformat()
    alerts_df = pd.DataFrame(columns=['time_utc','alert_type', 'id_job', 'state_job', 'name', 'number', 'id_build', 'waited_seconds', 'web_url_job', 'agent_id', 'agent_name', 'agent_web_url', 'nunique_failed_builds', 'failed_builds'] )
    # job waiting time alert:
    for _, row in df.iterrows():
        if row['waited_seconds'] > wait_time_thr and row['state_job']!='canceled' and pd.isna(row['started_at']): #RETURN THIS CONDITION earlier it was finished at!!!!!!!!!!!
            if not alerts_sent.empty:
                value_exists_in_column = alerts_sent['id_job'].isin([row['id_job']]).any()
                if value_exists_in_column:
                    print('this job exists')
                    continue
            new_row = pd.DataFrame.from_records({'time_utc': now, 'alert_type': 'job', 'id_job': row['id_job'], 'state_job': row['state_job'], 'name': row['name'], 'number': row['number'], 'id_build': row['id_build'], 'waited_seconds': row['waited_seconds'],  
                       'web_url_job': row['web_url_job'], 'agent_id': row['agent_id'], 'agent_name': row['agent_name'], 'agent_web_url': row['agent_web_url'], 'nunique_failed_builds': np.NaN, 'failed_builds': [[]]}) 
            alert_message = f"Job {row['name']} from build number {row['number']} has been waiting for {row['waited_seconds']} seconds (more than {wait_time_thr} seconds or {wait_time_thr/3600} hours). More info at {row['web_url_job']}"
            alerts.append(alert_message)
            alerts_df = pd.concat([alerts_df, new_row], ignore_index=True)
    
    
    # agent health alert:
    failed_jobs_from_diff_builds = df[(df.state_job=='failed') & (df.soft_failed==False)].groupby(['agent_id', 'agent_name', 'agent_web_url'], as_index=False).agg(nunique_failed_builds=('id_build', 'nunique'), failed_builds=('id_build', 'unique'))
    
    unhealthy_agents = failed_jobs_from_diff_builds[failed_jobs_from_diff_builds.nunique_failed_builds>=agent_failed_builds_thr]
    for _, row in unhealthy_agents.iterrows():
        if not alerts_sent.empty:
                value_exists_in_column = alerts_sent['agent_id'].isin([row['agent_id']]).any()
                #print('Agent id Exists')
                agent = row['agent_id']
                
                if value_exists_in_column:
                    #sent_failed_builds = [s.strip() for s in alerts_sent.loc[alerts_sent['agent_id'] == row['agent_id'], 'failed_builds'].values[0].extract(r'([a-f0-9\-]').split(',')]#.replace('[', '').replace(']', '').replace('\'', '').split(',')]
                    sent_failed_builds = alerts_sent.loc[alerts_sent['agent_id'] == row['agent_id'], 'failed_builds'].str.findall(r'([a-f0-9\-]{36})').values[0]#.split(',')]#.replace('[', '').replace(']', '').replace('\'', '').split(',')]

                    print(sent_failed_builds)
        
                    # Check if there is any intersection between row['failed_builds'] and sent_failed_builds
                    if any(failed_build in sent_failed_builds for failed_build in row['failed_builds']):
                        continue
        new_row = pd.DataFrame({'time_utc': now, 'alert_type': 'agent', 'id_job': np.NaN, 'state_job': np.NaN, 'name': np.NaN, 'number': np.NaN, 'id_build': np.NaN, 'waited_seconds': np.NaN,  
                        'web_url_job': np.NaN, 'agent_id': row['agent_id'], 'agent_name': row['agent_name'], 'agent_web_url': row['agent_web_url'], 'nunique_failed_builds': row['nunique_failed_builds'], 'failed_builds': [row['failed_builds'].tolist()]}, index=[0])
        
        alert_message = f"Agent {row['agent_name']} has failed jobs from {row['nunique_failed_builds']} unique builds. More info at {row['agent_web_url']}"
        alerts.append(alert_message)
        alerts_df = pd.concat([alerts_df, new_row], ignore_index=True)

    return alerts, alerts_df

In [298]:

alerts, alerts_df = alert(result_df_amd)
alerts

Unnamed: 0,agent_id,agent_name,agent_web_url,nunique_failed_builds,failed_builds
0,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,[019319eb-d727-4907-b356-9e126b283090]
1,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1,[019319f5-04e3-4459-ac05-ec733a787df5]


['Agent rocm-framework-build-04-4 has failed jobs from 1 unique builds. More info at https://buildkite.com/organizations/vllm/unclustered/agents/0190a38f-f36c-4168-8350-f5f09c6ab9fd',
 'Agent gpud559.jax.cs.cpe.ice.amd.com-1 has failed jobs from 1 unique builds. More info at https://buildkite.com/organizations/vllm/unclustered/agents/01930c9a-5726-4226-b6e8-d51eb466c550']

In [299]:
alerts_df

Unnamed: 0,time,alert_type,id_job,state_job,name,number,id_build,waited_seconds,web_url_job,agent_id,agent_name,agent_web_url,nunique_failed_builds,failed_builds
0,2024-11-11T09:35:18.192737,agent,,,,,,,,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,[019319eb-d727-4907-b356-9e126b283090]
1,2024-11-11T09:35:18.192737,agent,,,,,,,,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1,[019319f5-04e3-4459-ac05-ec733a787df5]


In [300]:
alerts_df.to_csv('alerts_sent.csv')

In [60]:
#result_df_amd[result_df_amd.started_at.isna()][['number', 'name', 'state_job','scheduled_at','runnable_at', 'started_at', 'waited_seconds']] # doesn't seem to have nan values in only one of runnable_at or started_at


In [61]:
#cursor.close()
#connection.close()


In [107]:
agent_failed_builds_thr = 1
    #!!!!!!!!!!
alerts = []
now = datetime.now().isoformat()
alerts_df = pd.DataFrame(columns=['time','alert_type', 'id_job', 'state_job', 'name', 'number', 'id_build', 'waited_seconds', 'web_url_job', 'agent_id', 'agent_name', 'agent_web_url', 'nunique_failed_builds', 'failed_builds'] )

failed_jobs_from_diff_builds = result_df_amd[(result_df_amd.state_job=='failed') & (result_df_amd.soft_failed==False)].groupby(['agent_id', 'agent_name', 'agent_web_url'], as_index=False).agg(nunique_failed_builds=('id_build', 'nunique'), failed_builds=('id_build', 'unique'))
    
unhealthy_agents = failed_jobs_from_diff_builds[failed_jobs_from_diff_builds.nunique_failed_builds>=agent_failed_builds_thr]
display(unhealthy_agents)


Unnamed: 0,agent_id,agent_name,agent_web_url,nunique_failed_builds,failed_builds
0,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,[019319eb-d727-4907-b356-9e126b283090]
1,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1,[019319f5-04e3-4459-ac05-ec733a787df5]


In [125]:
alerts_sent.loc[0, 'failed_builds'] = "['019319f5-04e3-4459-ac05-ec733a787df5']"
alerts_sent

Unnamed: 0.1,Unnamed: 0,time,alert_type,id_job,state_job,name,number,id_build,waited_seconds,web_url_job,agent_id,agent_name,agent_web_url,nunique_failed_builds,failed_builds
0,0,2024-11-11T08:12:52.465635,agent,,,,,,,,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,['019319f5-04e3-4459-ac05-ec733a787df5']
1,1,2024-11-11T08:12:52.465635,agent,,,,,,,,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1,['019319f5-04e3-4459-ac05-ec733a787df5']


In [126]:
#unhealthy_agents.loc[0, 'failed_builds'] = array(['019319eb-d727-4907-b356-9e126b283090', '019319f5-04e3-4459-ac05-ec733a787df5', '019319f5-04e3-4459-ac05-ec733a787df6'], dtype=object)
unhealthy_agents

Unnamed: 0,agent_id,agent_name,agent_web_url,nunique_failed_builds,failed_builds
0,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,[019319eb-d727-4907-b356-9e126b283090]
1,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1,[019319f5-04e3-4459-ac05-ec733a787df5]


In [132]:
for _, row in unhealthy_agents.iterrows():
    if not alerts_sent.empty:
            value_exists_in_column = alerts_sent['agent_id'].isin([row['agent_id']]).any()
            #print('Agent id Exists')
            agent = row['agent_id']
            
            if value_exists_in_column:
                #sent_failed_builds = [s.strip() for s in alerts_sent.loc[alerts_sent['agent_id'] == row['agent_id'], 'failed_builds'].values[0].extract(r'([a-f0-9\-]').split(',')]#.replace('[', '').replace(']', '').replace('\'', '').split(',')]
                sent_failed_builds = alerts_sent.loc[alerts_sent['agent_id'] == row['agent_id'], 'failed_builds'].str.findall(r'([a-f0-9\-]{36})').values[0]#.split(',')]#.replace('[', '').replace(']', '').replace('\'', '').split(',')]

                print(sent_failed_builds)
    
                # Check if there is any intersection between row['failed_builds'] and sent_failed_builds
                if any(failed_build in sent_failed_builds for failed_build in row['failed_builds']):
                    print(f'This agent {agent} exists with overlapping failed builds')
                    continue
    new_row = pd.DataFrame({'time': now, 'alert_type': 'agent', 'id_job': np.NaN, 'state_job': np.NaN, 'name': np.NaN, 'number': np.NaN, 'id_build': np.NaN, 'waited_seconds': np.NaN,  
                    'web_url_job': np.NaN, 'agent_id': row['agent_id'], 'agent_name': row['agent_name'], 'agent_web_url': row['agent_web_url'], 'nunique_failed_builds': row['nunique_failed_builds'], 'failed_builds': [row['failed_builds'].tolist()]}, index=[0])
    
    alert_message = f"Agent {row['agent_name']} has failed jobs from {row['nunique_failed_builds']} unique builds. More info at {row['agent_web_url']}"
    alerts.append(alert_message)
    alerts_df = pd.concat([alerts_df, new_row], ignore_index=True)

['019319f5-04e3-4459-ac05-ec733a787df5']
['019319f5-04e3-4459-ac05-ec733a787df5']
This agent 01930c9a-5726-4226-b6e8-d51eb466c550 exists with overlapping failed builds


In [133]:
alerts_df

Unnamed: 0,time,alert_type,id_job,state_job,name,number,id_build,waited_seconds,web_url_job,agent_id,agent_name,agent_web_url,nunique_failed_builds,failed_builds
0,2024-11-11T08:31:48.034651,agent,,,,,,,,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,[019319eb-d727-4907-b356-9e126b283090]
1,2024-11-11T08:31:48.034651,agent,,,,,,,,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1,[019319f5-04e3-4459-ac05-ec733a787df5]
2,2024-11-11T08:31:48.034651,agent,,,,,,,,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,[019319eb-d727-4907-b356-9e126b283090]
3,2024-11-11T08:31:48.034651,agent,,,,,,,,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1,[019319f5-04e3-4459-ac05-ec733a787df5]
4,2024-11-11T08:31:48.034651,agent,,,,,,,,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,[019319eb-d727-4907-b356-9e126b283090]
5,2024-11-11T08:31:48.034651,agent,,,,,,,,01930c9a-5726-4226-b6e8-d51eb466c550,gpud559.jax.cs.cpe.ice.amd.com-1,https://buildkite.com/organizations/vllm/unclu...,1,[019319f5-04e3-4459-ac05-ec733a787df5]
6,2024-11-11T08:31:48.034651,agent,,,,,,,,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,[019319eb-d727-4907-b356-9e126b283090]
7,2024-11-11T08:31:48.034651,agent,,,,,,,,0190a38f-f36c-4168-8350-f5f09c6ab9fd,rocm-framework-build-04-4,https://buildkite.com/organizations/vllm/unclu...,1,[019319eb-d727-4907-b356-9e126b283090]


In [123]:
alerts_sent.loc[alerts_sent['agent_id'] == '0190a38f-f36c-4168-8350-f5f09c6ab9fd', 'failed_builds'].str.findall(r'([a-f0-9\-]{36})').values[0]

['019319eb-d727-4907-b356-9e126b283090',
 '019319f5-04e3-4459-ac05-ec733a787df5']

In [119]:
alerts_sent['failed_builds'].values

array(["['019319eb-d727-4907-b356-9e126b283090', '019319f5-04e3-4459-ac05-ec733a787df5']",
       "['019319f5-04e3-4459-ac05-ec733a787df5']"], dtype=object)

In [120]:
alerts_sent['failed_builds'].str.findall(r'([a-f0-9\-]{36})')

0    [019319eb-d727-4907-b356-9e126b283090, 019319f...
1               [019319f5-04e3-4459-ac05-ec733a787df5]
Name: failed_builds, dtype: object