In [109]:
from prometheus_api_client import PrometheusConnect, MetricsList
from prometheus_api_client.metric_range_df import MetricRangeDataFrame
from prometheus_api_client.utils import parse_datetime
import pandas as pd

The purpose of this notebook is to explore the behaviour of prometheus_api_client when extracting metric data, and accordingly to decide what the "right" behaviour of query_to_df() in utils should be and how that affects the architecture of the project. Specifically, we need to decide whether it should be allowed to receive as input a metric or query that results in multiple time series, such as a summary metric. In such cases custom_query_range returns a sequence of JSON objects, each of which refers to a unique combination of labels (for example, a unique combination of job, instance and quantile).

In [85]:
# Client for the Prometheus server at localhost:9090, used by every query in this notebook.
# NOTE(review): the URL is plain http, so disable_ssl=True should be harmless here;
# revisit it before pointing this at an https endpoint.
prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)

Define a function that runs a range query and formats the result as a DataFrame:

In [86]:
def query2df(query, columns=None, start=None, end=None, query_step=None):
    """Run a Prometheus range query and wrap the response in a MetricRangeDataFrame.

    Args:
        query: PromQL expression (or bare metric name) passed to
            ``prom.custom_query_range``.
        columns: Optional list of column names forwarded to
            ``MetricRangeDataFrame``.
        start: Start of the query window. When None, the notebook-level
            ``start_time`` is used.
        end: End of the query window. When None, the notebook-level
            ``end_time`` is used.
        query_step: Query resolution step. When None, the notebook-level
            ``step`` is used.

    Returns:
        The ``MetricRangeDataFrame`` built from the query response.

    Note:
        Uses the notebook-level ``prom`` client. The notebook-level window
        variables are only read for arguments that are left as None, so they
        must be defined before such a call (they are assigned in a later cell).
    """
    query_range = prom.custom_query_range(
        query,
        start_time=start_time if start is None else start,
        end_time=end_time if end is None else end,
        step=step if query_step is None else query_step,
    )
    return MetricRangeDataFrame(query_range, columns=columns)

Let's choose a summary metric:

In [87]:
# Query window shared by all query cells below (query2df reads these globals).
# "5min" presumably resolves to five minutes before now — the 61 samples per series
# at a 5s step shown below are consistent with that.
start_time = parse_datetime("5min")
end_time = parse_datetime("now")
step = "5s"

# Bare metric name, no label selectors.
query = "prometheus_target_interval_length_seconds"

df = query2df(query)
df.head()

Unnamed: 0_level_0,__name__,instance,interval,job,quantile,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1638643944,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831
1638643949,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831
1638643954,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831
1638643959,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831
1638643964,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831


In [88]:
df.describe()

Unnamed: 0,__name__,instance,interval,job,quantile,value
count,305,305,305,305,305.0,305.0
unique,1,1,1,1,5.0,25.0
top,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.99,5.015134
freq,305,305,305,305,61.0,38.0


We can see that the query results in additional columns due to the different quantiles. Let's find all the unique combinations of the label columns:

In [89]:
# Label columns are everything except the metric name and the sample value.
# Selecting by name (rather than the positional slice [1:-1]) stays correct for
# results that have no __name__ column, where the slice would drop the first label.
[col for col in df.columns if col not in ("__name__", "value")]

['instance', 'interval', 'job', 'quantile']

In [90]:
# Group by every label column. The labels are selected by name so that a frame
# without a __name__ column does not lose its first label to a positional slice.
label_columns = [col for col in df.columns if col not in ("__name__", "value")]
groups = df.groupby(label_columns)
groups.groups.keys()

dict_keys([('localhost:9090', '5s', 'prometheus', '0.01'), ('localhost:9090', '5s', 'prometheus', '0.05'), ('localhost:9090', '5s', 'prometheus', '0.5'), ('localhost:9090', '5s', 'prometheus', '0.9'), ('localhost:9090', '5s', 'prometheus', '0.99')])

As expected there are 5 distinct time series, one for each of the 5 quantiles of the summary metric. In general it is possible that there will be more time series due to different instances or other labels.

Of course, we can define a specific query that yields only one time series:

In [91]:
# Pin instance, job and quantile in the selector; on this server that leaves a single series.
query = 'prometheus_target_interval_length_seconds{instance="localhost:9090", job="prometheus", quantile="0.01"}'

df = query2df(query)
df.head()


Unnamed: 0_level_0,__name__,instance,interval,job,quantile,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1638643944,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831
1638643949,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831
1638643954,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831
1638643959,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831
1638643964,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.9846831


In [92]:
df.describe()

Unnamed: 0,__name__,instance,interval,job,quantile,value
count,61,61,61,61,61.0,61.0
unique,1,1,1,1,1.0,4.0
top,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.982313
freq,61,61,61,61,61.0,27.0


In [93]:
# Group by an explicit list of label columns (note: the `interval` label is not part of the key here).
groups = df.groupby(['instance', 'job', 'quantile'])
groups.groups.keys()

dict_keys([('localhost:9090', 'prometheus', '0.01')])

Note that if we pass specific columns to MetricRangeDataFrame() (via the columns argument of query2df) we still get all 305 values, but without the quantile label. No aggregation is done:

In [94]:
# Same unfiltered metric, but keep only the timestamp, instance and value columns.
df = query2df("prometheus_target_interval_length_seconds", columns=['timestamp','instance','value'])
df.head()

Unnamed: 0_level_0,instance,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1638643944,localhost:9090,4.9846831
1638643949,localhost:9090,4.9846831
1638643954,localhost:9090,4.9846831
1638643959,localhost:9090,4.9846831
1638643964,localhost:9090,4.9846831


In [95]:
df.describe()

Unnamed: 0,instance,value
count,305,305.0
unique,1,25.0
top,localhost:9090,5.015134
freq,305,38.0


Let's load the summary metric again to explore the manipulation that needs to be done to get the output into the form required for fitting with Prophet:

In [96]:
query = "prometheus_target_interval_length_seconds"
df = query2df(query)

In [97]:
# Move the timestamp index into a column, rename timestamp -> ds and value -> y,
# and cast y to float.
# NOTE(review): this rebinds `df`, so re-running this cell without re-running the
# previous one operates on the already-renamed frame.
df = df.reset_index().rename(columns={'value':'y', 'timestamp':'ds'}).astype({'y':'float'})
df.head()

Unnamed: 0,ds,__name__,instance,interval,job,quantile,y
0,1638643944,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
1,1638643949,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
2,1638643954,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
3,1638643959,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
4,1638643964,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683


In [98]:
df.dtypes

ds            int64
__name__     object
instance     object
interval     object
job          object
quantile     object
y           float64
dtype: object

In [99]:
# The epoch seconds are UTC: parse them as tz-aware UTC, convert to local time, then
# drop the timezone so `ds` ends up as naive datetime64[ns] in local wall-clock time.
# Casting a tz-naive column with .astype('datetime64[ns, <tz>]') is rejected by
# pandas >= 2.0, so the conversion is spelled out with tz_convert instead.
df['ds'] = (
    pd.to_datetime(df['ds'], unit='s', utc=True)
    .dt.tz_convert('Asia/Jerusalem')
    .dt.tz_localize(None)
)
df.head()

Unnamed: 0,ds,__name__,instance,interval,job,quantile,y
0,2021-12-04 20:52:24,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
1,2021-12-04 20:52:29,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
2,2021-12-04 20:52:34,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
3,2021-12-04 20:52:39,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
4,2021-12-04 20:52:44,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683


So these are the same manipulations that we do in query_to_df(). Let's see how we get each group:

In [101]:
# Group on every column between the leading `ds` and the trailing `y`.
dfgb = df.groupby(df.columns[1:-1].tolist())

For example, the first group (quantile 0.01):

In [108]:
# Take the first group key in iteration order and show the rows of that group.
first_key = next(iter(dfgb.groups.keys()))
dfgb.get_group(first_key)

Unnamed: 0,ds,__name__,instance,interval,job,quantile,y
0,2021-12-04 20:52:24,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
1,2021-12-04 20:52:29,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
2,2021-12-04 20:52:34,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
3,2021-12-04 20:52:39,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
4,2021-12-04 20:52:44,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.984683
...,...,...,...,...,...,...,...
56,2021-12-04 20:57:04,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.980465
57,2021-12-04 20:57:09,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.980465
58,2021-12-04 20:57:14,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.980465
59,2021-12-04 20:57:19,prometheus_target_interval_length_seconds,localhost:9090,5s,prometheus,0.01,4.980465


But maybe there is a more efficient way to extract the data. Let's explore MetricsList:

In [110]:
# Fetch the raw range-query response for the current `query` over the shared window...
query_range = prom.custom_query_range(query ,
                            start_time=start_time,
                            end_time=end_time,
                            step=step)

# ...and wrap it in a MetricsList (one Metric object per returned series, as shown below).
metric_object_list = MetricsList(query_range)

In [111]:
metric_object_list

[<prometheus_api_client.metric.Metric at 0x18da7354848>,
 <prometheus_api_client.metric.Metric at 0x18da6a6cac8>,
 <prometheus_api_client.metric.Metric at 0x18da6a515c8>,
 <prometheus_api_client.metric.Metric at 0x18da73ac9c8>,
 <prometheus_api_client.metric.Metric at 0x18da6f1aa48>]

In [113]:
for item in metric_object_list:
    print(item.metric_name, "\n")

prometheus_target_interval_length_seconds 

prometheus_target_interval_length_seconds 

prometheus_target_interval_length_seconds 

prometheus_target_interval_length_seconds 

prometheus_target_interval_length_seconds 



In [112]:
for item in metric_object_list:
    print(item.metric_name, item.label_config, "\n")

prometheus_target_interval_length_seconds {'instance': 'localhost:9090', 'interval': '5s', 'job': 'prometheus', 'quantile': '0.01'} 

prometheus_target_interval_length_seconds {'instance': 'localhost:9090', 'interval': '5s', 'job': 'prometheus', 'quantile': '0.05'} 

prometheus_target_interval_length_seconds {'instance': 'localhost:9090', 'interval': '5s', 'job': 'prometheus', 'quantile': '0.5'} 

prometheus_target_interval_length_seconds {'instance': 'localhost:9090', 'interval': '5s', 'job': 'prometheus', 'quantile': '0.9'} 

prometheus_target_interval_length_seconds {'instance': 'localhost:9090', 'interval': '5s', 'job': 'prometheus', 'quantile': '0.99'} 



So we can see that MetricsList saves the name and the labels of each metric. But the big thing is that it also saves the metric values in the exact format that we need for Prophet:

In [126]:
for item in metric_object_list:
    print(item)

{
metric_name: 'prometheus_target_interval_length_seconds'
label_config: {'instance': 'localhost:9090', 'interval': '5s', 'job': 'prometheus', 'quantile': '0.01'}
metric_values:                     ds         y
0  2021-12-04 18:52:24  4.984683
1  2021-12-04 18:52:29  4.984683
2  2021-12-04 18:52:34  4.984683
3  2021-12-04 18:52:39  4.984683
4  2021-12-04 18:52:44  4.984683
..                 ...       ...
56 2021-12-04 18:57:04  4.980465
57 2021-12-04 18:57:09  4.980465
58 2021-12-04 18:57:14  4.980465
59 2021-12-04 18:57:19  4.980465
60 2021-12-04 18:57:24  4.982313

[61 rows x 2 columns]
}
{
metric_name: 'prometheus_target_interval_length_seconds'
label_config: {'instance': 'localhost:9090', 'interval': '5s', 'job': 'prometheus', 'quantile': '0.05'}
metric_values:                     ds         y
0  2021-12-04 18:52:24  4.988024
1  2021-12-04 18:52:29  4.988024
2  2021-12-04 18:52:34  4.988024
3  2021-12-04 18:52:39  4.988024
4  2021-12-04 18:52:44  4.988024
..                 ...   

The metric values are saved as a DataFrame with the exact column names, ds and y, and the exact dtypes: datetime and float!

In [143]:
tmp = metric_object_list[0]

In [133]:
tmp.metric_values

Unnamed: 0,ds,y
0,2021-12-04 18:52:24,4.984683
1,2021-12-04 18:52:29,4.984683
2,2021-12-04 18:52:34,4.984683
3,2021-12-04 18:52:39,4.984683
4,2021-12-04 18:52:44,4.984683
...,...,...
56,2021-12-04 18:57:04,4.980465
57,2021-12-04 18:57:09,4.980465
58,2021-12-04 18:57:14,4.980465
59,2021-12-04 18:57:19,4.980465


In [152]:
tmp.metric_values.dtypes

ds    datetime64[ns]
y            float64
dtype: object

Also, we can turn the label config dict into a label-selector string that can be used in a Prometheus API query:

In [149]:
# Build the selector from the dict items instead of patching the dict's repr with
# chained str.replace calls, which would corrupt label values containing "': " or ", '".
"{" + ", ".join(f"{label}='{value}'" for label, value in tmp.label_config.items()) + "}"

"{instance='localhost:9090', interval='5s', job='prometheus', quantile='0.01'}"

But life isn't so easy. When the query is not a raw metric but rather some expression, we get an error when trying to create a MetricsList object:

In [170]:
query = "rate(go_memstats_alloc_bytes_total[1m])"

In [171]:
query_range = prom.custom_query_range(query ,
                            start_time=start_time,
                            end_time=end_time,
                            step=step)



In [172]:
# Constructing a MetricsList is expected to fail here: the result of an expression
# carries no __name__ label. Catch the error and show it so that the notebook still
# runs top to bottom (Restart & Run All) instead of stopping at this cell.
try:
    metric_object_list = MetricsList(query_range)
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: '__name__'

Of course: expressions don't have a \_\_name\_\_ label... Hence it seems that MetricsList is suited only to basic metrics. What about MetricRangeDataFrame()? Let's check:

In [173]:
df = MetricRangeDataFrame(query_range)

In [174]:
df.head()

Unnamed: 0_level_0,instance,job,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1638643944,localhost:9090,prometheus,59872.70875763747
1638643949,localhost:9090,prometheus,61841.57227787576
1638643954,localhost:9090,prometheus,61986.18181818181
1638643959,localhost:9090,prometheus,61616.62515226996
1638643964,localhost:9090,prometheus,63621.77692111866


So MetricRangeDataFrame can handle these cases by omitting the \_\_name\_\_ column.

Let's see if it can handle expressions that output multiple time series, such as an expression on a summary metric:

In [176]:
# An expression (rate) applied to the summary metric, fetched over the same shared window.
query = "rate(prometheus_target_interval_length_seconds[1m])"
query_range = prom.custom_query_range(query ,
                            start_time=start_time,
                            end_time=end_time,
                            step=step)

In [177]:
len(query_range)

5

In [178]:
query_range

[{'metric': {'instance': 'localhost:9090',
   'interval': '5s',
   'job': 'prometheus',
   'quantile': '0.01'},
  'values': [[1638643944, '0'],
   [1638643949, '0'],
   [1638643954, '0'],
   [1638643959, '0'],
   [1638643964, '0'],
   [1638643969, '0'],
   [1638643974, '0'],
   [1638643979, '0'],
   [1638643984, '0'],
   [1638643989, '0'],
   [1638643994, '0.09062221111191912'],
   [1638643999, '0.0906156213641652'],
   [1638644004, '0.09062730627977164'],
   [1638644009, '0.09062565860043997'],
   [1638644014, '0.09062895401901784'],
   [1638644019, '0.09061412652245046'],
   [1638644024, '0.0906306018181818'],
   [1638644029, '0.0906421380903024'],
   [1638644034, '0.09064213809030242'],
   [1638644039, '0.09061083218206936'],
   [1638644044, '0.1813055541410541'],
   [1638644049, '0.09061906848219319'],
   [1638644054, '0.0906437863689264'],
   [1638644059, '0.09061397407696922'],
   [1638644064, '0.09061067968224538'],
   [1638644069, '0.18124596934043752'],
   [1638644074, '0.1812

In [179]:
# Constructing a MetricsList is expected to fail here too: the series returned for an
# expression carry no __name__ label. Catch the error and show it so that the notebook
# still runs top to bottom (Restart & Run All) instead of stopping at this cell.
try:
    metric_object_list = MetricsList(query_range)
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: '__name__'

Again, MetricsList failed but MetricRangeDataFrame can handle it:

In [180]:
df = MetricRangeDataFrame(query_range)
df.head()

Unnamed: 0_level_0,instance,interval,job,quantile,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1638643944,localhost:9090,5s,prometheus,0.01,0
1638643949,localhost:9090,5s,prometheus,0.01,0
1638643954,localhost:9090,5s,prometheus,0.01,0
1638643959,localhost:9090,5s,prometheus,0.01,0
1638643964,localhost:9090,5s,prometheus,0.01,0


And again it omits the \_\_name\_\_ column.

On the one hand, we can simply require that all user queries be specific enough to yield only one time series. This would significantly ease the whole process of making forecasts in the trainer, saving them, and extracting them in the detector along with the real-time data. On the other hand, this may be too restrictive a requirement, since many expressions in Prometheus are natively designed to yield multiple time series... So let's see what the implications are of supporting multiple-time-series queries in our trainer-detector architecture:

Let's list the relevant processes in the trainer and the detector that need special attention in order to support multi-time-series queries:
1. Extract the data for each query - we need to use at least custom_query_range, and possibly pass the output to MetricRangeDataFrame, but not to MetricsList (which does not support non-metric expressions). In the general case we will get multiple time series, each with a unique combination of label values.
2. fit_predict - for each query we need to run an inner loop and fit_predict each time series.
3. Save the forecasts - in some format... It could be a CSV for each individual time series, with a naming convention that includes the query and all the labels, but that can lead to file names of more than 250 characters... not so good... Alternatively, dump all the query_range data or the whole dataframe, and use a serial number or a shortened unique name as the file name (as done in Taboola).
4. Get real-time data - to get the labels of each time series we first need to call custom_query_range to get the real-time data (last 10 minutes).
5. Get forecasts - after getting all the label combinations we need to extract their corresponding forecasts. For this we can compare the custom_query_range results above to the saved forecast data and use the data whose labels match.


So supporting multiple-time-series queries seems to be doable but needs quite complex handling, so we can keep it as a future feature and for now restrict the queries to those that yield a single time series.

In [183]:
# NOTE(review): leftover scratch cell, unrelated to the analysis above — consider deleting it.
st = ""
len(st)

0