Skip to content

Commit

Permalink
feat(centraldashboard): Displaying metrics from Prometheus (kubeflow#…
Browse files Browse the repository at this point in the history
…7116)

* # backend

Adding an `/api/metrics` endpoint that returns the `resourceChartsLink` and `resourceChartsLinkText` fields.

Adding PrometheusMetricsService class implementing MetricsService

Adding `prometheus-query` package

## MetricsService

Adding `getChartsLink(): MetricsInfo` to MetricsService. It is used to send the client the metrics dashboard info (`resourceChartsLink` and `resourceChartsLinkText`).

## env

- PROMETHEUS_URL: url for prometheus query
- METRICS_DASHBOARD: if defined, a button linking to this URL is added under the graphs

## Prometheus query

- getNodeCpuUtilization: `sum(rate(node_cpu_seconds_total[5m])) by (instance)`
- getPodCpuUtilization: `sum(rate(container_cpu_usage_seconds_total[5m]))`
- getPodMemoryUsage: `sum(container_memory_usage_bytes)`

# frontend

Adding iron-ajax to `/api/metrics`.
Adding `metrics` properties to main-page and dashboard-view.
Hiding the footer of `resource-chart.js` when `resourceChartsLink` is undefined.

* Fixing package-lock.json

* Adding Unit Tests and Improvements
  • Loading branch information
axel7083 authored and tzstoyanov committed Jun 14, 2023
1 parent e3e3f2b commit 0fc20c3
Show file tree
Hide file tree
Showing 16 changed files with 340 additions and 27 deletions.
10 changes: 10 additions & 0 deletions components/centraldashboard/app/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {KubernetesService} from './k8s_service';
import {Interval, MetricsService} from './metrics_service';

export const ERRORS = {
no_metrics_service_configured: 'No metrics service configured',
operation_not_supported: 'Operation not supported',
invalid_links_config: 'Cannot load dashboard menu link',
invalid_settings: 'Cannot load dashboard settings'
Expand All @@ -28,6 +29,15 @@ export class Api {
*/
routes(): Router {
return Router()
.get('/metrics', async (req: Request, res: Response) => {
if (!this.metricsService) {
return apiError({
res, code: 405,
error: ERRORS.operation_not_supported,
});
}
res.json(this.metricsService.getChartsLink());
})
.get(
'/metrics/:type((node|podcpu|podmem))',
async (req: Request, res: Response) => {
Expand Down
30 changes: 25 additions & 5 deletions components/centraldashboard/app/api_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,22 @@ describe('Main API', () => {
port = addressInfo.port;
});

it('Should return a 405 status code', (done) => {
get(`http://localhost:${port}/api/metrics/podcpu`, (res) => {
expect(res.statusCode).toBe(405);
done();
it('Should return a 405 status code', async () => {
const metricsEndpoint = new Promise((resolve) => {
get(`http://localhost:${port}/api/metrics`, (res) => {
expect(res.statusCode).toBe(405);
resolve();
});
});

const metricsTypeEndpoint = new Promise((resolve) => {
get(`http://localhost:${port}/api/metrics/podcpu`, (res) => {
expect(res.statusCode).toBe(405);
resolve();
});
});

await Promise.all([metricsEndpoint, metricsTypeEndpoint]);
});
});

Expand All @@ -47,7 +58,7 @@ describe('Main API', () => {
mockK8sService = jasmine.createSpyObj<KubernetesService>(['']);
mockProfilesService = jasmine.createSpyObj<DefaultApi>(['']);
mockMetricsService = jasmine.createSpyObj<MetricsService>([
'getNodeCpuUtilization', 'getPodCpuUtilization', 'getPodMemoryUsage'
'getNodeCpuUtilization', 'getPodCpuUtilization', 'getPodMemoryUsage', 'getChartsLink'
]);

testApp = express();
Expand All @@ -64,6 +75,15 @@ describe('Main API', () => {
}
});

it('Should retrieve charts link in Metrics service', (done) => {
get(`http://localhost:${port}/api/metrics`, (res) => {
expect(res.statusCode).toBe(200);
expect(mockMetricsService.getChartsLink)
.toHaveBeenCalled();
done();
});
});

it('Should retrieve Node CPU Utilization for default 15m interval',
async () => {
const defaultInterval = new Promise((resolve) => {
Expand Down
11 changes: 11 additions & 0 deletions components/centraldashboard/app/metrics_service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ export interface TimeSeriesPoint {
value: number;
}

/**
 * Describes the link to an external metrics dashboard, as returned by
 * `MetricsService.getChartsLink()`.
 */
export interface MetricsInfo {
// URL of the metrics dashboard; may be undefined when no dashboard is available.
resourceChartsLink: string | undefined;
// Text to display for the button that redirects to the dashboard.
resourceChartsLinkText: string;
}

/**
* Interface definition for implementers of metrics services capable of
* returning time-series resource utilization metrics for the Kubeflow system.
Expand All @@ -39,4 +44,10 @@ export interface MetricsService {
* @param interval
*/
getPodMemoryUsage(interval: Interval): Promise<TimeSeriesPoint[]>;

/**
* Return a MetricsInfo object containing the url of the metric dashboard and the
* text to display for the redirect button.
*/
getChartsLink(): MetricsInfo;
}
90 changes: 90 additions & 0 deletions components/centraldashboard/app/prometheus_metrics_service.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import {Interval, MetricsInfo, MetricsService, TimeSeriesPoint} from "./metrics_service";
import {PrometheusDriver, RangeVector, ResponseType} from 'prometheus-query';

/**
 * MetricsService implementation that obtains resource utilization time series
 * from a Prometheus server through a `prometheus-query` driver.
 */
export class PrometheusMetricsService implements MetricsService {
  /**
   * @param prometheusDriver Driver used to issue range queries.
   * @param dashboardUrl URL handed back verbatim by `getChartsLink()`; may be
   *     undefined.
   */
  constructor(
      private readonly prometheusDriver: PrometheusDriver,
      private readonly dashboardUrl: string | undefined) {}

  /** Runs the node CPU query over the requested interval. */
  async getNodeCpuUtilization(interval: Interval): Promise<TimeSeriesPoint[]> {
    return this.fetchTimeSeries(
        `sum(rate(node_cpu_seconds_total[5m])) by (instance)`, interval);
  }

  /** Runs the container CPU query over the requested interval. */
  async getPodCpuUtilization(interval: Interval): Promise<TimeSeriesPoint[]> {
    return this.fetchTimeSeries(
        `sum(rate(container_cpu_usage_seconds_total[5m]))`, interval);
  }

  /** Runs the container memory query over the requested interval. */
  async getPodMemoryUsage(interval: Interval): Promise<TimeSeriesPoint[]> {
    return this.fetchTimeSeries(`sum(container_memory_usage_bytes)`, interval);
  }

  /**
   * Shared pipeline for the three public getters: derive the range start
   * from the interval, run the range query, then flatten the result.
   */
  private async fetchTimeSeries(query: string, interval: Interval): Promise<TimeSeriesPoint[]> {
    const rangeStart = this.getCorrespondingTime(interval);
    const vectors = await this.queryPrometheus(query, rangeStart);
    return this.convertToTimeSeriesPoints(vectors);
  }

  /**
   * Issues a range query between `start` and `end` (epoch milliseconds, `end`
   * defaulting to now), passing 10 as the driver's step argument.
   *
   * @returns The matrix result, or an empty array (after logging a warning)
   *     when the server answers with any other result type.
   */
  private async queryPrometheus(query: string, start: number, end: number = Date.now()): Promise<RangeVector[]> {
    const response = await this.prometheusDriver.rangeQuery(query, start, end, 10);
    if (response.resultType === ResponseType.MATRIX) {
      return response.result as RangeVector[];
    }
    console.warn(`The prometheus server returned invalid result type: ${response.resultType}`);
    return [];
  }

  /**
   * Translates an Interval into the epoch-millisecond timestamp at which the
   * queried range starts. An unrecognised interval logs a warning and yields
   * the current time (a zero-length window).
   */
  private getCorrespondingTime(interval: Interval): number {
    // Built per call so Interval is only dereferenced when the method runs.
    const windows: Array<[Interval, number]> = [
      [Interval.Last5m, 5],
      [Interval.Last15m, 15],
      [Interval.Last30m, 30],
      [Interval.Last60m, 60],
      [Interval.Last180m, 180],
    ];
    const match = windows.find(([candidate]) => candidate === interval);
    if (match === undefined) {
      console.warn("unknown interval.");
    }
    const minutes = match === undefined ? 0 : match[1];
    return Date.now() - minutes * 60 * 1000;
  }

  /**
   * Flattens range vectors into one TimeSeriesPoint per sample. The label of
   * every point is the series' labels rendered as comma-joined `key=value`
   * pairs.
   */
  private convertToTimeSeriesPoints(series: RangeVector[]): TimeSeriesPoint[] {
    const points: TimeSeriesPoint[] = [];
    for (const {metric, values} of series) {
      const label = Object.entries(metric.labels)
          .map(([name, content]) => name + "=" + content)
          .join(",");

      // `public/components/resource-chart.js` multiplies the timestamp by
      // 1000 and the value by 100, so both are scaled down here.
      for (const sample of values) {
        points.push({
          timestamp: sample.time.getTime() / 1000,
          label,
          value: sample.value / 100,
        });
      }
    }
    return points;
  }

  /** Returns the configured dashboard URL together with its button text. */
  getChartsLink(): MetricsInfo {
    const resourceChartsLink = this.dashboardUrl;
    return {
      resourceChartsLink,
      resourceChartsLinkText: 'View in dashboard'
    };
  }
}
142 changes: 142 additions & 0 deletions components/centraldashboard/app/prometheus_metrics_service_test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import {Metric, PrometheusDriver, QueryResult, ResponseType} from "prometheus-query";
import {PrometheusMetricsService} from "./prometheus_metrics_service";
import {Interval, MetricsService, TimeSeriesPoint} from "./metrics_service";
import {SampleValue} from "prometheus-query/dist/types";

// Fixtures shared by every PrometheusMetricsService spec below.

type MetricsServiceKeys = keyof MetricsService;
// Names of the service methods exercised by the specs.
const methods: MetricsServiceKeys[] = ["getNodeCpuUtilization", "getPodCpuUtilization", "getPodMemoryUsage"];
// Query string each method is expected to pass to the driver's rangeQuery,
// keyed by method name.
const queries: {[id: string]: string} = {
"getNodeCpuUtilization": "sum(rate(node_cpu_seconds_total[5m])) by (instance)",
"getPodCpuUtilization": "sum(rate(container_cpu_usage_seconds_total[5m]))",
"getPodMemoryUsage": "sum(container_memory_usage_bytes)"
};

// Epoch milliseconds used as the timestamp of every sample in the fixtures.
const fixedDate = 1557705600000;

// Matrix response with no series.
const emptyDataSet: QueryResult = {"resultType": ResponseType.MATRIX,"result":[]};
// Matrix response with one series ("instance=one") holding a single sample.
const singleInstanceDataSet: QueryResult = {
"resultType": ResponseType.MATRIX,
"result":[
{
"metric": {"labels": {"instance":"one"}} as Metric,
"values":[
{
time: new Date(fixedDate),
value: 95.5,
} as SampleValue
]
}
]
};
// Matrix response with three series ("one", "two", "three"), one sample each.
const multipleInstancesDataSet: QueryResult = {
"resultType": ResponseType.MATRIX,
"result":[
{
"metric": {"labels": {"instance":"one"}} as Metric,
"values":[
{
time: new Date(fixedDate),
value: 1.0,
} as SampleValue
]
},
{
"metric": {"labels": {"instance":"two"}} as Metric,
"values":[
{
time: new Date(fixedDate),
value: 2.0,
} as SampleValue
]
},
{
"metric": {"labels": {"instance":"three"}} as Metric,
"values":[
{
time: new Date(fixedDate),
value: 3.0,
} as SampleValue
]
}
]
};

/**
 * Specs for PrometheusMetricsService. The three time-series methods share the
 * same behaviour, so the same set of cases is generated for each of them.
 */
describe('PrometheusMetricsService', () => {
  let prometheusDriverClient: jasmine.SpyObj<PrometheusDriver>;
  let service: PrometheusMetricsService;

  /**
   * Makes the driver spy answer with `dataSet` when it is asked for `query`
   * over the last five minutes with a step argument of 10.
   */
  const stubRangeQuery = (query: string, dataSet: QueryResult): void => {
    prometheusDriverClient.rangeQuery.withArgs(
      query,
      Date.now() - 5 * 60 * 1000,
      Date.now(),
      10
    ).and.returnValue(Promise.resolve(dataSet));
  };

  beforeEach(() => {
    // Freeze the clock so the start/end timestamps computed by the service
    // match the ones the spy is configured with.
    jasmine.clock().install();
    jasmine.clock().mockDate(new Date(fixedDate));
    prometheusDriverClient = jasmine.createSpyObj<PrometheusDriver>(
      'prometheusDriverClient', ['rangeQuery']);

    service =
      new PrometheusMetricsService(prometheusDriverClient, undefined);
  });

  afterEach(() => {
    jasmine.clock().uninstall();
  });

  // Iterate over all methods since they have the same behavior
  methods.forEach((method) => {
    // The suite callback must be synchronous: Jasmine does not support
    // async functions (or any return value) in `describe`.
    describe(method, () => {
      it('Empty return', async () => {
        stubRangeQuery(queries[method], emptyDataSet);

        const emptyResult = await service[method](Interval.Last5m);
        expect(emptyResult).toEqual(Array.of<TimeSeriesPoint>());
      });

      it('One instance', async () => {
        stubRangeQuery(queries[method], singleInstanceDataSet);

        const singleInstanceResult = await service[method](Interval.Last5m);
        expect(singleInstanceResult).toEqual(Array.of<TimeSeriesPoint>({
          timestamp: fixedDate / 1000,
          value: 0.955,
          label: "instance=one"
        }));
      });

      it('Multiple instances', async () => {
        stubRangeQuery(queries[method], multipleInstancesDataSet);

        const multipleInstancesResult = await service[method](Interval.Last5m);
        expect(multipleInstancesResult).toEqual(
          Array.of<TimeSeriesPoint>({
            timestamp: fixedDate / 1000,
            value: 0.010,
            label: "instance=one"
          },
          {
            timestamp: fixedDate / 1000,
            value: 0.020,
            label: "instance=two"
          },
          {
            timestamp: fixedDate / 1000,
            value: 0.030,
            label: "instance=three"
          })
        );
      });
    });
  });
});
10 changes: 9 additions & 1 deletion components/centraldashboard/app/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import {DefaultApi} from './clients/profile_controller';
import {WorkgroupApi} from './api_workgroup';
import {KubernetesService} from './k8s_service';
import {getMetricsService} from './metrics_service_factory';
import {PrometheusMetricsService} from "./prometheus_metrics_service";
import {PrometheusDriver} from "prometheus-query";

const isProduction = process.env.NODE_ENV === 'production';
const codeEnvironment = isProduction?'production':'development';
Expand All @@ -29,6 +31,8 @@ const {
USERID_HEADER = 'X-Goog-Authenticated-User-Email',
USERID_PREFIX = 'accounts.google.com:',
REGISTRATION_FLOW = "true",
PROMETHEUS_URL = undefined,
METRICS_DASHBOARD = undefined,
} = process.env;


Expand All @@ -41,7 +45,11 @@ async function main() {

const app: express.Application = express();
const k8sService = new KubernetesService(new KubeConfig());
const metricsService = await getMetricsService(k8sService);

const metricsService = PROMETHEUS_URL
? new PrometheusMetricsService(new PrometheusDriver({ endpoint: PROMETHEUS_URL }), METRICS_DASHBOARD)
: await getMetricsService(k8sService);

console.info(`Using Profiles service at ${profilesServiceUrl}`);
const profilesService = new DefaultApi(profilesServiceUrl);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import * as monitoring from '@google-cloud/monitoring';
import fetch from 'node-fetch';

import {Interval, MetricsService, TimeSeriesPoint} from './metrics_service';
import {Interval, MetricsInfo, MetricsService, TimeSeriesPoint} from './metrics_service';

const CLUSTER_NAME_URL =
'http://metadata.google.internal/computeMetadata/v1/instance/attributes/cluster-name';
Expand Down Expand Up @@ -194,4 +194,11 @@ export class StackdriverMetricsService implements MetricsService {
}
return this.clusterName;
}

/**
 * Builds the Stackdriver Kubernetes dashboard URL from `this.projectId` and
 * returns it together with the text for the link button.
 */
getChartsLink(): MetricsInfo {
  const resourceChartsLink =
      `https://app.google.stackdriver.com/kubernetes?project=${this.projectId}`;
  const resourceChartsLinkText = 'View in Stackdriver';
  return {resourceChartsLink, resourceChartsLinkText};
}
}
Loading

0 comments on commit 0fc20c3

Please sign in to comment.