In [None]:
import sklearn
import pandas as pd
import plotly.express as px
import os
import mlflow
import numpy as np
import datetime as dt
import requests
import json
# Hosts of the MLflow tracking server and the Prometheus server, read from the
# environment (each is None when its variable is not set).
MLFLOW_SERVER = os.environ.get("MLFLOW_SERVER")
PROM_SERVER = os.environ.get("PROM_SERVER")
# Authenticating against S3 additionally requires these environment variables:
#   AWS_ACCESS_KEY_ID
#   AWS_SECRET_ACCESS_KEY
#   MLFLOW_S3_ENDPOINT_URL

## Get Network Metrics for a Few Days

In [None]:
from prometheus_api_client import MetricRangeDataFrame
import datetime as dt
from prometheus_api_client import PrometheusConnect

# Client for the Prometheus server; point the URL at your own instance.
prom = PrometheusConnect(url=f"http://{PROM_SERVER}:9090")

# Query window: midnight 2025-06-05 up to midnight 2025-06-10.
start_time = dt.datetime(year=2025, month=6, day=5)
end_time = dt.datetime(year=2025, month=6, day=10)
# Optional chunk length, useful when a large range has to be fetched piecewise.
chunk_size = dt.timedelta(minutes=1)

In [None]:

from datetime import datetime
def range_query_to_df(
    metric: str,
    rate_base: str,
    start: datetime,
    end: datetime,
    date_range_step: str,
    timeout: float = 60.0,
) -> pd.DataFrame:
    """Fetch ``rate(metric[rate_base])`` from Prometheus as a per-device time series table.

    Args:
        metric: Name of the Prometheus counter to query.
        rate_base: Range-vector window passed to ``rate()``, e.g. ``'1m'``.
        start: Start of the query window. It is formatted with a trailing ``Z``
            without any timezone conversion, so it is sent to Prometheus as UTC.
        end: End of the query window, formatted the same way as ``start``.
        date_range_step: Query resolution step, e.g. ``'60s'``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        DataFrame indexed by ``timestamp`` with one column per ``device`` label
        (``'unknown'`` when a series has no such label) holding the rate values.
        An empty frame with the same axis names is returned when the query
        matches no samples.

    Raises:
        RuntimeError: If Prometheus answers with a non-2xx HTTP status or with a
            payload whose ``status`` is not ``'success'``.
    """
    time_format = '%Y-%m-%dT%H:%M:%SZ'
    # Passing the query through ``params`` lets requests URL-encode the PromQL
    # expression instead of interpolating it raw into the URL.
    resp = requests.get(
        f'http://{PROM_SERVER}:9090/api/v1/query_range',
        params={
            'query': f'rate({metric}[{rate_base}])',
            'start': start.strftime(time_format),
            'end': end.strftime(time_format),
            'step': date_range_step,
        },
        timeout=timeout,
    )
    if not resp.ok:
        # Include the response body: it is where the server would report the reason.
        raise RuntimeError(f'Prometheus returned HTTP {resp.status_code}: {resp.text[:500]}')
    data = resp.json()
    if data.get('status') != 'success':
        raise RuntimeError(
            f"Prometheus query failed: {data.get('errorType')}: {data.get('error')}"
        )

    records = [
        {
            'timestamp': timestamp,
            'device': series['metric'].get('device', 'unknown'),
            'value': float(value),
        }
        for series in data['data']['result']
        for timestamp, value in series['values']
    ]
    if not records:
        # pivot() on a frame without columns fails with an opaque KeyError.
        return pd.DataFrame(
            index=pd.DatetimeIndex([], name='timestamp'),
            columns=pd.Index([], name='device'),
            dtype=float,
        )

    df = pd.DataFrame(records)
    # Convert the epoch seconds once for the whole column rather than per sample.
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    # NOTE(review): pivot raises ValueError when two series share a device label
    # (e.g. several scraped instances) - presumably only one node is scraped; confirm.
    return df.pivot(index='timestamp', columns='device', values='value')

## node_network_receive_bytes_total

In [None]:
# Received-bytes rate per interface, restricted to the interfaces of interest
# and relabelled so the columns stay distinguishable after the later join.
rx_bytes_devices = ['br-0c6e79cd156e', 'br-8fbdbc8e9f6a', 'docker0', 'eth0', 'lo']
node_network_receive_bytes_total = range_query_to_df(
    metric='node_network_receive_bytes_total',
    rate_base='1m',
    start=start_time,
    end=end_time,
    date_range_step='60s',
)[rx_bytes_devices]
node_network_receive_bytes_total.columns = [f'{device} - Rx in' for device in rx_bytes_devices]

## node_network_transmit_bytes_total

In [None]:
# Transmitted-bytes rate per interface, restricted to the interfaces of interest
# and relabelled so the columns stay distinguishable after the later join.
tx_bytes_devices = ['br-0c6e79cd156e', 'br-8fbdbc8e9f6a', 'docker0', 'eth0', 'lo']
node_network_transmit_bytes_total = range_query_to_df(
    metric='node_network_transmit_bytes_total',
    rate_base='1m',
    start=start_time,
    end=end_time,
    date_range_step='60s',
)[tx_bytes_devices]
node_network_transmit_bytes_total.columns = [f'{device} - Tx out' for device in tx_bytes_devices]

## node_network_receive_packets_total

In [None]:
# Received-packets rate per interface, restricted to the interfaces of interest
# and relabelled so the columns stay distinguishable after the later join.
rx_packets_devices = ['br-0c6e79cd156e', 'br-8fbdbc8e9f6a', 'docker0', 'eth0', 'lo']
node_network_receive_packets_total = range_query_to_df(
    metric='node_network_receive_packets_total',
    rate_base='1m',
    start=start_time,
    end=end_time,
    date_range_step='60s',
)[rx_packets_devices]
node_network_receive_packets_total.columns = [f'{device} -  Pck in' for device in rx_packets_devices]

## node_network_transmit_packets_total

In [None]:
# Transmitted-packets rate per interface, restricted to the interfaces of interest
# and relabelled so the columns stay distinguishable after the later join.
tx_packets_devices = ['br-0c6e79cd156e', 'br-8fbdbc8e9f6a', 'docker0', 'eth0', 'lo']
node_network_transmit_packets_total = range_query_to_df(
    metric='node_network_transmit_packets_total',
    rate_base='1m',
    start=start_time,
    end=end_time,
    date_range_step='60s',
)[tx_packets_devices]
node_network_transmit_packets_total.columns = [f'{device} -  Pck out' for device in tx_packets_devices]

## Join Dataframes

In [74]:
# Combine the four metric tables on their shared timestamp index; merge defaults
# to an inner join, so only timestamps present in every table survive.
network_df = (
    node_network_receive_bytes_total
    .merge(node_network_transmit_bytes_total, left_index=True, right_index=True)
    .merge(node_network_receive_packets_total, left_index=True, right_index=True)
    .merge(node_network_transmit_packets_total, left_index=True, right_index=True)
)

## Some Clean Up

In [None]:
# After the pivot and index-based merges 'timestamp' is the index, not a column,
# so an unconditional drop raises KeyError; errors='ignore' removes the column
# only if it is actually present. Building a new frame instead of mutating in
# place also keeps this cell safe to re-run.
network_df = (
    network_df
    .drop(columns='timestamp', errors='ignore')
    .sort_index(axis=1)  # order columns alphabetically by label
    .dropna()            # keep only timestamps where every series has a value
)

## Log an MLflow Experiment

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold
import numpy as np
from scipy.stats import entropy

def KL_divergence(a, b, bins=200, value_range=(-1, 1.0), epsilon=1e-6):
    """Estimate the KL divergence D(a || b) between two samples via histograms.

    Both samples are binned on the same grid; empty bins of ``b`` are replaced
    by ``epsilon`` so the divergence stays finite where ``a`` has mass and ``b``
    has none. ``scipy.stats.entropy`` normalises both histograms before
    computing the relative entropy.

    Args:
        a: Array-like sample defining the reference distribution.
        b: Array-like sample compared against ``a``.
        bins: Number of equal-width histogram bins.
        value_range: ``(low, high)`` interval covered by the histogram; values
            outside it are not counted.
        epsilon: Pseudo-count substituted for empty bins of ``b``.

    Returns:
        The relative entropy of the ``a`` histogram with respect to the smoothed
        ``b`` histogram, as a float. The result is expected to be NaN when no
        value of ``a`` falls inside ``value_range`` (its histogram sums to zero).
    """
    hist_a, _ = np.histogram(a, bins=bins, range=value_range)
    hist_b, _ = np.histogram(b, bins=bins, range=value_range)
    smoothed_b = np.where(hist_b == 0, epsilon, hist_b)
    return entropy(hist_a, smoothed_b)

# Point MLflow at the tracking server and let autolog record the sklearn fits.
remote_server_uri = f"http://{MLFLOW_SERVER}:5000"  # set to your server URI, e.g. http://127.0.0.1:8080
mlflow.set_tracking_uri(remote_server_uri)
mlflow.set_experiment("/Network_Bytes_and_Packets_Isolation_Forest")
mlflow.sklearn.autolog()

# Mean train/test KL divergence per sweep value, in the same order as `estimators`.
model_kld_scores = []
# NOTE(review): despite the name, each value below is passed to IsolationForest
# as `max_samples` (sub-sample size per tree), not `n_estimators` (number of
# trees). The behaviour is kept as-is; confirm which hyper-parameter was meant.
estimators = [5, 10, 50, 100, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000]
for estimator in estimators:
    # One MLflow run per sweep value, named after that value.
    with mlflow.start_run(run_name=f'{estimator}'):
        kf = KFold()
        klds = []
        for train_index, test_index in kf.split(network_df):
            # Split the data into training and testing sets
            X_train, X_test = network_df.iloc[train_index], network_df.iloc[test_index]
            clf = IsolationForest(max_samples=estimator, random_state=0)
            clf.fit(X_train)
            # Use a KL divergence between train and test score distributions
            # to detect overfitting
            pred_train_set = clf.decision_function(X_train)
            pred_test_set = clf.decision_function(X_test)
            kld = KL_divergence(pred_train_set, pred_test_set)
            print(f'kld is {kld}')
            klds.append(kld)
        # Compute the fold average once and reuse it for printing and logging.
        mean_kld = np.mean(klds)
        # The message names the parameter that is actually being varied.
        print(f'mean of klds for max_samples={estimator} is {mean_kld}')
        model_kld_scores.append(mean_kld)
        mlflow.log_metric('average kld', mean_kld)


## Visualize KL over number of trees

In [None]:
# Plot the fold-averaged KL divergence against each sweep value. The values in
# `estimators` are passed to IsolationForest as `max_samples`, hence the x label.
fig = px.line(
    x=estimators,
    y=model_kld_scores,
    labels={'x': 'max_samples', 'y': 'mean KL divergence (train vs. test scores)'},
    title='Mean train/test KL divergence per sweep value',
)
fig.show()

## Test the best number of trees and visualize against traffic

In [None]:
# Fit one model on the full data set and score every timestamp.
# NOTE(review): max_samples=2000 is presumably the value picked from the KL
# sweep - confirm against the sweep plot.
clf = IsolationForest(max_samples=2000, random_state=0)
clf.fit(network_df.values)
# decision_function: the lower the score, the more abnormal the sample.
y_pred = clf.decision_function(network_df.values)
fig = px.line(
    x=network_df.index,
    y=y_pred,
    labels={'x': 'timestamp', 'y': 'decision_function score (lower = more anomalous)'},
    title='Isolation Forest anomaly score over time',
)
fig.update_layout(showlegend=False)
fig.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Two stacked panels: anomaly score on top, raw traffic series below.
fig = make_subplots(rows=2, cols=1)

# go.Line is deprecated in plotly; go.Scatter with mode='lines' is the supported
# way to add a line trace.
fig.add_trace(go.Scatter(x=network_df.index, y=y_pred, mode='lines', name='anomaly'), row=1, col=1)
for column in network_df.columns:
    # One line per traffic column, all sharing the bottom panel.
    fig.add_trace(
        go.Scatter(x=network_df.index, y=network_df[column], mode='lines', name=column),
        row=2,
        col=1,
    )

# Update layout
fig.update_layout(title_text="Network Anomaly and Traffic")

# Show figure
fig.show()