In [1]:
import pandas as pd
from prometheus_api_client import PrometheusConnect, MetricSnapshotDataFrame
from datetime import datetime, timedelta, timezone
import os
import re



In [2]:
start = datetime.now()

In [3]:
# Create a connection to prometheus
try:
    prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
    print("Connected to Prometheus")
except Exception as e:
    print(f"Error connecting to Prometheus: {e}")

Connected to Prometheus


In [4]:
current_time = datetime.now()

# Set start_time as the last processed time (or now - 10 seconds for a buffer)
start_time = (current_time - timedelta(seconds=10)).strftime("%Y-%m-%dT%H:%M:%SZ")
end_time = current_time.strftime("%Y-%m-%dT%H:%M:%SZ")

step = "50ms"

start_time = datetime.strptime(start_time, "%Y-%m-%dT%H:%M:%SZ")
end_time = datetime.strptime(end_time, "%Y-%m-%dT%H:%M:%SZ")

In [5]:
print(f"Start time: {start_time}")
print(f"End time: {end_time}")

Start time: 2025-03-19 23:56:22
End time: 2025-03-19 23:56:32


In [6]:
# List of important metrics
metrics = [
    # Open5GS metrics
    "fivegs_amffunction_amf_authreject",
    "fivegs_amffunction_amf_authreq",
    "fivegs_amffunction_rm_reginitsucc", # for registration success
    "fivegs_ep_n3_gtp_indatapktn3upf", # for incoming data packets
    "fivegs_ep_n3_gtp_outdatapktn3upf", # for outgoing data packets
    "fivegs_upffunction_upf_sessionnbr", # for session number
    "ues_active", # for active UEs

    # NetData metrics for network traffic
    'netdata_net_packets_packets_persec_average{device="br-02c136a167f8", dimension="received"}',
    'netdata_net_packets_packets_persec_average{device="br-02c136a167f8", dimension="sent"}',
    'netdata_net_net_kilobits_persec_average{device="br-02c136a167f8", dimension="received"}',
    'netdata_net_net_kilobits_persec_average{device="br-02c136a167f8", dimension="sent"}',
    'netdata_cgroup_cpu_percentage_average{cgroup_name="cd2487a23764"}',
    ]

In [7]:
# Create an empty DataFrame to store all metrics
df_list = []

In [8]:
import pytz  # For timezone conversion

# Define your local timezone (change this if necessary)
LOCAL_TZ = pytz.timezone("Europe/Bratislava")  # Change if needed

# Fetch metrics and transform timestamps
for metric in metrics:
    try:
        response = prom.custom_query_range(
            metric, start_time=start_time, end_time=end_time, step=step
        )

        # Ensure response is not empty
        if not response:
            print(f"⚠️ Warning: No data for metric {metric}")
            continue

        # Process each metric entry
        for entry in response:
            base_metric_name = entry["metric"]["__name__"]

            # Extract metadata if available
            dimension = entry["metric"].get("dimension", None)
            device = entry["metric"].get("device", None)
            cgroup_name = entry["metric"].get("cgroup_name", None)

            # Construct metric name properly
            metric_name = base_metric_name
            if device and dimension:
                metric_name = f"{base_metric_name}_{device}_{dimension}"
            elif device:
                metric_name = f"{base_metric_name}_{device}"
            elif cgroup_name:
                metric_name = f"{base_metric_name}_{cgroup_name}"

            # Extract and format values with timezone conversion
            if "values" in entry and isinstance(entry["values"], list):
                extracted_values = [
                   (datetime.utcfromtimestamp(int(ts)).replace(tzinfo=pytz.utc).astimezone(LOCAL_TZ), float(val))
                    for ts, val in entry["values"]
                ]

                # Create DataFrame
                metric_df = pd.DataFrame(extracted_values, columns=["timestamp", "value"])
                metric_df["metric_name"] = metric_name

                # Append to list
                df_list.append(metric_df)
            else:
                print(f"⚠️ Warning: No valid values found for {metric}")

    except Exception as e:
        print(f"❌ Error fetching {metric}: {e}")


In [9]:
if df_list:
    final_df = pd.concat(df_list, ignore_index=True)

In [10]:
# Remove +01:00 from the timestamp
final_df['timestamp'] = final_df['timestamp'].astype(str).str.replace(r'\+\d{2}:\d{2}', '', regex=True)

In [11]:
final_df.head()

Unnamed: 0,timestamp,value,metric_name
0,2025-03-19 23:56:22,0.0,fivegs_amffunction_amf_authreject
1,2025-03-19 23:56:22,0.0,fivegs_amffunction_amf_authreject
2,2025-03-19 23:56:22,0.0,fivegs_amffunction_amf_authreject
3,2025-03-19 23:56:22,0.0,fivegs_amffunction_amf_authreject
4,2025-03-19 23:56:22,0.0,fivegs_amffunction_amf_authreject


In [12]:
log_dir = "../log/"

# Example:
# 03/19 11:20:11.151: [amf] INFO: ngap_server() [172.22.0.10]:38412 (../src/amf/ngap-sctp.c:61)
# 03/19 11:20:11.154: [sctp] INFO: AMF initialize...done (../src/amf/app.c:33)
# 03/19 11:20:11.174: [sbi] INFO: [bd5d91d4-04ab-41f0-8871-a9dc3c5ef804] NF registered [Heartbeat:10s] (../lib/sbi/nf-sm.c:208)
# 03/19 11:20:11.179: [sbi] INFO: NF EndPoint(addr) setup [172.22.0.12:7777] (../lib/sbi/nnrf-handler.c:949)


log_pattern = re.compile(r"(\d{2}/\d{2} \d{2}:\d{2}:\d{2}\.\d{3}):\s+\[(\w+)\]\s+(\w+):\s*(.+)")

log_data = []

In [13]:
# Iterate over all log files in the directory
for log_file in os.listdir(log_dir):
    log_path = os.path.join(log_dir, log_file)
    
    with open(log_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            match = log_pattern.match(line)
            if match:
                timestamp_str, application, log_level, log_message = match.groups()

                # Convert timestamp to datetime (add missing year)
                log_timestamp = datetime.strptime(timestamp_str, "%m/%d %H:%M:%S.%f")
                log_timestamp = log_timestamp.replace(year=start_time.year)  # Assign correct year

                # 🔹 Remove milliseconds to match Prometheus format
                log_timestamp = log_timestamp.replace(tzinfo=pytz.utc).astimezone(LOCAL_TZ)

                # Check if the log timestamp later than the start time
                if log_timestamp > start_time.replace(tzinfo=pytz.utc).astimezone(LOCAL_TZ):
                    log_data.append({
                        "timestamp": log_timestamp,
                        "application": application,
                        "log_level": log_level,
                        "log_message": log_message
                    })


In [14]:
log_data = pd.DataFrame(log_data)
log_data.head()

In [15]:
if not log_data.empty:
	log_data["timestamp"] = pd.to_datetime(log_data["timestamp"])
	log_data["timestamp"] = log_data["timestamp"].dt.round("1s")
	# Update the timezone we need to subtract 1 hour
	log_data["timestamp"] = log_data["timestamp"] - timedelta(hours=1)
	log_data['timestamp'] = log_data['timestamp'].astype(str).str.replace(r'\+\d{2}:\d{2}', '', regex=True)

else:
	print("log_data is empty")

log_data is empty


In [16]:
# Remove +01:00 from the timestamp
log_data.head()

In [17]:
final_df["timestamp"] = pd.to_datetime(final_df["timestamp"])
final_df.head(1)

Unnamed: 0,timestamp,value,metric_name
0,2025-03-19 23:56:22,0.0,fivegs_amffunction_amf_authreject


log_data and final_df
----------------------
save()

In [18]:
log_data.to_csv("log_data.csv", index=False)
final_df.to_csv("metrics_data.csv", index=False)

In [19]:
# Load logs separately before merging
logs = log_data
metrics = final_df

In [20]:
# 🔹 Aggregate NetData metrics (choose appropriate aggregation: mean, sum, max, etc.)
netdata_aggregated = metrics.groupby(["timestamp", "metric_name"])["value"].mean().reset_index()

# Pivot NetData metrics so each metric has its own column
netdata_pivot = netdata_aggregated.pivot(index="timestamp", columns="metric_name", values="value")

# Flatten column names
netdata_pivot.columns = [f"{col}_value" for col in netdata_pivot.columns]

# Reset index to bring timestamp back
netdata_pivot.reset_index(inplace=True)

print("✅ NetData metrics aggregated and pivoted successfully!")
netdata_pivot.head(3)

✅ NetData metrics aggregated and pivoted successfully!


Unnamed: 0,timestamp,fivegs_amffunction_amf_authreject_value,fivegs_amffunction_amf_authreq_value,fivegs_amffunction_rm_reginitsucc_value,fivegs_ep_n3_gtp_indatapktn3upf_value,fivegs_ep_n3_gtp_outdatapktn3upf_value,fivegs_upffunction_upf_sessionnbr_value,netdata_cgroup_cpu_percentage_average_cd2487a23764_value,netdata_net_net_kilobits_persec_average_br-02c136a167f8_received_value,netdata_net_net_kilobits_persec_average_br-02c136a167f8_sent_value,netdata_net_packets_packets_persec_average_br-02c136a167f8_received_value,netdata_net_packets_packets_persec_average_br-02c136a167f8_sent_value,ues_active_value
0,2025-03-19 23:56:22,0.0,19.0,19.0,0.0,0.0,1.0,2.298613,120.800839,-124.606787,8.067402,-13.667402,1.0
1,2025-03-19 23:56:23,0.0,19.0,19.0,0.0,0.0,1.0,2.298613,120.800839,-124.606787,8.067402,-13.667402,1.0
2,2025-03-19 23:56:24,0.0,19.0,19.0,0.0,0.0,1.0,2.553254,121.0247,-125.85828,8.132454,-15.664904,1.0


In [21]:
# Define function to classify log messages
def classify_log_message(message):
    if isinstance(message, str):  # Ensure it's a string before applying .lower()
        if "connect" in message.lower():
            return "connect"
        elif "request" in message.lower():
            return "request"
        elif "reject" in message.lower():
            return "reject"
        else:
            return "nothing"
    return "nothing"  # Handle missing or NaN values

In [22]:
# Ensure log_message column exists before applying classification
if "log_message" in logs.columns:
	logs["log_type"] = logs["log_message"].apply(classify_log_message)
else:
	print("Error: 'log_message' column not found in logs DataFrame")

Error: 'log_message' column not found in logs DataFrame


In [23]:
logs.head(1)

In [24]:
# Check if logs DataFrame is not empty before selecting columns
if not logs.empty:
	# Keep only necessary columns from logs
	logs_short = logs[["timestamp", "application", "log_type"]]
else:
	# Create an empty DataFrame if logs is empty
	logs_short = pd.DataFrame(columns=["timestamp", "application", "log_type"])

In [25]:
logs_short.head(1)

Unnamed: 0,timestamp,application,log_type


In [26]:
# Ensure start_time is timezone-aware
start_time_tz_aware = start_time.replace(tzinfo=pytz.utc).astimezone(LOCAL_TZ)

# Convert logs_short timestamp to datetime and ensure it is timezone-aware
logs_short["timestamp"] = pd.to_datetime(logs_short["timestamp"]).dt.tz_localize('UTC').dt.tz_convert(LOCAL_TZ)

# Remove logs before the start time
logs_short = logs_short[logs_short["timestamp"] >= start_time_tz_aware]


In [27]:
netdata_pivot.head(1)

Unnamed: 0,timestamp,fivegs_amffunction_amf_authreject_value,fivegs_amffunction_amf_authreq_value,fivegs_amffunction_rm_reginitsucc_value,fivegs_ep_n3_gtp_indatapktn3upf_value,fivegs_ep_n3_gtp_outdatapktn3upf_value,fivegs_upffunction_upf_sessionnbr_value,netdata_cgroup_cpu_percentage_average_cd2487a23764_value,netdata_net_net_kilobits_persec_average_br-02c136a167f8_received_value,netdata_net_net_kilobits_persec_average_br-02c136a167f8_sent_value,netdata_net_packets_packets_persec_average_br-02c136a167f8_received_value,netdata_net_packets_packets_persec_average_br-02c136a167f8_sent_value,ues_active_value
0,2025-03-19 23:56:22,0.0,19.0,19.0,0.0,0.0,1.0,2.298613,120.800839,-124.606787,8.067402,-13.667402,1.0


In [28]:
# Convert logs_short timestamp to UTC
logs_short["timestamp"] = logs_short["timestamp"].dt.tz_convert("UTC").dt.tz_localize(None)

# Merge logs with NetData metrics
merged_data = pd.merge(netdata_pivot, logs_short, on="timestamp", how="outer")


In [29]:
merged_data.head()

Unnamed: 0,timestamp,fivegs_amffunction_amf_authreject_value,fivegs_amffunction_amf_authreq_value,fivegs_amffunction_rm_reginitsucc_value,fivegs_ep_n3_gtp_indatapktn3upf_value,fivegs_ep_n3_gtp_outdatapktn3upf_value,fivegs_upffunction_upf_sessionnbr_value,netdata_cgroup_cpu_percentage_average_cd2487a23764_value,netdata_net_net_kilobits_persec_average_br-02c136a167f8_received_value,netdata_net_net_kilobits_persec_average_br-02c136a167f8_sent_value,netdata_net_packets_packets_persec_average_br-02c136a167f8_received_value,netdata_net_packets_packets_persec_average_br-02c136a167f8_sent_value,ues_active_value,application,log_type
0,2025-03-19 23:56:22,0.0,19.0,19.0,0.0,0.0,1.0,2.298613,120.800839,-124.606787,8.067402,-13.667402,1.0,,
1,2025-03-19 23:56:23,0.0,19.0,19.0,0.0,0.0,1.0,2.298613,120.800839,-124.606787,8.067402,-13.667402,1.0,,
2,2025-03-19 23:56:24,0.0,19.0,19.0,0.0,0.0,1.0,2.553254,121.0247,-125.85828,8.132454,-15.664904,1.0,,
3,2025-03-19 23:56:25,0.0,19.0,19.0,0.0,0.0,1.0,2.553254,121.0247,-125.85828,8.132454,-15.664904,1.0,,
4,2025-03-19 23:56:26,0.0,19.0,19.0,0.0,0.0,1.0,2.553254,121.0247,-125.85828,8.132454,-15.664904,1.0,,


In [30]:
# Define a mapping for renaming columns
column_rename_mapping = {
    "fivegs_amffunction_amf_authreject_value": "Auth Reject Count",
    "fivegs_amffunction_amf_authreq_value": "Auth Request Count",
    "fivegs_amffunction_rm_reginitsucc_value": "Registration Success",
    "fivegs_ep_n3_gtp_outdatapktn3upf_value": "Outgoing Data Packets",
    "fivegs_ep_n3_gtp_indatapktn3upf_value": "Incoming Data Packets",
    "fivegs_upffunction_upf_sessionnbr_value": "Session Number",

    # NetData metrics
    "netdata_cgroup_cpu_percentage_average_cd2487a23764_value": "CPU Usage (Open5GS)",
    "netdata_net_net_kilobits_persec_average_br-02c136a167f8_received_value": "Network Traffic In (kbps)",
    "netdata_net_net_kilobits_persec_average_br-02c136a167f8_sent_value": "Network Traffic Out (kbps)",
    "netdata_net_packets_packets_persec_average_br-02c136a167f8_received_value": "Packets Received (pps)",
    "netdata_net_packets_packets_persec_average_br-02c136a167f8_sent_value": "Packets Sent (pps)",

    # UEs and logs
    "ues_active_value": "Active UEs",
    "application": "Application Name",
    "log_type": "Log Type"
}

# Apply renaming
merged_data.rename(columns=column_rename_mapping, inplace=True)

# Print updated column names for verification
print("✅ Updated Column Names:", merged_data.columns)


✅ Updated Column Names: Index(['timestamp', 'Auth Reject Count', 'Auth Request Count',
       'Registration Success', 'Incoming Data Packets',
       'Outgoing Data Packets', 'Session Number', 'CPU Usage (Open5GS)',
       'Network Traffic In (kbps)', 'Network Traffic Out (kbps)',
       'Packets Received (pps)', 'Packets Sent (pps)', 'Active UEs',
       'Application Name', 'Log Type'],
      dtype='object')


In [31]:
# Append the merged data to a CSV file

# Check if the file exists
if os.path.exists("merged_data.csv"):
    existing_data = pd.read_csv("merged_data.csv")
else:
    existing_data = pd.DataFrame()

merged_data = pd.concat([existing_data, merged_data], ignore_index=True)

# Save the merged data to a CSV file
merged_data.to_csv("merged_data.csv", index=False)

end = datetime.now()
print(f"✅ Data merged successfully in {end - start}")

✅ Data merged successfully in 0:00:00.466193
