# Digital Twin of 5G network - Bachelor Thesis
## Dataset Creation - David Truhlar, 2025

In [144]:
import pandas as pd
from prometheus_api_client import PrometheusConnect
from datetime import datetime, timedelta
import os
import re
import pytz
from pathlib import Path
from metrics_config import metrics

In [145]:
# Global variables
PROMETHEUS_PORT = 9090                                                                            # Port for Prometheus metrics

STEP = "1s"                                                                                   # Time step for the simulation
TIMEDELTA_SECONDS = 10                                                                            # Time delta for the simulation
LOCAL_TZ = pytz.timezone("Europe/Bratislava")                                                     # TZ
LOG_DIR = "../log/"                                                                               # Directory with logs
LOG_PATTERN = re.compile(r"(\d{2}/\d{2} \d{2}:\d{2}:\d{2}\.\d{3}):\s+\[(\w+)\]\s+(\w+):\s*(.+)")  # Regex pattern for log lines
ENCODING = "utf-8"                                                                                # Encoding for log files

In [146]:
# Create a connection to prometheus
try:
    prom = PrometheusConnect(url=f"http://localhost:{PROMETHEUS_PORT}", disable_ssl=True)
except Exception as e:
    print(f"Error connecting to Prometheus: {e}")

In [147]:
# Time 
end_time = datetime.now()
start_time = end_time - timedelta(seconds=TIMEDELTA_SECONDS)

In [148]:
print(f"Start time: {start_time}")
print(f"End time: {end_time}")

Start time: 2025-04-09 14:38:36.064967
End time: 2025-04-09 14:38:46.064967


In [149]:
uc1_range = {
    "from": "2025-04-09 13:30:00",
    "to": "2025-04-09 13:40:00"
}

In [150]:
from datetime import datetime

# Input range (UTC)
start_str = "2025-04-09 14:34:07"
end_str = "2025-04-09 14:37:59"

# Parse as UTC datetime
start_utc = datetime.fromisoformat(start_str)
end_utc = datetime.fromisoformat(end_str)

# Convert to local time with microseconds
start_local = start_utc.astimezone().replace(microsecond=0)
end_local = end_utc.astimezone().replace(microsecond=0)

print("Start time:", start_utc)
print("End time:", end_utc)

end_time = end_utc
start_time = start_utc

Start time: 2025-04-09 14:34:07
End time: 2025-04-09 14:37:59


### Fetch and process metrics from Prometheus

In [151]:
# Create an empty list to hold metric DataFrames
df_list = []

# Iterate through metric groups
for group, metric_list in metrics.items():
    for metric in metric_list:
        try:
            response = prom.custom_query_range(
                metric, start_time=start_time, end_time=end_time, step=STEP
            )

            if not response:
                print(f"⚠️ No data: {group}/{metric}")
                continue

            for entry in response:
                base_metric_name = entry["metric"].get("__name__", metric)

                if "values" in entry and isinstance(entry["values"], list):
                    extracted_values = [
                        (
                            datetime.utcfromtimestamp(int(ts))
                            .replace(tzinfo=pytz.utc)
                            .astimezone(LOCAL_TZ),
                            float(val)
                        )
                        for ts, val in entry["values"]
                    ]

                    metric_df = pd.DataFrame(extracted_values, columns=["timestamp", "value"])
                    metric_df["metric_name"] = base_metric_name
                    metric_df["group"] = group

                    df_list.append(metric_df)
                else:
                    print(f"⚠️ No valid values found: {group}/{metric}")

        except Exception as e:
            print(f"❌ Error fetching {group}/{metric}: {e}")

# Combine all metrics into one DataFrame
if df_list:
    final_df = pd.concat(df_list, ignore_index=True)
    final_df['timestamp'] = final_df['timestamp'].astype(str).str.replace(r'\+\d{2}:\d{2}', '', regex=True)
    final_df["timestamp"] = pd.to_datetime(final_df["timestamp"])
else:
    print("❌ No data collected for any metric.")


#### Process and extract logs from Open5gs functions

In [152]:
log_data = []

for log_path in Path(LOG_DIR).glob("*.log"):
    print(f"Processing {log_path.name} from {LOG_DIR}, range: {start_time} - {end_time}")
    try:
        with open(log_path, "r", encoding=ENCODING, errors="ignore") as f:
            for line in f:
                match = LOG_PATTERN.match(line)
                if match:
                    timestamp_str, application, log_level, log_message = match.groups()

                    # Convert timestamp to datetime (add missing year)
                    log_timestamp = datetime.strptime(timestamp_str, "%m/%d %H:%M:%S.%f")
                    log_timestamp = log_timestamp.replace(year=start_time.year, microsecond=0)

                    # Keep only logs after Prometheus start time
                    if log_timestamp > start_time and log_timestamp < end_time:
                        log_data.append({
                            "timestamp": log_timestamp,
                            "application": application,
                            "log_level": log_level,
                            "log_message": log_message
                        })
    except Exception as e:
        print(f"❌ Failed to process {log_path.name}: {e}")

log_data = pd.DataFrame(log_data)


Processing sgwu.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing sgwc.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing pcrf.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing scp.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing bsf.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing upf.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing ausf.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing pcf.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing amf.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing udr.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing mme.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing hss.log from ../log/, range: 2025-04-09 14:34:07 - 2025-04-09 14:37:59
Processing n

#### Aggregate and transform metrics data


In [153]:
# 🔹 Aggregate metrics (choose appropriate aggregation: mean, sum, max, etc.)
data_agg = final_df.groupby(["timestamp", "metric_name"])["value"].mean().reset_index()
data_pivot = data_agg.pivot(index="timestamp", columns="metric_name", values="value")

# Flatten column names
data_pivot.columns = [f"{col}_value" for col in data_pivot.columns]

# Reset index to bring timestamp back
data_pivot.reset_index(inplace=True)

In [154]:
# data pivot to csv
data_pivot.to_csv("metrics.csv", index=False)

In [155]:
data_pivot.head()

Unnamed: 0,timestamp,amf_session_value,bearers_active_value,fivegs_amffunction_amf_authreject_value,fivegs_amffunction_amf_authreq_value,fivegs_amffunction_mm_confupdate_value,fivegs_amffunction_mm_confupdatesucc_value,fivegs_amffunction_mm_paging5greq_value,fivegs_amffunction_mm_paging5gsucc_value,fivegs_amffunction_rm_regemergreq_value,...,process_cpu_seconds_total_value,process_max_fds_value,process_open_fds_value,process_resident_memory_bytes_value,process_start_time_seconds_value,process_virtual_memory_bytes_value,process_virtual_memory_max_bytes_value,ran_ue_value,s5c_rx_createsession_value,s5c_rx_parse_failed_value
0,2025-04-09 14:34:07,4.0,4.0,0.0,35.5,82.5,0.0,0.0,0.0,0.0,...,24.78,655744.0,24.375,52654592.0,260263900.0,1397296000.0,-1.0,0.0,0.0,0.0
1,2025-04-09 14:34:08,4.0,4.0,0.0,35.5,82.5,0.0,0.0,0.0,0.0,...,24.78125,655744.0,24.375,52654592.0,260263900.0,1397296000.0,-1.0,0.0,0.0,0.0
2,2025-04-09 14:34:09,4.0,4.0,0.0,35.5,82.5,0.0,0.0,0.0,0.0,...,24.78125,655744.0,24.375,52654592.0,260263900.0,1397296000.0,-1.0,0.0,0.0,0.0
3,2025-04-09 14:34:10,4.0,4.0,0.0,35.5,82.5,0.0,0.0,0.0,0.0,...,24.78125,655744.0,24.375,52654592.0,260263900.0,1397296000.0,-1.0,0.0,0.0,0.0
4,2025-04-09 14:34:11,4.0,4.0,0.0,35.5,82.5,0.0,0.0,0.0,0.0,...,24.78125,655744.0,24.375,52654592.0,260263900.0,1397296000.0,-1.0,0.0,0.0,0.0


In [156]:
# Define function to classify log messages
patterns = {
    "connect": re.compile(r"connect", re.IGNORECASE),
    "request": re.compile(r"request", re.IGNORECASE),
    "reject": re.compile(r"reject", re.IGNORECASE),
    "error": re.compile(r"error", re.IGNORECASE),
    "warning": re.compile(r"warning", re.IGNORECASE),
    "info": re.compile(r"release", re.IGNORECASE),
}

def classify_log_message(message):
    if not isinstance(message, str):
        return "nothing"
    for label, pattern in patterns.items():
        if pattern.search(message):
            return label
    return "nothing"

# Ensure log_message column exists before applying classification
if "log_message" in log_data.columns:
	log_data["log_type"] = log_data["log_message"].apply(classify_log_message)

In [157]:
logs_short = log_data[["timestamp", "application", "log_type"]] \
	if not log_data.empty \
	else pd.DataFrame(columns=["timestamp", "application", "log_type"])

In [158]:
# Convert logs_short timestamp to UTC datetime (assumes naive timestamps are UTC)
logs_short["timestamp"] = pd.to_datetime(logs_short["timestamp"], errors='coerce').dt.tz_localize("UTC")

# Filter logs after the UTC start_time (also localized to UTC for comparison)
logs_short = logs_short[logs_short["timestamp"] >= start_time.replace(tzinfo=pytz.utc)]

# Drop timezone info to match Prometheus format
logs_short["timestamp"] = logs_short["timestamp"].dt.tz_localize(None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  logs_short["timestamp"] = pd.to_datetime(logs_short["timestamp"], errors='coerce').dt.tz_localize("UTC")


In [159]:
# Merge logs with metrics
merged_data = pd.merge(data_pivot, logs_short, on="timestamp", how="outer")

In [160]:
csv_file = "merged_data.csv"

# Determine whether to write header
write_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0

# Try filtering only new records if file exists and is not empty
if not write_header:
    try:
        last_timestamp = pd.read_csv(csv_file, usecols=["timestamp"])["timestamp"].max()
        merged_data = merged_data[merged_data["timestamp"] > last_timestamp]
    except Exception as e:
        print(f"⚠️ Issue reading existing CSV: {e}. Proceeding without filtering.")

# Append new data
if not merged_data.empty:
    merged_data.to_csv(csv_file, mode="a", index=False, header=write_header)