# Digital Twin of 5G network - Bachelor Thesis
## Dataset Creation - David Truhlar, 2025

In [4]:
import pandas as pd
from prometheus_api_client import PrometheusConnect
from datetime import datetime, timedelta
import os
import re
import pytz
from pathlib import Path
from metrics_config import metrics



In [5]:
# Global variables
PROMETHEUS_PORT = 9090                                                                            # Port for Prometheus metrics

STEP = "1s"                                                                                   # Time step for the simulation
TIMEDELTA_SECONDS = 10                                                                            # Time delta for the simulation
LOCAL_TZ = pytz.timezone("Europe/Bratislava")                                                     # TZ
LOG_DIR = "../log/"                                                                               # Directory with logs
LOG_PATTERN = re.compile(r"(\d{2}/\d{2} \d{2}:\d{2}:\d{2}\.\d{3}):\s+\[(\w+)\]\s+(\w+):\s*(.+)")  # Regex pattern for log lines
ENCODING = "utf-8"                                                                                # Encoding for log files

In [6]:
LOG_DIR = "Model/logs_real_5G"

In [131]:
# Create a connection to prometheus
try:
    prom = PrometheusConnect(url=f"http://localhost:{PROMETHEUS_PORT}", disable_ssl=True)
except Exception as e:
    print(f"Error connecting to Prometheus: {e}")

In [9]:
# Time 
end_time = datetime.now()
start_time = end_time - timedelta(seconds=TIMEDELTA_SECONDS)

In [99]:
from datetime import datetime

# Input range (UTC)
start_str = "2025-04-14 12:52:30"
end_str = "2025-04-14 12:56:30"

# Parse as UTC datetime
start_utc = datetime.fromisoformat(start_str)
end_utc = datetime.fromisoformat(end_str)

# Convert to local time with microseconds
start_local = start_utc.astimezone().replace(microsecond=0)
end_local = end_utc.astimezone().replace(microsecond=0)

print("Start time:", start_utc)
print("End time:", end_utc)

end_time = end_utc
start_time = start_utc

Start time: 2025-04-14 12:52:30
End time: 2025-04-14 12:56:30


### Fetch and process metrics from Prometheus

In [132]:
# Create an empty list to hold metric DataFrames
df_list = []

# Iterate through metric groups
for group, metric_list in metrics.items():
    for metric in metric_list:
        try:
            response = prom.custom_query_range(
                metric, start_time=start_time, end_time=end_time, step=STEP
            )

            if not response:
                print(f"⚠️ No data: {group}/{metric}")
                continue

            for entry in response:
                base_metric_name = entry["metric"].get("__name__", metric)

                if "values" in entry and isinstance(entry["values"], list):
                    extracted_values = [
                        (
                            datetime.utcfromtimestamp(int(ts))
                            .replace(tzinfo=pytz.utc)
                            .astimezone(LOCAL_TZ),
                            float(val)
                        )
                        for ts, val in entry["values"]
                    ]

                    metric_df = pd.DataFrame(extracted_values, columns=["timestamp", "value"])
                    metric_df["metric_name"] = base_metric_name
                    metric_df["group"] = group

                    df_list.append(metric_df)
                else:
                    print(f"⚠️ No valid values found: {group}/{metric}")

        except Exception as e:
            print(f"❌ Error fetching {group}/{metric}: {e}")

# Combine all metrics into one DataFrame
if df_list:
    final_df = pd.concat(df_list, ignore_index=True)
    final_df['timestamp'] = final_df['timestamp'].astype(str).str.replace(r'\+\d{2}:\d{2}', '', regex=True)
    final_df["timestamp"] = pd.to_datetime(final_df["timestamp"])
else:
    print("❌ No data collected for any metric.")


❌ Error fetching open5gs/bearers_active: HTTPConnectionPool(host='localhost', port=9090): Max retries exceeded with url: /api/v1/query_range?query=bearers_active&start=1744627950&end=1744628190&step=1s (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11488ec40>: Failed to establish a new connection: [Errno 61] Connection refused'))
❌ Error fetching fivegs_amf/amf_session: HTTPConnectionPool(host='localhost', port=9090): Max retries exceeded with url: /api/v1/query_range?query=amf_session&start=1744627950&end=1744628190&step=1s (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1145fcd00>: Failed to establish a new connection: [Errno 61] Connection refused'))


KeyboardInterrupt: 

#### Process and extract logs from Open5gs functions

In [10]:
log_data = []

# Define allowed applications
allowed_applications = {"amf", "upf", "smf", "udm", "gmm"}

for log_path in Path(LOG_DIR).glob("*.log"):
    try:
        with open(log_path, "r", encoding=ENCODING, errors="ignore") as f:
            for line in f:
                match = LOG_PATTERN.match(line)
                if match:
                    timestamp_str, application, log_level, log_message = match.groups()

                    # Filter by allowed applications
                    if application.lower() not in allowed_applications:
                        continue

                    # Convert timestamp to datetime (add missing year)
                    log_timestamp = datetime.strptime(timestamp_str, "%m/%d %H:%M:%S.%f")
                    log_timestamp = log_timestamp.replace(year=start_time.year, microsecond=0)

                    # Keep only logs after Prometheus start time
                    if log_timestamp > start_time and log_timestamp < end_time:
                        log_data.append({
                            "timestamp": log_timestamp,
                            "application": application,
                            "log_level": log_level,
                            "log_message": log_message
                        })
                    else:
                        log_data.append({
                            "timestamp": log_timestamp,
                            "application": application,
                            "log_level": log_level,
                            "log_message": log_message
                        })
    except Exception as e:
        print(f"❌ Failed to process {log_path.name}: {e}")

log_data = pd.DataFrame(log_data)

In [13]:
log_data.head(30)

Unnamed: 0,timestamp,application,log_level,log_message
0,2025-04-10 13:00:05,amf,INFO,[imsi-999700000071632:5:13][0:0:NULL] /nsmf-pd...
1,2025-04-10 13:00:05,amf,INFO,UE Context Release [Action:2] (../src/amf/ngap...
2,2025-04-10 13:00:05,amf,INFO,RAN_UE_NGAP_ID[16802081] AMF_UE_NGAP_ID[124] (...
3,2025-04-10 13:00:05,amf,INFO,SUCI[suci-0-999-70-0-0-0-0000071632] (../src/a...
4,2025-04-10 13:00:05,amf,INFO,[Removed] Number of gNB-UEs is now 0 (../src/a...
5,2025-04-10 13:00:54,amf,WARNING,UnRef NF EndPoint(addr) [192.168.50.4:7777] (....
6,2025-04-10 13:00:54,amf,INFO,Setup NF EndPoint(addr) [192.168.50.4:7777] (....
7,2025-04-10 13:00:55,amf,INFO,InitialUEMessage (../src/amf/ngap-handler.c:437)
8,2025-04-10 13:00:55,amf,INFO,[Added] Number of gNB-UEs is now 1 (../src/amf...
9,2025-04-10 13:00:55,amf,INFO,[suci-0-999-70-0-0-0-0000071632] 5G-S_TMSI[...


#### Aggregate and transform metrics data


In [102]:
# 🔹 Aggregate metrics (choose appropriate aggregation: mean, sum, max, etc.)
data_agg = final_df.groupby(["timestamp", "metric_name"])["value"].mean().reset_index()
data_pivot = data_agg.pivot(index="timestamp", columns="metric_name", values="value")

# Flatten column names
data_pivot.columns = [f"{col}_value" for col in data_pivot.columns]

# Reset index to bring timestamp back
data_pivot.reset_index(inplace=True)

In [111]:
data_pivot.head()

Unnamed: 0,timestamp,amf_session_value,bearers_active_value,fivegs_amffunction_amf_authreject_value,fivegs_amffunction_amf_authreq_value,fivegs_amffunction_mm_confupdate_value,fivegs_amffunction_mm_confupdatesucc_value,fivegs_amffunction_mm_paging5greq_value,fivegs_amffunction_mm_paging5gsucc_value,fivegs_amffunction_rm_regemergreq_value,...,process_cpu_seconds_total_value,process_max_fds_value,process_open_fds_value,process_resident_memory_bytes_value,process_start_time_seconds_value,process_virtual_memory_bytes_value,process_virtual_memory_max_bytes_value,ran_ue_value,s5c_rx_createsession_value,s5c_rx_parse_failed_value
0,2025-04-14 12:52:30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,109.5,1024.0,27.75,41246720.0,150171816.5,1404078000.0,-1.0,0.0,0.0,0.0
1,2025-04-14 12:52:31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,109.5,1024.0,27.75,41246720.0,150171816.5,1404078000.0,-1.0,0.0,0.0,0.0
2,2025-04-14 12:52:32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,109.5,1024.0,27.75,41246720.0,150171816.5,1404078000.0,-1.0,0.0,0.0,0.0
3,2025-04-14 12:52:33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,109.5,1024.0,27.75,41246720.0,150171816.5,1404078000.0,-1.0,0.0,0.0,0.0
4,2025-04-14 12:52:34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,109.5,1024.0,27.75,41246720.0,150171816.5,1404078000.0,-1.0,0.0,0.0,0.0


In [12]:
real_5G_csv = "Model/real5g_scenarios1-5.csv"
real_5G_df = pd.read_csv(real_5G_csv, sep=",", encoding=ENCODING)
real_5G_df["timestamp"] = pd.to_datetime(real_5G_df["timestamp"])

real_5G_df.head()

Unnamed: 0,timestamp,amf_session_value,bearers_active_value,fivegs_amffunction_amf_authreject_value,fivegs_amffunction_amf_authreq_value,fivegs_amffunction_mm_confupdate_value,fivegs_amffunction_mm_confupdatesucc_value,fivegs_amffunction_mm_paging5greq_value,fivegs_amffunction_mm_paging5gsucc_value,fivegs_amffunction_rm_regemergreq_value,...,process_cpu_seconds_total_value,process_max_fds_value,process_open_fds_value,process_resident_memory_bytes_value,process_start_time_seconds_value,process_virtual_memory_bytes_value,process_virtual_memory_max_bytes_value,ran_ue_value,s5c_rx_createsession_value,s5c_rx_parse_failed_value
0,2025-04-10 12:28:14,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,43.25,1024.0,29.75,50106368.0,118464611.5,1404078000.0,-1.0,0.0,0.0,0.0
1,2025-04-10 12:28:15,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,43.25,1024.0,29.75,50106368.0,118464611.5,1404078000.0,-1.0,0.0,0.0,0.0
2,2025-04-10 12:28:16,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,43.25,1024.0,29.75,50106368.0,118464611.5,1404078000.0,-1.0,0.0,0.0,0.0
3,2025-04-10 12:28:17,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,43.25,1024.0,29.75,50106368.0,118464611.5,1404078000.0,-1.0,0.0,0.0,0.0
4,2025-04-10 12:28:18,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,43.25,1024.0,29.75,50106368.0,118464611.5,1404078000.0,-1.0,0.0,0.0,0.0


In [None]:
# csv_file = "1_6.csv"

# # Determine whether to write header
# write_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0

# # Try filtering only new records if file exists and is not empty
# if not write_header:
#     try:
#         last_timestamp = pd.read_csv(csv_file, usecols=["timestamp"])["timestamp"].max()
#         data_pivot = data_pivot[data_pivot["timestamp"] > last_timestamp]
#     except Exception as e:
#         print(f"⚠️ Issue reading existing CSV: {e}. Proceeding without filtering.")

# # Append new data
# if not data_pivot.empty:
#     data_pivot.to_csv(csv_file, mode="a", index=False, header=write_header)

In [14]:
# Define function to classify log messages
patterns = {
    "remove": re.compile(r"\b(Removed|Deregister|De-register|Implicit De-registered)\b", re.IGNORECASE),
    "refused": re.compile(r"\b(refused|connection refused)\b", re.IGNORECASE),
    "number_of_sessions_or_ues": re.compile(r"\b(Number of (gNBs|AMF-UEs|AMF-Sessions|gNB-UEs))\b", re.IGNORECASE),
    "registration": re.compile(r"\b(Registration request|InitialUEMessage|Added|Unknown UE by SUCI|SUCI)\b", re.IGNORECASE),
    "error": re.compile(r"\b(ERROR)\b", re.IGNORECASE),
    "warning": re.compile(r"\b(WARNING)\b", re.IGNORECASE),
}

def classify_log_message(message):
    if not isinstance(message, str):
        return "nothing"
    for label, pattern in patterns.items():
        if pattern.search(message):
            return label
    return "nothing"

# Ensure log_message column exists before applying classification
if "log_message" in log_data.columns:
	log_data["log_type"] = log_data["log_message"].apply(classify_log_message)

In [15]:
logs_short = log_data[["timestamp", "application", "log_type"]] \
	if not log_data.empty \
	else pd.DataFrame(columns=["timestamp", "application", "log_type"])

In [16]:
# Convert logs_short timestamp to UTC datetime (assumes naive timestamps are UTC)
logs_short["timestamp"] = pd.to_datetime(logs_short["timestamp"], errors='coerce').dt.tz_localize("UTC")

# Filter logs after the UTC start_time (also localized to UTC for comparison)
# logs_short = logs_short[logs_short["timestamp"] >= start_time.replace(tzinfo=pytz.utc)]

# Drop timezone info to match Prometheus format
logs_short["timestamp"] = logs_short["timestamp"].dt.tz_localize(None)

logs_short.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  logs_short["timestamp"] = pd.to_datetime(logs_short["timestamp"], errors='coerce').dt.tz_localize("UTC")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  logs_short["timestamp"] = logs_short["timestamp"].dt.tz_localize(None)


Unnamed: 0,timestamp,application,log_type
0,2025-04-10 13:00:05,amf,nothing
1,2025-04-10 13:00:05,amf,nothing
2,2025-04-10 13:00:05,amf,nothing
3,2025-04-10 13:00:05,amf,registration
4,2025-04-10 13:00:05,amf,remove
5,2025-04-10 13:00:54,amf,nothing
6,2025-04-10 13:00:54,amf,nothing
7,2025-04-10 13:00:55,amf,registration
8,2025-04-10 13:00:55,amf,number_of_sessions_or_ues
9,2025-04-10 13:00:55,amf,registration


In [17]:
# drop duplicates
logs_short = logs_short.drop_duplicates(subset=["timestamp", "application", "log_type"])
logs_short = logs_short.sort_values(by=["timestamp", "application", "log_type"])
logs_short = logs_short.reset_index(drop=True)

logs_short.head(20)

Unnamed: 0,timestamp,application,log_type
0,2025-04-10 13:00:05,amf,nothing
1,2025-04-10 13:00:05,amf,registration
2,2025-04-10 13:00:05,amf,remove
3,2025-04-10 13:00:54,amf,nothing
4,2025-04-10 13:00:55,amf,nothing
5,2025-04-10 13:00:55,amf,number_of_sessions_or_ues
6,2025-04-10 13:00:55,amf,registration
7,2025-04-10 13:00:55,gmm,nothing
8,2025-04-10 13:00:55,gmm,registration
9,2025-04-10 13:01:05,amf,nothing


In [18]:
merge_d = pd.merge(real_5G_df, logs_short, on="timestamp", how="outer")

In [20]:

# drop the rows where the amf_session_value is Missing
merge_d = merge_d.dropna(subset=["amf_session_value"])

merge_d


Unnamed: 0,timestamp,amf_session_value,bearers_active_value,fivegs_amffunction_amf_authreject_value,fivegs_amffunction_amf_authreq_value,fivegs_amffunction_mm_confupdate_value,fivegs_amffunction_mm_confupdatesucc_value,fivegs_amffunction_mm_paging5greq_value,fivegs_amffunction_mm_paging5gsucc_value,fivegs_amffunction_rm_regemergreq_value,...,process_open_fds_value,process_resident_memory_bytes_value,process_start_time_seconds_value,process_virtual_memory_bytes_value,process_virtual_memory_max_bytes_value,ran_ue_value,s5c_rx_createsession_value,s5c_rx_parse_failed_value,application,log_type
0,2025-04-10 12:28:14,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,29.75,50106368.0,118464611.5,1.404078e+09,-1.0,0.0,0.0,0.0,,
1,2025-04-10 12:28:15,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,29.75,50106368.0,118464611.5,1.404078e+09,-1.0,0.0,0.0,0.0,,
2,2025-04-10 12:28:16,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,29.75,50106368.0,118464611.5,1.404078e+09,-1.0,0.0,0.0,0.0,,
3,2025-04-10 12:28:17,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,29.75,50106368.0,118464611.5,1.404078e+09,-1.0,0.0,0.0,0.0,,
4,2025-04-10 12:28:18,2.0,2.0,0.0,10.0,599.0,499.0,1034.0,498.0,0.0,...,29.75,50106368.0,118464611.5,1.404078e+09,-1.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7490,2025-04-14 12:56:26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7491,2025-04-14 12:56:27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7492,2025-04-14 12:56:28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7493,2025-04-14 12:56:29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [None]:
# Merge logs with metrics
merged_data = pd.merge(data_pivot, logs_short, on="timestamp", how="outer")



In [22]:
# Fill missing values in 'application' and 'log_type' columns with 0
merged_data["application"] = merged_data["application"].fillna(0)
merged_data["log_type"] = merged_data["log_type"].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data["application"] = merged_data["application"].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data["log_type"] = merged_data["log_type"].fillna(0)


In [21]:
merged_data = merge_d

In [24]:
merged_data

#to csv
merged_data.to_csv("realnetwork.csv", index=False, encoding=ENCODING)

In [109]:
# Read the current_uc value
current_uc = None

with open("../current_uc.txt", "r") as f:
    current_uc = f.read().strip()

# Add current_uc as a new column to the merged_data DataFrame
if not merged_data.empty:
    merged_data["current_uc"] = current_uc

In [110]:
# csv_file = "running_data.csv"

# # Determine whether to write header
# write_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0

# # Try filtering only new records if file exists and is not empty
# if not write_header:
#     try:
#         last_timestamp = pd.read_csv(csv_file, usecols=["timestamp"])["timestamp"].max()
#         merged_data = merged_data[merged_data["timestamp"] > last_timestamp]
#     except Exception as e:
#         print(f"⚠️ Issue reading existing CSV: {e}. Proceeding without filtering.")

# # Append new data
# if not merged_data.empty:
#     merged_data.to_csv(csv_file, mode="a", index=False, header=write_header)