# Tvorba datasetu

### Obsah<a class='anchor' id='top'></a>
* [Globálne premenné](#globalne-premenne)
* [Pripojenie na Prometheus](#pripojenie-na-prometheus)
* [Časové vymedzenie](#casove-vymedzenie)
* [Načítanie a spracovanie metrík z Prometheus](#nacitanie-a-spracovanie-metrik-z-prometheus)
* [Spracovanie a extrakcia logov z funkcií Open5gs](#spracovanie-a-extrakcia-logov-z-funkcii-open5gs)
* [Agregácia a transformácia dát](#agregacia-a-transformacia-dat)
* [Triedenie logov](#triedenie-logov)
* [Spojenie metrík a logov](#spojenie-metrik-a-logov)
* [Mapovanie kategorických dát](#mapovanie-kategorickych-dat)
* [Vloženie informácie o aktuálnom UC](#vlozenie-informacie-o-akutalnom-uc)
* [Uloženie datasetu](#ulozenie-datasetu)

In [None]:
import pandas as pd
from prometheus_api_client import PrometheusConnect
from datetime import datetime, timedelta
import os
import re
import pytz
from pathlib import Path
import json
import sys


In [None]:
# Prometheus metric names to collect, keyed by the metric group they belong to.
metrics = dict(
    fivegs_smf=[
        "fivegs_smffunction_sm_n4sessionreportsucc",
        "fivegs_smffunction_sm_pdusessioncreationreq",
        "fivegs_smffunction_sm_qos_flow_nbr",
    ],
    fivegs_pcf=[
        "fivegs_pcffunction_pa_policysmassosucc",
        "fivegs_pcffunction_pa_sessionnbr",
    ],
)

In [None]:
# Global variables

# Port of the Prometheus server; interpolated into the connection URL.
PROMETHEUS_PORT = 9090

# Resolution passed as `step` to the Prometheus range query.
STEP = "1s"
# Length, in seconds, of the look-back window that ends when the time-interval cell runs.
TIMEDELTA_SECONDS = 10
# Timezone that the Prometheus sample timestamps are converted to.
LOCAL_TZ = pytz.timezone("Europe/Bratislava")
# Directory that is scanned for "*.log" files.
LOG_DIR = "/open5gs/install/var/log/open5gs"
# One log line -> four groups: timestamp "MM/DD HH:MM:SS.mmm", [application], LEVEL, message.
LOG_PATTERN = re.compile(r"(\d{2}/\d{2} \d{2}:\d{2}:\d{2}\.\d{3}):\s+\[(\w+)\]\s+(\w+):\s*(.+)")
# Text encoding used when opening the log files.
ENCODING = "utf-8"

<div class="alert alert-info">
<b>Príklad logu:</b> 04/02 11:05:03.836: [amf] INFO: [Added] Number of gNB-UEs is now 3 (../src/amf/context.c:2678)
</div>

In [None]:
# Create the Prometheus client and verify that the server actually answers.
# The constructor is not relied on to detect an unreachable server, so an
# explicit connectivity check follows it. Any failure is only reported (not
# raised), which keeps this cell best-effort exactly as before.
try:
    prom = PrometheusConnect(url=f"http://metrics:{PROMETHEUS_PORT}", disable_ssl=True)
    if not prom.check_prometheus_connection():
        print(f"Error connecting to Prometheus: no OK response from http://metrics:{PROMETHEUS_PORT}")
except Exception as e:
    print(f"Error connecting to Prometheus: {e}")

<div class="alert alert-warning">
<b>Prometheus:</b> Pre správne fungovanie je potrebné sa uistiť, že Docker container 'metrics' je spustený a beží na porte 9090.<br>
60f374bfb155   docker_metrics           "/bin/bash -c /mnt/m…"   2 days ago     Up 30 minutes   0.0.0.0:9090->9090/tcp   
</div>

In [27]:
# Time window: the last TIMEDELTA_SECONDS seconds, ending at the moment this cell runs.
# datetime.now() is called without a tz argument, so both bounds are naive
# timestamps in the system's local time.
window_length = timedelta(seconds=TIMEDELTA_SECONDS)
end_time = datetime.now()
start_time = end_time - window_length

In [28]:
# from datetime import datetime

# # Input range (UTC)
# start_str = "2025-04-14 12:52:30"
# end_str = "2025-04-14 12:56:30"

# # Parse as UTC datetime
# start_utc = datetime.fromisoformat(start_str)
# end_utc = datetime.fromisoformat(end_str)

# # Convert to local time with microseconds
# start_local = start_utc.astimezone().replace(microsecond=0)
# end_local = end_utc.astimezone().replace(microsecond=0)

# print("Start time:", start_utc)
# print("End time:", end_utc)

# end_time = end_utc
# start_time = start_utc

### Načítanie a spracovanie metrík z Prometheus

In [29]:
# Fetch every configured metric over [start_time, end_time] and stack the
# samples into one long-format frame `final_df` with the columns
# timestamp (naive, LOCAL_TZ wall-clock time), value, metric_name and group.
df_list = []

for group, metric_list in metrics.items():
    for metric in metric_list:
        try:
            response = prom.custom_query_range(
                metric, start_time=start_time, end_time=end_time, step=STEP
            )

            if not response:
                print(f"⚠️ No data: {group}/{metric}")
                continue

            for entry in response:
                base_metric_name = entry["metric"].get("__name__", metric)
                samples = entry.get("values")

                if not isinstance(samples, list):
                    print(f"⚠️ No valid values found: {group}/{metric}")
                    continue

                # Each sample is a (unix_timestamp, value) pair. The timestamp is
                # truncated to whole seconds, converted from UTC to LOCAL_TZ and
                # then made naive, so no string manipulation of the UTC offset
                # is needed afterwards.
                extracted_values = [
                    (
                        datetime.fromtimestamp(int(ts), tz=pytz.utc)
                        .astimezone(LOCAL_TZ)
                        .replace(tzinfo=None),
                        float(val),
                    )
                    for ts, val in samples
                ]

                metric_df = pd.DataFrame(extracted_values, columns=["timestamp", "value"])
                metric_df["metric_name"] = base_metric_name
                metric_df["group"] = group

                df_list.append(metric_df)

        except Exception as e:
            # Best-effort: a failing metric is reported and the others are still fetched.
            print(f"❌ Error fetching {group}/{metric}: {e}")

# Combine all metrics into one DataFrame
if df_list:
    final_df = pd.concat(df_list, ignore_index=True)
    # Guarantees a datetime64 column even if some per-series frames were empty.
    final_df["timestamp"] = pd.to_datetime(final_df["timestamp"])
else:
    print("❌ No data collected for any metric.")
    # Still define final_df (empty, correctly typed) so that later cells do not
    # fail with a NameError when Prometheus returned nothing.
    final_df = pd.DataFrame(
        {
            "timestamp": pd.Series(dtype="datetime64[ns]"),
            "value": pd.Series(dtype="float64"),
            "metric_name": pd.Series(dtype="object"),
            "group": pd.Series(dtype="object"),
        }
    )

⚠️ No data: fivegs_smf/fivegs_smffunction_sm_pdusessioncreationreq
⚠️ No data: fivegs_smf/fivegs_smffunction_sm_qos_flow_nbr
⚠️ No data: fivegs_pcf/fivegs_pcffunction_pa_policysmassosucc
⚠️ No data: fivegs_pcf/fivegs_pcffunction_pa_sessionnbr


<div class="alert alert-info">
<b>final_df:</b> je dataframe, ktorý obsahuje všetky potrebné metriky z Prometheus.<br>
</div>

#### Spracovanie a extrakcia logov z funkcií Open5gs

In [30]:
# Parse every "*.log" file in LOG_DIR and keep the entries that
#   * match LOG_PATTERN,
#   * come from one of the tracked applications, and
#   * fall strictly inside (start_time, end_time) after truncation to whole seconds.
# The result, `log_data`, is a DataFrame with the columns
# timestamp, application, log_level and log_message (no columns at all if nothing matched).
log_records = []

# Applications we want to track
allowed_applications = {"amf", "upf", "smf", "udm", "gmm"}

# Log lines carry no year; the year of the window start is assumed.
log_year = start_time.year

for log_path in Path(LOG_DIR).glob("*.log"):
    try:
        with open(log_path, "r", encoding=ENCODING, errors="ignore") as f:
            for line in f:
                match = LOG_PATTERN.match(line)
                if not match:
                    continue

                timestamp_str, application, log_level, log_message = match.groups()

                if application.lower() not in allowed_applications:
                    continue

                # The year is parsed together with the rest of the timestamp:
                # strptime without a year assumes 1900, which is not a leap year,
                # so "02/29" lines would raise. A line whose timestamp is still
                # invalid is skipped instead of aborting the rest of the file.
                try:
                    log_timestamp = datetime.strptime(
                        f"{log_year}/{timestamp_str}", "%Y/%m/%d %H:%M:%S.%f"
                    ).replace(microsecond=0)
                except ValueError:
                    continue

                # Keep only the logs inside the time interval
                if start_time < log_timestamp < end_time:
                    log_records.append({
                        "timestamp": log_timestamp,
                        "application": application,
                        "log_level": log_level,
                        "log_message": log_message
                    })
    except Exception as e:
        print(f"❌ Failed to process {log_path.name}: {e}")

log_data = pd.DataFrame(log_records)

<div class="alert alert-info">
<b>log_data:</b> je dataframe, ktorý obsahuje všetky potrebné logy z Open5gs.<br>
</div>

#### Agregácia a transformácia údajov o metrikách

In [31]:
# Aggregate the Prometheus samples: one mean value per (timestamp, metric).
data_agg = (
    final_df
    .groupby(["timestamp", "metric_name"])["value"]
    .mean()
    .reset_index()
)

# Reshape to wide format: one row per timestamp and one "<metric>_value"
# column per metric, with the timestamp as a regular column.
data_pivot = (
    data_agg
    .pivot(index="timestamp", columns="metric_name", values="value")
    .rename(columns=lambda name: f"{name}_value")
    .rename_axis(columns=None)
    .reset_index()
)

In [32]:
# real_5G_csv = "Model/real5g_scenarios1-5.csv"
# real_5G_df = pd.read_csv(real_5G_csv, sep=",", encoding=ENCODING)
# real_5G_df["timestamp"] = pd.to_datetime(real_5G_df["timestamp"])

# real_5G_df.head()

In [33]:
# csv_file = "1_6.csv"

# # Determine whether to write header
# write_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0

# # Try filtering only new records if file exists and is not empty
# if not write_header:
#     try:
#         last_timestamp = pd.read_csv(csv_file, usecols=["timestamp"])["timestamp"].max()
#         data_pivot = data_pivot[data_pivot["timestamp"] > last_timestamp]
#     except Exception as e:
#         print(f"⚠️ Issue reading existing CSV: {e}. Proceeding without filtering.")

# # Append new data
# if not data_pivot.empty:
#     data_pivot.to_csv(csv_file, mode="a", index=False, header=write_header)

In [34]:
# Log classification rules. Order matters: the first rule whose expression is
# found in the message decides the label.
_LOG_TYPE_RULES = (
    ("remove", r"\b(Removed|Deregister|De-register|Implicit De-registered)\b"),
    ("refused", r"\b(refused|connection refused)\b"),
    ("number_of_sessions_or_ues", r"\b(Number of (gNBs|AMF-UEs|AMF-Sessions|gNB-UEs))\b"),
    ("registration", r"\b(Registration request|InitialUEMessage|Added|Unknown UE by SUCI|SUCI)\b"),
    ("error", r"\b(ERROR)\b"),
    ("warning", r"\b(WARNING)\b"),
)

# label -> compiled, case-insensitive pattern (insertion order = rule priority)
patterns = {
    label: re.compile(expression, re.IGNORECASE)
    for label, expression in _LOG_TYPE_RULES
}


def classify_log_message(message):
    """Return the label of the first pattern found in ``message``.

    Args:
        message: The text of one log entry. Any non-string value is accepted.

    Returns:
        The key from ``patterns`` of the first matching rule, or ``"nothing"``
        when ``message`` is not a string or no rule matches.
    """
    if isinstance(message, str):
        return next(
            (label for label, rule in patterns.items() if rule.search(message)),
            "nothing",
        )
    return "nothing"

# Add the "log_type" column only when there is a "log_message" column to
# classify (a frame built from zero log records has no columns at all).
if "log_message" in log_data.columns:
    log_data["log_type"] = log_data["log_message"].map(classify_log_message)

In [35]:
# Shortened version of the log data: only the columns needed for the join.
# .copy() makes logs_short an independent frame, so modifying its columns never
# raises SettingWithCopyWarning and can never affect log_data.
# With no log records, fall back to an empty frame that has the same columns.
logs_short = (
    log_data[["timestamp", "application", "log_type"]].copy()
    if not log_data.empty
    else pd.DataFrame(columns=["timestamp", "application", "log_type"])
)

In [36]:
# Normalise the log timestamps to naive datetime64 values (the same format as
# the Prometheus data); anything unparseable becomes NaT.
parsed_timestamps = pd.to_datetime(logs_short["timestamp"], errors="coerce")

# Compare wall-clock values only. This is exactly what the previous
# "label both sides as UTC, compare, strip the label" round-trip computed.
# NOTE(review): this assumes the log files and start_time use the same clock
# and that it matches LOCAL_TZ used for the metrics - confirm on the target host.
window_start = start_time.replace(tzinfo=None)

# Keep only logs at or after the window start (NaT rows are dropped by the
# comparison). A new frame is built instead of assigning into a filtered slice.
logs_short = (
    logs_short
    .assign(timestamp=parsed_timestamps)
    .loc[lambda frame: frame["timestamp"] >= window_start]
)

In [37]:
# One row per (timestamp, application, log type): drop duplicates, sort by the
# same key and renumber the index from 0.
log_key = ["timestamp", "application", "log_type"]
logs_short = (
    logs_short
    .drop_duplicates(subset=log_key)
    .sort_values(by=log_key)
    .reset_index(drop=True)
)

<div class="alert alert-info">
<b>logs_short:</b> tento dataframe budeme spájať s dataframe-om final_df.
</div>

In [38]:
# merge_d = pd.merge(real_5G_df, logs_short, on="timestamp", how="outer")

In [39]:

# # drop the rows where the amf_session_value is Missing
# merge_d = merge_d.dropna(subset=["amf_session_value"])

# merge_d


In [40]:
# Join metrics and logs on the timestamp; the outer join keeps timestamps that
# appear on only one side.
merged_data = data_pivot.merge(logs_short, on="timestamp", how="outer")

In [None]:
# Feature columns that must be present, in the order they should appear.
featured_metrics = [
    "fivegs_smffunction_sm_n4sessionreportsucc_value",
    "fivegs_pcffunction_pa_sessionnbr_value",
    "fivegs_pcffunction_pa_policysmassosucc_value",
    "fivegs_smffunction_sm_pdusessioncreationreq_value",
    "fivegs_smffunction_sm_qos_flow_nbr_value",
    "log_type",
    "application",
]

# Report the features that the merged frame does not contain.
missing_metrics = [name for name in featured_metrics if name not in merged_data.columns]
if missing_metrics:
    print("❌ Missing metrics in final DataFrame:")
    print("\n".join(f"  - {name}" for name in missing_metrics))

# Add each missing feature as a constant 0.0 column.
merged_data = merged_data.assign(**{name: 0.0 for name in missing_metrics})

# Column order: timestamp, the features in their listed order, then whatever
# other columns exist (each label once, in first-seen order).
ordered_columns = list(dict.fromkeys(["timestamp", *featured_metrics, *merged_data.columns]))
merged_data = merged_data[ordered_columns]

❌ Missing metrics in final DataFrame:
  - fivegs_pcffunction_pa_sessionnbr_value
  - fivegs_pcffunction_pa_policysmassosucc_value
  - fivegs_smffunction_sm_pdusessioncreationreq_value
  - fivegs_smffunction_sm_qos_flow_nbr_value


In [None]:
# Convert the application and log-type labels to numeric codes using the
# predefined maps. Missing values and labels without a code both become 0.
APP_MAP = {"0": 0, "amf": 1, "gmm": 2, "udm": 3, "smf": 4, "upf": 5}
LOG_MAP = {"0": 0, "registration": 1, "number_of_sessions_or_ues": 2, "nothing": 3, "remove": 4, "error": 5}

for column, code_map in (("application", APP_MAP), ("log_type", LOG_MAP)):
    # A text label that has no code would silently end up as 0, i.e. become
    # indistinguishable from "no log at this timestamp". The encoding is kept
    # unchanged, but the collapse is reported so it does not go unnoticed.
    unmapped = sorted(
        {
            value
            for value in merged_data[column].dropna().unique()
            if isinstance(value, str) and value not in code_map
        }
    )
    if unmapped:
        print(f"⚠️ Values in '{column}' without a code (encoded as 0): {unmapped}")

    merged_data[column] = merged_data[column].map(code_map).fillna(0)

<div class="alert alert-warning">
<b>Mapovanie:</b> je potrebné zabezpečiť aby boli súbory 'app_map.json' a 'log_map.json' vytvorené a v správnom adresári.
</div>

In [None]:
UC_MAP = {"uc1": 0, "uc2": 1, "uc3": 2, "uc4": 3, "uc5": 4, "uc6": 5}

# Read the current use case (UC) label from the text file.
with open("./data/current_uc.txt", "r") as f:
    current_uc = f.read().strip()

# A label that is not in UC_MAP is encoded as 0, which is also the code of
# "uc1". The fallback is kept for compatibility, but it is reported.
if current_uc not in UC_MAP:
    print(f"⚠️ Unknown use case {current_uc!r} in current_uc.txt; encoded as 0")

# Always create the column, even for an empty frame, so that the mapping below
# cannot fail with a KeyError.
merged_data["current_uc"] = current_uc

# Convert the values in the "current_uc" column to their numeric codes.
merged_data["current_uc"] = merged_data["current_uc"].map(UC_MAP).fillna(0)

<div class="alert alert-warning">
<b>Mapovanie:</b> je potrebné zabezpečiť, aby boli súbory 'uc_map.json' a 'current_uc.txt' vytvorené a v správnom adresári.
</div>

In [44]:
# #to csv
# merged_data.to_csv("data_2.csv", index=False, encoding=ENCODING)

In [45]:
# Display the assembled dataset (bare last expression -> rich notebook output).
merged_data

Unnamed: 0,timestamp,fivegs_smffunction_sm_n4sessionreportsucc_value,fivegs_pcffunction_pa_sessionnbr_value,fivegs_pcffunction_pa_policysmassosucc_value,fivegs_smffunction_sm_pdusessioncreationreq_value,fivegs_smffunction_sm_qos_flow_nbr_value,log_type,application,current_uc
0,2025-04-24 22:58:41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2025-04-24 22:58:42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2025-04-24 22:58:43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2025-04-24 22:58:44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2025-04-24 22:58:45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5,2025-04-24 22:58:46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,2025-04-24 22:58:47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,2025-04-24 22:58:48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
8,2025-04-24 22:58:49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
9,2025-04-24 22:58:50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
csv_file = "./data/running_data.csv"

# Write the header only when the file does not exist yet or is empty.
write_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0

# When the file already holds data, append only rows newer than its latest timestamp.
if not write_header:
    try:
        stored_timestamps = pd.read_csv(csv_file, usecols=["timestamp"])["timestamp"]
        # Compare as datetimes rather than as strings; unparseable cells become NaT.
        last_timestamp = pd.to_datetime(stored_timestamps, errors="coerce").max()
        # A file holding only a header (or no valid timestamp) yields NaT.
        # Filtering against NaT would discard every row, so skip the filter then.
        if pd.notna(last_timestamp):
            merged_data = merged_data[merged_data["timestamp"] > last_timestamp]
    except Exception as e:
        print(f"⚠️ Issue reading existing CSV: {e}. Proceeding without filtering.")

# Append the new data
if not merged_data.empty:
    merged_data.to_csv(csv_file, mode="a", index=False, header=write_header)

<div class="alert alert-success">
<b>Uloženie datasetu:</b> je potrebné zabezpečiť aby bol súbor 'running_data.csv' vytvorený a v správnom adresári.
</div>