# Predspracovanie dát

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import json
from joblib import dump
import numpy as np

Vybrané metriky z EDA:

In [2]:
# The metrics selected during EDA are stored in selected_features.json; load them.

SELECTED_FEATURES_JSON = 'selected_features.json'
# JSON text is UTF-8 by specification, so do not rely on the platform default encoding.
with open(SELECTED_FEATURES_JSON, 'r', encoding='utf-8') as f:
    # Keep the parsed document under its own name so that SELECTED_FEATURES
    # is only ever bound to the list of feature names, never to the dict.
    selected_features_doc = json.load(f)

# The feature names live under the top-level "features" key.
SELECTED_FEATURES = selected_features_doc['features']
SELECTED_FEATURES

['fivegs_smffunction_sm_n4sessionreportsucc_value',
 'fivegs_pcffunction_pa_sessionnbr_value',
 'fivegs_pcffunction_pa_policysmassosucc_value',
 'fivegs_smffunction_sm_pdusessioncreationreq_value',
 'fivegs_smffunction_sm_qos_flow_nbr_value',
 'log_type',
 'application']

<div class="alert alert-block alert-warning">  
<b>Upozornenie:</b> Zoznam metrík sa načítava zo statického súboru, takže ak by sa počas behu digitálneho dvojčaťa zmenilo, ktoré metriky sú dôležité, týmto spôsobom to nezachytíme.  
</div>

Mapovanie kategorických premenných na číselné hodnoty:

In [3]:
# Lookup tables used to encode log_type, application and the use case (UC).
def _read_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


LOG_MAP = _read_json('log_map.json')
APP_MAP = _read_json('app_map.json')
UC_MAP = _read_json('uc_map.json')

In [4]:
def preprocess_df(df: pd.DataFrame, scaler=None, fit_scaler=False):
    """Turn a raw DataFrame into a scaled feature matrix plus optional labels.

    The caller's frame is never modified; all work happens on a copy.

    Processing order:
      1. Fill missing values column-wise with the most frequent value of
         that column in ``df``.
      2. Parse ``timestamp`` with ``pd.to_datetime`` when the column exists.
      3. Encode ``application``, ``log_type`` and ``current_uc`` through
         ``APP_MAP``, ``LOG_MAP`` and ``UC_MAP`` when the columns exist.
      4. Drop rows that still have NaN in any ``SELECTED_FEATURES`` column.
      5. Scale the ``SELECTED_FEATURES`` columns with ``scaler``.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; must contain every column listed in ``SELECTED_FEATURES``.
    scaler : optional
        Scaler to use. When ``None`` a new ``StandardScaler`` is created,
        which is only valid together with ``fit_scaler=True``.
    fit_scaler : bool, default False
        If true, ``scaler.fit_transform`` is used; otherwise
        ``scaler.transform`` is used.

    Returns
    -------
    tuple
        ``(X_scaled, labels, scaler)`` where ``X_scaled`` is the scaler's
        output for the selected features, ``labels`` is the mapped
        ``current_uc`` column of the surviving rows (``None`` when the column
        is absent) and ``scaler`` is the scaler that was used.

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If ``scaler`` is ``None`` and ``fit_scaler`` is false.
    KeyError
        If a column listed in ``SELECTED_FEATURES`` is missing from ``df``.
    """
    # A scaler created here is unfitted, so transform() could never succeed.
    # Fail before doing any work, with the same exception type scikit-learn
    # would raise later.
    if scaler is None and not fit_scaler:
        from sklearn.exceptions import NotFittedError
        raise NotFittedError(
            "preprocess_df needs a fitted scaler: pass scaler=<fitted scaler> "
            "or call with fit_scaler=True."
        )

    missing_columns = [col for col in SELECTED_FEATURES if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Input is missing selected feature columns: {missing_columns}")

    df = df.copy()

    # Missing values: impute with the per-column mode.
    # mode() yields no rows for an empty or all-NaN frame, so there is
    # nothing to impute with in that case.
    # NOTE(review): the modes come from the frame being processed, so data
    # passed in later is imputed with its own modes, not the training ones.
    column_modes = df.mode()
    if not column_modes.empty:
        df = df.fillna(column_modes.iloc[0])

    # Timestamp
    if 'timestamp' in df.columns:
        df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Encode application, log_type and current_uc through their lookup maps.
    categorical_maps = (
        ('application', APP_MAP),
        ('log_type', LOG_MAP),
        ('current_uc', UC_MAP),
    )
    for column, mapping in categorical_maps:
        if column in df.columns:
            df[column] = df[column].map(mapping)

    # Drop rows that still lack a value in any selected feature.
    # NOTE(review): only SELECTED_FEATURES are checked here, so a NaN in
    # current_uc after mapping is kept in the returned labels.
    df = df.dropna(subset=SELECTED_FEATURES)

    # Select the feature columns.
    X = df[SELECTED_FEATURES]

    # Normalisation
    if scaler is None:
        scaler = StandardScaler()

    X_scaled = scaler.fit_transform(X) if fit_scaler else scaler.transform(X)

    # df.get returns None when there is no current_uc column.
    return X_scaled, df.get('current_uc'), scaler


<div class="alert alert-block alert-info">  
<b>Predspracovanie dát:</b> Dáta predspracujeme tak, ako sme to robili v súbore EDA. 
</div>

In [6]:
# Example usage: preprocess the synthetic dataset and fit the scaler on it.
df = pd.read_csv("../synthetic_data.csv")
X_scaled, y_scaled, fitted_scaler = preprocess_df(df=df, scaler=None, fit_scaler=True)

# Write the scaled features and the labels returned by preprocess_df to disk.
for npy_path, array in (("X_scaled.npy", X_scaled), ("y_labels.npy", y_scaled)):
    np.save(npy_path, array)

# Save the scaler for use during real-time inference.
dump(fitted_scaler, 'scaler.joblib')

['scaler.joblib']

<div class="alert alert-block alert-success">  
<b>'scaler.joblib':</b> Uložíme aj scaler, aby sme mohli normalizovať dáta pred ich nahratím do digitálneho dvojčaťa.
</div>