#### Importing Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
import math
import joblib
from sklearn.metrics import mean_squared_error, mean_pinball_loss, r2_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTENC
import lightgbm as lgb
import json


#### Constants

In [None]:
# Root folder of the CSV outputs; one sub-folder per entry of NODE_TYPES is read from it
PATH_TO_CVS = 'output/output-energy/'
# JSON file holding the definition of the function groups
GROUP_FILE_PATH = 'group_list.json'
# The single quantile whose residuals are shown in the quantile residual scatter plot
ONLY_QUANTILE = 0.95
# Column-name prefixes of the per-function metrics; a function name is appended to each prefix
FUNCTION_COLUMNS = ['rate_function_', 'success_rate_function_', 'cpu_usage_function_', 'ram_usage_function_', 'power_usage_function_', 'replica_', 'overloaded_function_', 'medium_latency_function_']
# A selection of node metrics to represent in the boxplot and in the scatter plot
COL_TO_PLOT = ['cpu_usage_idle_node', 'cpu_usage_node', 'ram_usage_idle_node', 'ram_usage_node', 'power_usage_idle_node', 'power_usage_node']
# Quantiles for which a quantile-regression model is trained and evaluated
QUANTILES = [0.05, 0.95]
# Node types; each one names a sub-folder of PATH_TO_CVS
NODE_TYPES = ["LIGHT", "MID", "HEAVY"]

#### Utility functions

In [None]:
def fill_NaN(X):
    """Replace the missing values of X in place and return X.

    Columns whose name starts with 'success_rate_' get 1 wherever a value is
    missing; every other column gets 0.
    """
    for col in X:
        replacement = 1 if col.startswith('success_rate_') else 0
        X.loc[:, col] = X.loc[:, col].fillna(replacement)
    return X

In [None]:
def resample_dataset(X, y):
    """Resample X and y together with replacement, using a fixed seed of 42.

    Returns the resampled (X, y) pair.
    """
    bootstrapped = resample(X, y, replace=True, random_state=42)
    X_resampled, y_resampled = bootstrapped
    return X_resampled, y_resampled

In [None]:
def split_dataset(X, y):
    """Split X and y into train and test parts and print the four shapes.

    The test part is 20% of the rows and the split is seeded with 42.
    Returns (X_train, X_test, y_train, y_test).
    """
    parts = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = parts
    print(*(part.shape for part in (X_train, X_test, y_train, y_test)))
    return X_train, X_test, y_train, y_test

In [None]:
def convert_dataset(X_train, X_test, y_train, y_test):
    """Return the four dataset parts cast to float32, in the order received."""
    return tuple(
        part.astype(np.float32)
        for part in (X_train, X_test, y_train, y_test)
    )

In [None]:
def wmse_score(y_true, y_pred):
    """Return the weighted mean squared error over the CPU and RAM targets.

    Both arguments are indexed by the columns 'cpu_usage_node' and
    'ram_usage_node'. For each target the rows of y_true are split at the
    column median; the share of rows on each side is used as the class weight.
    The result is the average of the two per-target weighted terms.
    """
    n_rows = y_true.shape[0]

    def _median_split(column):
        # Row counts at/below and above the median, plus their shares of all rows
        median = y_true[column].median()
        n_low = y_true[y_true[column] <= median].shape[0]
        n_high = y_true[y_true[column] > median].shape[0]
        return n_low, n_high, n_low / n_rows, n_high / n_rows

    cpu_low, cpu_high, w_cpu_low, w_cpu_high = _median_split('cpu_usage_node')
    ram_low, ram_high, w_ram_low, w_ram_high = _median_split('ram_usage_node')

    # Plain MSE of each target
    mse_cpu = mean_squared_error(y_true['cpu_usage_node'], y_pred['cpu_usage_node'])
    mse_ram = mean_squared_error(y_true['ram_usage_node'], y_pred['ram_usage_node'])

    # Per-target weighted terms, then their average
    cpu_term = w_cpu_low * mse_cpu * n_rows / (w_cpu_low * cpu_low + w_cpu_high * cpu_high)
    ram_term = w_ram_low * mse_ram * n_rows / (w_ram_low * ram_low + w_ram_high * ram_high)
    return (cpu_term + ram_term) / 2

In [None]:
def metrics(task_type, y_test, y_pred, quantile):
    """Print the evaluation metrics that fit the task type.

    For task_type == 'regression' it prints MSE, RMSE, R-squared and the
    standard deviation of the predictions, plus the pinball loss when
    quantile is not 0, and returns (mse, rmse, r2, quantile_loss, std_dev).
    When quantile is 0, quantile_loss is an explanatory string instead.

    For a task_type ending in 'classification' it prints the classification
    report and returns None. Any other task_type does nothing.
    """
    if task_type == 'regression':
        mse = mean_squared_error(y_test, y_pred)
        print("mse:", mse)
        rmse = math.sqrt(mse)
        print("rmse:", rmse)
        r2 = r2_score(y_test, y_pred)
        print("R-squared score:", r2)
        std_dev = np.std(y_pred)
        print("Standard deviation:", std_dev)
        if quantile == 0:
            quantile_loss = "Not calculated for this target"
        else:
            quantile_loss = mean_pinball_loss(y_test, y_pred, alpha=quantile)
            print("Quantile loss with library: ", quantile_loss)
        return mse, rmse, r2, quantile_loss, std_dev

    if task_type.endswith('classification'):
        print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
def plot_regression(y_test, y_pred, target_name):
    """Show two figures for one regression target.

    The first plots predicted against observed values with a fitted straight
    line; the second plots the residuals (observed - predicted) against the
    observed values. y_test and y_pred must be NumPy arrays, since ravel()
    and flatten() are called on them.
    """
    # Straight line fitted through the (observed, predicted) points
    slope, intercept = np.polyfit(y_test.ravel(), y_pred.ravel(), 1)

    # Predicted vs observed, both axes limited to [0, 1]
    plt.plot(y_test, y_pred, 'o', color='red', fillstyle='none')
    plt.plot(y_test, slope * y_test + intercept, linestyle='--')
    plt.xlabel('Valori osservati')
    plt.ylabel('Valori predetti')
    plt.title('Regressione di ' + target_name)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.show()

    # Residuals against the observed values
    observed = y_test.flatten()
    residuals = observed - y_pred.flatten()
    sns.scatterplot(x=observed, y=residuals, label='Osservazioni')

    # Zero-residual reference line
    plt.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.xlabel('Valori osservati')
    plt.ylabel('Residui (Valori osservati - Valori predetti)')
    plt.title(f'Regressione Standard - {target_name}')
    plt.legend()
    plt.show()

In [None]:
def plot_quantile_regression(y_test, y_pred, target_name):
    """Show the quantile-regression figures for one target.

    y_pred is iterated column by column (one column per quantile, named by
    the quantile as a string). The first figure overlays, for every column,
    the predicted-vs-observed points and a fitted straight line. The second
    figure plots the residuals of the ONLY_QUANTILE column against the
    observed values.
    """
    # Fitted lines are drawn with increasing zorder, starting from 3
    for z_level, col in enumerate(y_pred, start=3):
        predicted = y_pred[col]
        slope, intercept = np.polyfit(y_test.ravel(), predicted, 1)
        plt.plot(y_test, predicted, 'o', fillstyle='none', label=f'Osservazioni - {col}')
        plt.plot(y_test, slope * y_test + intercept, linestyle='--', zorder=z_level)
    plt.xlabel('Valori osservati')
    plt.ylabel('Valori predetti')
    plt.title('Regressione Quantile di ' + target_name)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.legend()
    plt.show()

    # Residuals of the single quantile selected by ONLY_QUANTILE
    observed = y_test.flatten()
    residuals = observed - y_pred[str(ONLY_QUANTILE)]
    sns.scatterplot(x=observed, y=residuals, label='Osservazioni')

    # Zero-residual reference line
    plt.axhline(y=0, color='black', linestyle='--', linewidth=1)
    plt.xlabel('Valori osservati')
    plt.ylabel('Residui quantili (Valori osservati - Valori predetti)')
    plt.title(f'Regressione Quantile {ONLY_QUANTILE} - {target_name}')
    plt.legend()
    plt.show()

In [None]:
def plot_confusion_matrix(y_test, y_pred, target):
    """Plot the confusion matrix of one target as a heatmap.

    :param y_test: observed values, indexable by `target`.
    :param y_pred: predicted values, indexable by `target`.
    :param target: name of the column to compare.
    """
    # Rows of the matrix are the observed labels, columns the predicted ones
    cm = confusion_matrix(y_test[target], y_pred[target])

    # The heatmap draws matrix columns along x and matrix rows along y, so
    # predictions belong on the x-axis and observations on the y-axis
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
    plt.xlabel('Valori predetti')
    plt.ylabel('Valori osservati')
    plt.title('Confusion matrix')
    plt.show()

In [None]:
def train_model(target_name, X_train, y_train, quantile):
    """Train a LightGBM model for one target, save it to disk and return it.

    The kind of model depends on the arguments:
      * target names starting with 'overloaded' or 'replica' get an
        LGBMClassifier;
      * otherwise a non-zero `quantile` gets an LGBMRegressor with the
        quantile objective and alpha=quantile;
      * otherwise a plain LGBMRegressor is used.

    The fitted model is dumped with joblib to
    './system-forecaster-models/groups/<target_name>/<model_type>/model.joblib',
    where <model_type> is '' for classifiers, 'quantile' followed by the
    quantile without the dot for quantile regressors, and 'regression'
    otherwise.

    :param target_name: name of the target column, also used in the save path.
    :param X_train: training features.
    :param y_train: training target values.
    :param quantile: quantile to estimate, or 0 for standard regression.
    :return: the fitted model.
    """
    if target_name.startswith(('overloaded', 'replica')):
        # Classification task
        model_type = ""
        model = lgb.LGBMClassifier()
    elif quantile != 0:
        # Quantile regression
        model = lgb.LGBMRegressor(objective='quantile', alpha=quantile)
        model_type = "quantile" + str(quantile).replace('.', '')
    else:
        # Standard regression
        model = lgb.LGBMRegressor(objective='regression')
        model_type = "regression"

    model.fit(X_train, y_train)

    path = './system-forecaster-models/groups/' + target_name + "/" + model_type
    # exist_ok avoids the race between checking for the folder and creating it
    os.makedirs(path, exist_ok=True)
    joblib.dump(model, path + "/model.joblib")

    return model



In [None]:
def plot_node_type_distribution(df):
    """
    Show, as a bar chart, the number of rows for each node type in a DataFrame.

    :param df: DataFrame containing the 'node_type' column with the node types.
    """
    # Number of rows per node type
    rows_per_type = df['node_type'].value_counts()

    plt.figure(figsize=(10, 6))
    rows_per_type.plot(kind='bar')

    plt.title('Distribuzione del Numero di Dati per Tipologia di Nodo')
    plt.xlabel('Tipologia di Nodo (0 = Heavy, 1 = Mid, 2 = Light)')
    plt.ylabel('Numero di Righe')
    # Keep the tick labels horizontal
    plt.xticks(rotation=0)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.show()

In [None]:
def plot_overloaded_node_distribution(df):
    """
    Show, as a bar chart, how many rows have 'overloaded_node' equal to 1 and
    how many have it equal to 0.

    :param df: DataFrame containing the 'overloaded_node' column.
    """
    # Number of rows per value of the overloaded flag
    rows_per_state = df['overloaded_node'].value_counts()

    plt.figure(figsize=(8, 5))
    rows_per_state.plot(kind='bar')

    plt.title('Distribuzione di Nodi Sovraccaricati')
    plt.xlabel('Stato Sovraccarico (0 = No, 1 = Sì)')
    plt.ylabel('Numero di Righe')
    # Keep the tick labels horizontal
    plt.xticks(rotation=0)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.show()

#### Retrieving dataset

In [None]:
# Load every CSV of every node type into a single dataframe.
# The per-file frames are collected first and concatenated once, so the result
# does not depend on how pandas treats an empty seed frame in concat and the
# accumulated rows are not copied again for each node type.
NODE_TYPE_CODES = {"HEAVY": 0, "MID": 1}  # every other node type is encoded as 2
dataframes = []
for node_type in NODE_TYPES:
    # Sorted so that the row order does not depend on the directory listing order
    file_csv = sorted(file for file in os.listdir(PATH_TO_CVS + node_type) if file.endswith('.csv'))
    for file in file_csv:
        file_path = os.path.join(PATH_TO_CVS + node_type, file)
        df_temp = pd.read_csv(file_path)
        # Remove the columns in the dataframe that begin with "function_"
        df_temp.drop(columns=[col for col in df_temp if col.startswith('function_')], inplace=True)
        # Add the "node_type" column with the numeric code of this node type on every row
        df_temp["node_type"] = NODE_TYPE_CODES.get(node_type, 2)
        dataframes.append(df_temp)

if not dataframes:
    # Fail with a clear message instead of a later KeyError on 'node_type'
    raise FileNotFoundError(f"No CSV files found under {PATH_TO_CVS} for node types {NODE_TYPES}")

df = pd.concat(dataframes, axis=0, ignore_index=True)

df = fill_NaN(df)
print(df["node_type"].value_counts())

# Function names are what follows the 'rate_function_' prefix of the rate columns
rate_prefix = 'rate_function_'
functions = [col[len(rate_prefix):] for col in df if col.startswith(rate_prefix)]

# A function with a rate of 0 in a row gets its usage and replica metrics zeroed in that row
for function in functions:
    df.loc[df['rate_function_' + function] == 0, ['cpu_usage_function_' + function, 'ram_usage_function_' + function, 'power_usage_function_' + function, 'replica_' + function]] = 0


In [None]:
# Bar chart of the number of rows per node type
plot_node_type_distribution(df)
# Bar chart of the number of rows per value of 'overloaded_node'
plot_overloaded_node_distribution(df)

#### Removing outliers

In [None]:
# Handle the outliers of every target column. Rows are grouped by identical
# rate columns and node type; within a group, values farther than `threshold`
# standard deviations from the group mean are replaced by that mean.
functions_column = [col for col in df if col.startswith('rate')]
target_prefixes = ('power_usage_', 'cpu_usage_', 'ram_usage_', 'overloaded_node', 'medium_latency')
targets = [col for col in df if col.startswith(target_prefixes) and 'idle' not in col]
grouped = df.groupby(functions_column + ['node_type'])
threshold = 1
for target in targets:
    print(target)
    if target == 'overloaded_node':
        # The flag becomes 1 only when every row of the group has it set
        new_overloaded = grouped[target].transform('all')
        df['overloaded_node'] = new_overloaded.astype(int)
        print(df["overloaded_node"].value_counts())
        continue
    mean = grouped[target].transform('mean')
    std = grouped[target].transform('std')
    outliers = (df[target] > mean + threshold * std) | (df[target] < mean - threshold * std)
    print(outliers.sum())
    df[target] = df[target].where(~outliers, mean)
df_only_useful = df[functions_column + targets]

#### Create dataset of groups

In [None]:
# Get the groups definition from the group_list.json
with open(GROUP_FILE_PATH, 'r') as json_file:
    groups_number = json.load(json_file)

# Name each group after a function it contains: the group with "figlet" is
# LOW_USAGE, the one with "nmap" is HIGH_USAGE, any other is MEDIUM_USAGE
groups = {}
for key, value in groups_number.items():
    if "figlet" in value:
        groups["LOW_USAGE"] = value
    elif "nmap" in value:
        groups["HIGH_USAGE"] = value
    else:
        groups["MEDIUM_USAGE"] = value

# Aggregate the per-function metrics into one column per (metric, group)
df_groups = pd.DataFrame()
columns = {}  # not read anywhere in this section
for key, group in groups.items():
    for metric in FUNCTION_COLUMNS:
        temp_columns = [metric + fun for fun in group]
        col_name = metric.replace('function_', '')

        # Rename the rate column so that it carries the 'rate_group_' prefix
        if col_name == 'rate_':
            col_name = 'rate_group_'

        is_additive = any(tag in metric for tag in ('rate', 'usage', 'power', 'replica'))
        if is_additive and 'success' not in metric:
            # Rates, usages and replicas are summed over the functions of the group
            df_groups[col_name + key] = df[temp_columns].sum(axis=1)
        elif 'overloaded' in metric:
            # The group is overloaded when any of its functions is
            df_groups[col_name + key] = df[temp_columns].any(axis=1).astype(int)
        else:
            # Remaining metrics are averaged, ignoring the values equal to 0;
            # the masked copy is only built here because only this branch uses it
            df_no_zeros = df[temp_columns].mask(df[temp_columns] == 0)
            df_groups[col_name + key] = df_no_zeros.mean(axis=1)

df_groups = fill_NaN(df_groups)

# Gets all the node columns names present in df
node_metrics = [col for col in df if col.endswith('node') or 'node_type' in col]

# Copy all the node columns in df_groups
for metric in node_metrics:
    df_groups[metric] = df[metric]

#### Partitioning into features and targets and oversampling

In [None]:
# Split df_groups into feature columns (params) and target columns (targets).
# NOTE: 'replica' columns are currently left out of the targets.
target_prefixes = ('power_usage_', 'cpu_usage_', 'ram_usage_', 'overloaded_node', 'medium_latency')
targets = [col for col in df_groups if col.startswith(target_prefixes) and 'idle' not in col]
params = [col for col in df_groups if col.startswith('rate_') or 'node_type' in col]

# Per-target datasets; features_datasets['original'] holds the non-resampled features
target_datasets = {}
features_datasets = {}
features_datasets['original'] = df_groups[params]
for target_name in targets:
    X = df_groups[params]
    y = df_groups[[target_name]]
    node_type_index = X.columns.get_loc("node_type")

    if "overloaded" in target_name:
        print("Status of target: " + target_name)
        print(y.value_counts())
        # Store the non-resampled features first, so that this target always has
        # an entry in features_datasets even when the oversampling fails
        features_datasets[target_name] = X
        # Oversampling; node_type is declared as the categorical feature
        sm = SMOTENC(random_state=42, categorical_features=[node_type_index])
        try:
            X_res, y_res = sm.fit_resample(X, y)
        except Exception as error:
            # Best effort: keep the original data, but report why SMOTE failed
            print("It was not possible to perform SMOTE for target " + target_name)
            print(f"Reason: {error}")
        else:
            # Combined resampled frame (not read elsewhere in this section)
            result_df = pd.concat([X_res, y_res], axis=1)
            print("Status after SMOTE:")
            print(y_res.value_counts())
            print(X_res["node_type"].value_counts())
            features_datasets[target_name] = X_res
            y = y_res

    target_datasets[target_name] = y

#### Preprocessing

In [None]:
# Per-target preprocessing: scale features and target with MinMaxScaler,
# save the scalers to disk and split the data into train and test sets.
x_train_dict = {}
x_test_dict = {}
y_train_dict = {}
y_test_dict = {}
y_scalers = {}
scaler_exist = False

# Seed of the train/test split, so that saved models and printed metrics are
# reproducible from one run to the next
SPLIT_RANDOM_STATE = 42

# The output folders are created once; exist_ok avoids the race between
# checking for a folder and creating it
scaler_x_path = './scalers/groups/scaler_x/'
scaler_y_path = './scalers/groups/scaler_y/'
os.makedirs(scaler_x_path, exist_ok=True)
os.makedirs(scaler_y_path, exist_ok=True)

for target_name in targets:
    # Get the target dataset for the current iteration
    target_dataset = target_datasets[target_name]

    # Overloaded targets use the feature set stored under their own name;
    # every other target uses the original features
    if "overloaded" in target_name:
        X = features_datasets[target_name]
    else:
        X = features_datasets['original']

    # NOTE(review): both scalers are fitted on all rows, before the train/test
    # split, so the test rows influence the scaling - confirm this is intended
    scaler_x = MinMaxScaler()
    scaler_x.fit(X)
    X_scaled = scaler_x.transform(X)

    # The target scaler is always fitted and kept in y_scalers, but
    # classification targets are passed on unscaled
    scaler_y = MinMaxScaler()
    scaler_y.fit(target_dataset)
    y_scalers[target_name] = scaler_y
    if target_name.startswith('overloaded') or target_name.startswith('replica'):
        y_scaled = target_dataset
    else:
        y_scaled = scaler_y.transform(target_dataset)

    # Save the scaler for x: one file per overloaded target, and a single
    # shared 'features.joblib' written at the first non-overloaded target
    if target_name.startswith('overloaded'):
        joblib.dump(scaler_x, os.path.join(scaler_x_path, target_name + ".joblib"))
    elif not scaler_exist:
        scaler_exist = True
        joblib.dump(scaler_x, os.path.join(scaler_x_path, "features.joblib"))

    # Save the scaler for y, except for overloaded targets
    if not target_name.startswith("overloaded"):
        joblib.dump(scaler_y, os.path.join(scaler_y_path, target_name + ".joblib"))

    # Split the dataset into training and testing with a fixed seed
    x_train, x_test, y_train, y_test = train_test_split(X_scaled, y_scaled, random_state=SPLIT_RANDOM_STATE)
    x_train_dict[target_name] = x_train
    x_test_dict[target_name] = x_test
    y_train_dict[target_name] = y_train
    y_test_dict[target_name] = y_test

#### Train models

In [None]:
# Train one model per target, plus one model per quantile for regression targets.
# trained_models maps the target name, or "<target name> <quantile>", to the fitted model.
trained_models = {}
for target_name in targets:
    # Training split of the current target
    x_train = x_train_dict[target_name]
    y_train = y_train_dict[target_name]

    # A quantile of 0 requests the default (non-quantile) model
    model = train_model(target_name, x_train, y_train, 0)
    trained_models[target_name] = model

    # Classification targets have no quantile models
    if target_name.startswith(('overloaded', 'replica')):
        continue
    for quantile in QUANTILES:
        model = train_model(target_name, x_train, y_train, quantile)
        trained_models[f"{target_name} {quantile}"] = model

#### Predict

In [None]:
# Predict on the test split of every target.
# all_predictions maps the target name to the predictions of its default model
# and "<target name> quantiles" to a dict of predictions keyed by quantile string.
all_predictions = {}
for target_name in targets:
    # Test split of the current target
    x_test = x_test_dict[target_name]
    y_test = y_test_dict[target_name]

    # Test features and target side by side, with named columns
    test_data = pd.DataFrame(np.column_stack((x_test, y_test)), columns=[*params, target_name])
    # Only the feature columns are given to the models
    test_features = test_data.drop(columns=[target_name])

    model = trained_models[target_name]
    y_pred = model.predict(test_features)
    all_predictions[target_name] = y_pred

    # Classification targets have no quantile models
    if target_name.startswith(('overloaded', 'replica')):
        continue
    y_pred_quantiles = {}
    for quantile in QUANTILES:
        model = trained_models[target_name + " " + str(quantile)]
        y_pred_quantiles[str(quantile)] = model.predict(test_features)
    all_predictions[target_name + " quantiles"] = y_pred_quantiles

#### Plot results

In [None]:
# Print the metrics and show the plots for every stored prediction
quantiles_string = [str(q) for q in QUANTILES]

for target_name, y_pred in all_predictions.items():
    print(f"Target: {target_name}")

    # The task type follows from the prefix of the target name
    task_type = (
        'binary classification' if target_name.startswith('overloaded')
        else 'classification' if target_name.startswith('replica')
        else 'regression'
    )

    if not target_name.endswith('quantiles'):
        # Predictions of the default model
        y_test = y_test_dict[target_name]
        y_test_df = pd.DataFrame(y_test, columns=[target_name])
        y_pred_df = pd.DataFrame(y_pred, columns=[target_name])
        metrics(task_type, y_test_df, y_pred_df, 0)
        if task_type == 'regression':
            plot_regression(y_test, y_pred, target_name)
    else:
        # Quantile predictions: the key is "<target name> quantiles", so the
        # target name is everything before the first space
        trunc_target = target_name[:target_name.index(' ')]
        y_test = y_test_dict[trunc_target]
        y_test_df = pd.DataFrame(y_test, columns=[trunc_target])
        y_pred_df = pd.DataFrame(y_pred, columns=quantiles_string)
        for quantile in QUANTILES:
            # Share of test rows whose prediction is above the observed value
            statistics = (y_pred_df[str(quantile)] > y_test_df[trunc_target]).value_counts(normalize=True)
            if statistics.size > 1:
                true_percentage = statistics[True]
                false_percentage = statistics[False]
                print(f"Prediction for alpha {quantile} are greater than true values in {true_percentage * 100} % of cases and less or equal in {false_percentage * 100} % of cases.")
            metrics(task_type, y_test_df, y_pred_df[str(quantile)], quantile)
        plot_quantile_regression(y_test, y_pred_df, trunc_target)

    print("-" * 30)
