#### Importing Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix, accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

from imblearn.over_sampling import SMOTE
from collections import Counter

#### Utility functions

In [None]:
# Function used to fill NaN values within the dataframe X
def fill_NaN(X):
  """Fill the missing values of ``X`` in place and return the same dataframe.

  Columns whose label starts with ``'success_rate_'`` get their NaN values
  replaced by 1; every other column gets its NaN values replaced by 0.

  Args:
    X: pandas DataFrame to clean. It is modified in place.

  Returns:
    The same DataFrame object that was passed in, with the NaN values filled.
  """
  for col in X:
    # Column labels are not guaranteed to be strings (e.g. integer labels),
    # so normalise the label before testing its prefix instead of crashing.
    fill_value = 1 if str(col).startswith('success_rate_') else 0
    X.loc[:, col] = X.loc[:, col].fillna(fill_value)
  return X

In [None]:
# Function used to draw a resampled (with replacement) copy of the dataset
def resample_dataset(X, y):
  """Resample ``X`` and ``y`` together with replacement.

  Delegates to ``sklearn.utils.resample`` with ``replace=True`` and a fixed
  ``random_state`` of 42.

  Args:
    X: feature matrix.
    y: targets associated with ``X``.

  Returns:
    The resampled features and the resampled targets, as a pair.
  """
  bootstrap = resample(X, y, replace=True, random_state=42)
  return bootstrap[0], bootstrap[1]

In [None]:
# Function used to split the dataset into training and test set
def split_dataset(X, y):
  """Split ``X`` and ``y`` into training and test partitions.

  Uses ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``
  and prints the shape of each of the four resulting partitions.

  Args:
    X: feature matrix.
    y: targets associated with ``X``.

  Returns:
    The tuple ``(X_train, X_test, y_train, y_test)``.
  """
  parts = train_test_split(X, y, test_size=0.2, random_state=42)
  print(*(part.shape for part in parts))
  return tuple(parts)

In [None]:
def convert_dataset(X_train, X_test, y_train, y_test):
  """Cast the four dataset partitions to ``float32``.

  Every argument must provide an ``astype`` method (NumPy arrays and pandas
  objects do).

  Args:
    X_train: training features.
    X_test: test features.
    y_train: training targets.
    y_test: test targets.

  Returns:
    The tuple ``(X_train, X_test, y_train, y_test)`` with each element
    converted to ``np.float32``.
  """
  as_float32 = [part.astype(np.float32) for part in (X_train, X_test, y_train, y_test)]
  return tuple(as_float32)

In [None]:
# Function used to calculate the weighted mean squared error
def wmse_score(y_true, y_pred):
  """Compute a weighted MSE over the CPU and RAM usage targets.

  For each target the rows of ``y_true`` are divided at the target's median
  into a "majority" group (values <= median) and a "minority" group
  (values > median). The fractions of rows in the two groups are used to
  rescale the plain MSE of that target, and the two rescaled values are
  averaged.

  Args:
    y_true: DataFrame with the 'cpu_usage_node' and 'ram_usage_node' columns.
    y_pred: predictions, indexable by the same two column names.

  Returns:
    The average of the two rescaled per-target MSE values.
  """
  total = y_true.shape[0]

  def _group_stats(column):
    # Returns (majority weight, minority weight, majority count, minority count)
    median = y_true[column].median()
    n_majority = y_true[y_true[column] <= median].shape[0]
    n_minority = y_true[y_true[column] > median].shape[0]
    return n_majority / total, n_minority / total, n_majority, n_minority

  w_maj_cpu, w_min_cpu, n_maj_cpu, n_min_cpu = _group_stats('cpu_usage_node')
  w_maj_ram, w_min_ram, n_maj_ram, n_min_ram = _group_stats('ram_usage_node')

  # Plain MSE of each target
  mse_cpu = mean_squared_error(y_true['cpu_usage_node'], y_pred['cpu_usage_node'])
  mse_ram = mean_squared_error(y_true['ram_usage_node'], y_pred['ram_usage_node'])

  # Rescale each MSE by the group weights, then average the two targets
  cpu_term = w_maj_cpu * mse_cpu * total / (w_maj_cpu * n_maj_cpu + w_min_cpu * n_min_cpu)
  ram_term = w_maj_ram * mse_ram * total / (w_maj_ram * n_maj_ram + w_min_ram * n_min_ram)

  return (cpu_term + ram_term) / 2

In [None]:
# Function used to calculate metrics based on the task
def metrics(task_type, y_test, y_pred):
  """Print the evaluation metrics that belong to the given kind of task.

  Args:
    task_type: one of 'regression', 'mo-classification' or 'classification'.
    y_test: ground-truth targets.
    y_pred: model predictions for the same samples.

  Returns:
    ``(mse, r2)`` for 'regression'; ``None`` for the two classification tasks
    (their metrics are only printed).

  Raises:
    ValueError: if ``task_type`` is not one of the supported values. An
      unrecognised task used to be ignored silently, which hid typos.
  """
  if task_type == 'regression':
    mse = mean_squared_error(y_test, y_pred)
    print("mse:", mse)

    r2 = r2_score(y_test, y_pred)
    print("R-squared score:", r2)
    return mse, r2

  if task_type in ('mo-classification', 'classification'):
    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)

    # Accuracy is only reported for the multi-output task
    if task_type == 'mo-classification':
      accuracy = accuracy_score(y_test, y_pred)
      print('Accuracy: {}'.format(accuracy))
    return None

  raise ValueError(
    "Unknown task_type {!r}: expected 'regression', 'mo-classification' "
    "or 'classification'".format(task_type)
  )

In [None]:
# Function used to plot the regression lines for the 2 targets
def plot_regression(y_test, y_pred):
  """Plot predicted against observed values for the CPU and RAM targets.

  For each target the function draws the (observed, predicted) points and the
  degree-1 polynomial fitted to them with ``np.polyfit``, all on the current
  matplotlib figure, then shows the figure.

  Args:
    y_test: observed values, indexable by 'cpu_usage_node' and 'ram_usage_node'.
    y_pred: predicted values, indexable by the same two names.
  """
  # Observed and predicted values of each target
  cpu_true, ram_true = y_test['cpu_usage_node'], y_test['ram_usage_node']
  cpu_hat, ram_hat = y_pred['cpu_usage_node'], y_pred['ram_usage_node']

  # Slope and intercept of the least-squares line of each target
  cpu_slope, cpu_intercept = np.polyfit(cpu_true, cpu_hat, 1)
  ram_slope, ram_intercept = np.polyfit(ram_true, ram_hat, 1)

  # CPU: points and fitted line
  plt.plot(cpu_true, cpu_hat, 'o', color='red', fillstyle='none', label='Utilizzo CPU')
  plt.plot(cpu_true, cpu_slope*cpu_true + cpu_intercept, linestyle='--', label='Regressione uso CPU')

  # RAM: points and fitted line
  plt.plot(ram_true, ram_hat, '+', label='Utilizzo RAM')
  plt.plot(ram_true, ram_slope*ram_true + ram_intercept, color='black', linestyle='-', label='Regressione uso RAM')

  plt.xlabel('Valori osservati')
  plt.ylabel('Valori predetti')
  plt.legend()
  plt.show()

In [None]:
# Function used to plot the confusion matrix
def plot_confusion_matrix(y_test, y_pred, target):
  """Plot the confusion matrix of one target as an annotated heatmap.

  Args:
    y_test: observed labels, indexable by ``target``.
    y_pred: predicted labels, indexable by ``target``.
    target: name of the target whose confusion matrix is plotted.
  """
  # Calculate the confusion matrix
  cm = confusion_matrix(y_test[target], y_pred[target])

  # confusion_matrix puts the observed labels on the rows and the predicted
  # labels on the columns; the heatmap draws rows along the y axis and columns
  # along the x axis, so x is "predicted" and y is "observed".
  sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
  plt.xlabel('Valori predetti')
  plt.ylabel('Valori osservati')
  plt.title('Confusion matrix')
  plt.show()

#### Retrieving dataset

In [None]:
# Retrieve all CSV files in the output folder. os.listdir returns the entries
# in arbitrary order, so they are sorted to keep the row order of the combined
# dataframe (and therefore every seeded split below) reproducible.
file_csv = sorted(file for file in os.listdir('output') if file.endswith('.csv'))

# Create the dataframe by concatenating all read files
dataframes = []
for file in file_csv:
    file_path = os.path.join('output', file)
    dataframes.append(pd.read_csv(file_path))
# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating the row numbers of every single file.
df = pd.concat(dataframes, ignore_index=True)

# Remove the columns in the dataframe that begin with "function_"
to_drop = [col for col in df if col.startswith('function_')]
df = df.drop(columns=to_drop)
df = fill_NaN(df)

#### Regression Task

CPU and RAM as targets

In [None]:
# Columns ending in "_usage_node" are the targets; every other column is a feature
targets = list(filter(lambda name: name.endswith('_usage_node'), df))
params = list(filter(lambda name: not name.endswith('_usage_node'), df))
X, y = df[params], df[targets]

print(X.shape, y.shape)

In [None]:
# Preprocessing
X = fill_NaN(X)
y_targets = y[['cpu_usage_node', 'ram_usage_node']]

# Split BEFORE scaling: fitting the scalers on the whole dataset would let the
# minimum/maximum of the test rows leak into the training data.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = split_dataset(X, y_targets)

# The scalers learn their ranges from the training partition only
x_scaler = MinMaxScaler().fit(X_train_raw)
y_scaler = MinMaxScaler().fit(y_train_raw)

X_train, X_test = x_scaler.transform(X_train_raw), x_scaler.transform(X_test_raw)
y_train, y_test = y_scaler.transform(y_train_raw), y_scaler.transform(y_test_raw)

# Full scaled matrices, kept under their previous names
X_scaled, y_scaled = x_scaler.transform(X), y_scaler.transform(y_targets)

In [None]:
# Build and fit the multi-output Gradient Boosting regressor
gb = MultiOutputRegressor(
    GradientBoostingRegressor(n_estimators=10, learning_rate=0.1, random_state=42)
).fit(X_train, y_train)

# Predict on the test set; predictions and ground truth are wrapped in
# dataframes so that each target can be addressed by its column name
y_pred_gb = pd.DataFrame(gb.predict(X_test), columns=['cpu_usage_node', 'ram_usage_node'])
y_test = pd.DataFrame(y_test, columns=['cpu_usage_node', 'ram_usage_node'])

metrics('regression', y_test, y_pred_gb)

plot_regression(y_test, y_pred_gb)

In [None]:
# Creating the Random Forest model
rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=10, random_state=42))
rf.fit(X_train, y_train)

# Performance evaluation of the model on the test set
y_pred_rf = rf.predict(X_test)
y_pred_rf = pd.DataFrame(y_pred_rf, columns=['cpu_usage_node', 'ram_usage_node'])

# plot_regression addresses y_test by column name. Wrap it here as well so
# that this cell does not depend on the Gradient Boosting cell having been
# run first; wrapping an already labelled dataframe leaves it unchanged.
y_test = pd.DataFrame(y_test, columns=['cpu_usage_node', 'ram_usage_node'])

metrics('regression', y_test, y_pred_rf)

plot_regression(y_test, y_pred_rf)

In [None]:
# Creating the Neural Network model

# metrics/plot_regression address the test targets by column name. Wrap y_test
# here so that this cell does not depend on an earlier model cell having done
# it; wrapping an already labelled dataframe leaves it unchanged.
y_test = pd.DataFrame(y_test, columns=['cpu_usage_node', 'ram_usage_node'])
X_train_nn, X_test_nn, y_train_nn, y_test_nn = convert_dataset(X_train, X_test, y_train, y_test)

# Early stopping must not look at the test set, otherwise the test score is
# optimistically biased: hold out part of the training data for validation.
X_fit_nn, X_val_nn, y_fit_nn, y_val_nn = train_test_split(
    X_train_nn, y_train_nn, test_size=0.2, random_state=42
)

nn = Sequential()
nn.add(Dense(64, input_dim=X_train_nn.shape[1], activation='relu'))
nn.add(Dense(32, activation='relu'))
nn.add(Dropout(0.2))
nn.add(Dense(16, activation='relu'))
nn.add(Dropout(0.2))
# Two linear outputs: one per regression target
nn.add(Dense(2, activation='linear'))

nn.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

early_stop = EarlyStopping(monitor='val_loss', patience=10)

nn.fit(X_fit_nn, y_fit_nn, batch_size=32, epochs=100, validation_data=(X_val_nn, y_val_nn), callbacks=[early_stop])

# Performance evaluation of the model on the (untouched) test set
y_pred_nn = nn.predict(X_test_nn)
y_pred_nn = pd.DataFrame(y_pred_nn, columns=['cpu_usage_node', 'ram_usage_node'])

metrics('regression', y_test_nn, y_pred_nn)

plot_regression(y_test_nn, y_pred_nn)

#### Multi-output Classification Task

Discretized CPU and RAM usage as targets

In [None]:
# Columns ending in "_usage_node" are the targets; every other column is a feature
targets = list(filter(lambda name: name.endswith('_usage_node'), df))
params = list(filter(lambda name: not name.endswith('_usage_node'), df))
X, y = df[params], df[targets]

print(X.shape, y.shape)

In [None]:
# Preprocessing
X = fill_NaN(X)

# Rescale the two targets to [0, 1] before binning them. This only defines the
# class labels; it is not a model input.
y_scaled = MinMaxScaler().fit_transform(y[['cpu_usage_node', 'ram_usage_node']])
y_scaled = pd.DataFrame(y_scaled, columns=['cpu_usage_node', 'ram_usage_node'])

# Discretization of targets: 3 equal-width classes for CPU, 6 for RAM
cpu_classes = pd.cut(y_scaled['cpu_usage_node'], 3).cat.codes
ram_classes = pd.cut(y_scaled['ram_usage_node'], 6).cat.codes

# Transformation of target classes into binary indicator columns. The explicit
# integer dtype keeps them 0/1 whatever the default dtype of the installed
# pandas version is.
y_scaled_cpu_dummies = pd.get_dummies(cpu_classes, prefix='cpu_usage_node', dtype=int)
y_scaled_ram_dummies = pd.get_dummies(ram_classes, prefix='ram_usage_node', dtype=int)

# Only the indicator columns are kept as targets
y_scaled = pd.concat([y_scaled_cpu_dummies, y_scaled_ram_dummies], axis=1)

# Split BEFORE scaling the features: fitting the scaler on the whole dataset
# would let the minimum/maximum of the test rows leak into the training data.
X_train, X_test, y_train, y_test = split_dataset(X, y_scaled)

x_scaler = MinMaxScaler().fit(X_train)
X_train = x_scaler.transform(X_train)
X_test = x_scaler.transform(X_test)

# Full scaled feature matrix, kept under its previous name
X_scaled = x_scaler.transform(X)

In [None]:
# Build and fit the multi-output Gradient Boosting classifier
gb = MultiOutputClassifier(
    GradientBoostingClassifier(n_estimators=10, learning_rate=0.1, random_state=42)
).fit(X_train, y_train)

# Predict the indicator columns of the test set and report the metrics
y_pred_gb = gb.predict(X_test)

metrics('mo-classification', y_test, y_pred_gb)

In [None]:
# Build and fit the multi-output Random Forest classifier
rf = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=10, random_state=42)
).fit(X_train, y_train)

# Predict the indicator columns of the test set and report the metrics
y_pred_rf = rf.predict(X_test)

metrics('mo-classification', y_test, y_pred_rf)

In [None]:
# Creating the Neural Network model

# Early stopping must not look at the test set, otherwise the test score is
# optimistically biased: hold out part of the training data for validation.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
# Plain float arrays for Keras, whatever dtype the indicator columns have
y_fit = np.asarray(y_fit, dtype=np.float32)
y_val = np.asarray(y_val, dtype=np.float32)

nn = Sequential()
nn.add(Dense(64, input_shape=(X_train.shape[1],), activation='relu'))
nn.add(Dropout(0.2))
nn.add(Dense(32, activation='relu'))
nn.add(Dropout(0.2))
nn.add(Dense(16, activation='relu'))
# One sigmoid output per indicator column of the targets
nn.add(Dense(y_train.shape[1], activation='sigmoid'))

nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_loss', patience=10)

nn.fit(X_fit, y_fit, epochs=100, batch_size=64, validation_data=(X_val, y_val), callbacks=[early_stop])

# Threshold the predicted probabilities at 0.5 to obtain 0/1 indicators
y_pred_nn = nn.predict(X_test)
y_pred_nn = (y_pred_nn > 0.5).astype(int)

metrics('mo-classification', y_test, y_pred_nn)

### Classification Task

Overloaded Node as target

In [None]:
# Dataframe split by features and output
targets = [col for col in df if 'overloaded_node' in col]
params = [col for col in df if not 'overloaded_node' in col]

X = df[params]
y = df[targets]
y = y['overloaded_node'].values.ravel()
y = y.astype('int')

# Oversampling
sm = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)
X, y = sm.fit_resample(X, y)
Counter(y)

print(X.shape, y.shape)

In [None]:
# Preprocessing
X = fill_NaN(X)
X_scaled = MinMaxScaler().fit_transform(X, y)
X_train, X_test, y_train, y_test = split_dataset(X_scaled, y)

In [None]:
# Creating the Gradient Boosting model
gb = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)

metrics('classification', y_test, y_pred_gb)

# confusion_matrix puts the observed labels on the rows (heatmap y axis) and
# the predicted labels on the columns (heatmap x axis)
cm = confusion_matrix(y_test, y_pred_gb)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Valori predetti')
plt.ylabel('Valori osservati')
plt.title('Confusion matrix')
plt.show()

In [None]:
# Creating the Random Forest model
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

metrics('classification', y_test, y_pred_rf)

# confusion_matrix puts the observed labels on the rows (heatmap y axis) and
# the predicted labels on the columns (heatmap x axis)
cm = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Valori predetti')
plt.ylabel('Valori osservati')
plt.title('Confusion matrix')
plt.show()

In [None]:
# Creating the Neural Network model

# Early stopping must not look at the test set, otherwise the test score is
# optimistically biased: hold out a stratified part of the training data for
# validation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

nn = Sequential()
nn.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
nn.add(Dropout(0.2))
nn.add(Dense(32, activation='relu'))
nn.add(Dropout(0.2))
nn.add(Dense(16, activation='relu'))
# Single sigmoid output for the binary target
nn.add(Dense(1, activation="sigmoid"))
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stop = EarlyStopping(monitor='val_accuracy', patience=10)

nn.fit(X_fit, y_fit, epochs=100, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stop])
nn.summary()

# Threshold the predicted probabilities at 0.5 and flatten the (n, 1) output
# to a 1-D vector of labels
y_pred_nn = nn.predict(X_test)
y_pred_nn = (y_pred_nn > 0.5).astype(int).ravel()

metrics('classification', y_test, y_pred_nn)

# confusion_matrix puts the observed labels on the rows (heatmap y axis) and
# the predicted labels on the columns (heatmap x axis)
cm = confusion_matrix(y_test, y_pred_nn)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
plt.xlabel('Valori predetti')
plt.ylabel('Valori osservati')
plt.title('Confusion matrix')
plt.show()