Use this notebook to Train & Tune supervised and unsupervised algorithms on Training Set, and to perform prediction.

NB: save the 1-D array of predictions in .npy files named as follows:


*  Supervised Learning with classifier "clf": **{clf_name}_test_prediction_supervised.npy**
*  Unsupervised Learning with k-means: **labels_test_prediction_k={k}_unsupervised.npy**



#Import Libraries and Functions

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import numpy as np
from collections import defaultdict

# Supervised Learning
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier

# Clustering
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples, completeness_score, homogeneity_score


# PREPROCESSING

def load_dataset(filename):
    """Load a pickled dataset and report how many rows it contains.

    :param filename: path of the pickle file, handed to ``pd.read_pickle``
    :return: the object stored in the pickle
    :raises SystemExit: with code -1 when the file cannot be loaded (the
        underlying error is printed first and chained as the cause)
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    try:
        dataset = pd.read_pickle(filename)
        print(f"Dataset Loaded with {len(dataset)} rows!")
    except Exception as e:
        print(f"Could not load the dataset, exiting! Error: {e}")
        # Raise SystemExit explicitly rather than calling the interactive
        # ``exit`` helper: that helper is only meant for interactive shells and
        # does not reliably stop execution when called from notebook code.
        raise SystemExit(-1) from e
    return dataset


def split_and_save_dataset(dataset, extract_rate, filenames):
    """Carve a random testing subset out of ``dataset`` and pickle both parts.

    :param dataset: DataFrame to split
    :param extract_rate: fraction of rows sampled (seed 42) into the testing part
    :param filenames: mapping with the keys ``'testing'`` and ``'training'``
        giving the output pickle paths
    :return: None; the two parts are written to disk with fresh 0..n-1 indices
    """
    # Rows drawn for testing; everything that was not drawn stays for training
    test_part = dataset.sample(frac=extract_rate, random_state=42)
    train_part = dataset.drop(test_part.index).reset_index(drop=True)
    test_part = test_part.reset_index(drop=True)

    # Persist both parts (testing first, then training)
    for split_name, frame in (("testing", test_part), ("training", train_part)):
        frame.to_pickle(filenames[split_name])

    print(f"Training dataset saved with {len(train_part)} rows!")
    print(f"Testing dataset saved with {len(test_part)} rows!")


def plot_kpi(imsi_data, kpi_column, selected_imsi):
    """Draw one KPI against elapsed time as a stem plot.

    :param imsi_data: DataFrame with a datetime-like ``'Timestamp'`` column and
        the column named by ``kpi_column``. An ``'Adjusted_Timestamp'`` column
        (seconds since the earliest timestamp) is added to it in place.
    :param kpi_column: name of the column to plot on the y axis
    :param selected_imsi: identifier shown in the figure title
    :return: None; a new matplotlib figure is created but not shown
    """
    # Shift 'Timestamp' so the time axis starts at 0 (smallest value is the reference)
    imsi_data['Adjusted_Timestamp'] = (imsi_data['Timestamp'] - imsi_data['Timestamp'].min()).dt.total_seconds()

    plt.figure(figsize=(10, 6))
    # ``use_line_collection`` is no longer passed: the keyword was removed from
    # ``stem`` in recent Matplotlib releases and raises TypeError there.
    plt.stem(imsi_data['Adjusted_Timestamp'], imsi_data[kpi_column])
    plt.xlabel('Time [s]')  # Time in seconds
    plt.ylabel(kpi_column)
    plt.title(f'{kpi_column} Over Time for IMSI: {selected_imsi}')
    plt.grid(True)
    plt.tight_layout()


def plot_correlation_matrix(imsi_data):
    """Show the pairwise correlation of the KPI columns as an annotated heatmap.

    :param imsi_data: DataFrame to analyse; the ``Timestamp``, ``IMSI`` and
        ``slice_id`` columns are left out when present
    :return: None; a new matplotlib figure is created but not shown
    """
    excluded = ["Timestamp", "IMSI", "slice_id"]
    # errors='ignore' tolerates frames that lack some of the excluded columns
    corr_matrix = imsi_data.drop(columns=excluded, errors='ignore').corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title("Correlation Matrix for Selected IMSI")
    plt.tight_layout()

# TRAINING AND PREDICTION


def normalize_dataset(X_train, X_test):
    """Standardise two datasets with the statistics of the first one.

    :param X_train: data used to compute the mean and standard deviation
    :param X_test: data normalised with the statistics of ``X_train``
    :return: ``(X_train_norm, X_test_norm, [mean, std])`` where ``mean`` and
        ``std`` are the unmodified statistics computed on ``X_train``
    """
    mean_x, std_x = X_train.mean(), X_train.std()

    # A constant feature has zero standard deviation; dividing by it would
    # produce NaN/inf values, so such features are only centred (scale 1).
    if isinstance(std_x, pd.Series):
        scale = std_x.where(std_x != 0, 1.0)
    else:
        scale = np.where(np.asarray(std_x) == 0, 1.0, std_x)

    X_train_norm = (X_train - mean_x) / scale
    X_test_norm = (X_test - mean_x) / scale

    stats_x = [mean_x, std_x]

    return X_train_norm, X_test_norm, stats_x

# supervised learning

def grid_search(classifier, parameters, train, ground_truth, pred_input, cross_val=3):
    """Tune ``classifier`` with a cross-validated grid search and predict.

    :param classifier: estimator to tune
    :param parameters: hyper-parameter grid handed to ``GridSearchCV``
    :param train: training features
    :param ground_truth: training labels
    :param pred_input: features to predict on once the search is fitted
    :param cross_val: value passed as ``cv`` to ``GridSearchCV``
    :return: ``(gscv, y_pred)``, the fitted search object and its predictions
        for ``pred_input``
    """
    # Candidates are ranked by accuracy; 12 parallel jobs, verbose progress output
    search_options = dict(cv=cross_val, n_jobs=12, return_train_score=False,
                          verbose=5, scoring='accuracy')

    gscv = GridSearchCV(classifier, parameters, **search_options)
    gscv.fit(train, ground_truth)

    return gscv, gscv.predict(pred_input)


def get_bestpar_list(bestpar_df):
    """Group the best hyper-parameter values of every row by parameter name.

    :param bestpar_df: DataFrame whose ``'bestpars'`` column holds one dict of
        ``{parameter name: value}`` per row
    :return: ``defaultdict(list)`` mapping each parameter name to the list of
        values found for it, in row order
    """
    dd = defaultdict(list)
    for bp in bestpar_df['bestpars']:
        for key, value in bp.items():
            # Append instead of assigning, otherwise only the last row's value
            # would be kept and the list-valued default would never be used.
            dd[key].append(value)

    return dd

def plot_metric_supervised(perf, metric, labels, colors=None):
    """Save a bar chart comparing one validation metric across classifiers.

    :param perf: bar heights (one metric value per label)
    :param metric: metric name, used as y-axis label and in the output filename
    :param labels: bar labels
    :param colors: optional bar colors
    :return: None; the figure is written to ``{metric}_validation.png``
    """
    _, axis = plt.subplots(figsize=(16, 10))
    axis.bar(labels, perf, color=colors)
    axis.grid(True)
    axis.set_ylabel(metric)
    plt.savefig(f'{metric}_validation.png')


def print_performance_supervised(clf_name, acc, f1, y_test, output):
    """Print the test performance of a classifier and save its confusion matrix.

    :param clf_name: classifier name, used in the printout and the PNG filename
    :param acc: accuracy value to print
    :param f1: F1 value to print
    :param y_test: ground-truth labels
    :param output: predicted labels
    :return: None; the row-normalised confusion matrix is saved to
        ``{clf_name}_confusion.png``
    """
    print(20 * '*')
    print(f'Performance for classifier: {clf_name}')
    print(f'Accuracy --> {acc}')
    print(f'F1 Score --> {f1}')

    # Use every label seen in either the ground truth or the predictions, so a
    # class that is never predicted still gets its row in the matrix.
    classes = np.union1d(y_test, output)
    print("Classification report: \n", (classification_report(y_test, output)))
    cm = confusion_matrix(y_test, output, labels=classes, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=classes)
    disp.plot()
    plt.savefig(f'{clf_name}_confusion.png')
    print(20 * '*')

# unsupervised learning (clustering)


def sampling_silhouette(test_data, pred_labels, ns, runs=200):
    '''
    Estimate the silhouette score by averaging it over random subsamples.

    :param test_data: input for clustering (N samples x M Features), indexable with ``.iloc``
    :param pred_labels: output of clustering applied on test_data (N samples)
    :param ns: number of samples to take from test_data and pred_labels to make an estimate of
        silhouette score; capped at N so that small inputs do not fail
    :param runs: number of iterations over which to average
    :return: the average silhouette score over the number of runs
    '''

    # Positional access on the labels, whatever array-like type was passed in
    labels = np.asarray(pred_labels)

    # The population size does not change between runs: compute it once
    # instead of materialising a re-indexed copy of the data on every iteration.
    n_total = len(test_data)
    ns = min(ns, n_total)

    sscores = []

    for _ in range(runs):

        # Draw ns distinct row positions
        idx = np.random.choice(n_total, ns, replace=False)

        x = test_data.iloc[idx]
        l = labels[idx]

        sscores.append(silhouette_score(x, l))

    return np.mean(sscores)

def kmeans_silhouette(data, min_cl, max_cl, initialization="k-means++", estimated=False,
                      random_state=None):

    '''
    Fit k-means for every k in [min_cl, max_cl) and pick the k with the best silhouette score.

    :param data: input for clustering (N samples x M Features)
    :param min_cl: minimum number of clusters to test (inclusive)
    :param max_cl: upper bound of the number of clusters to test (exclusive)
    :param initialization: centroids initialization for k means algorithm: 'k-means++', 'random',
        or a pair ``(array, n_columns)`` whose first k rows and first n_columns columns are used
        as initial centroids. Check scikit-learn ref for more info.
    :param estimated: Boolean, whether to opt or not for an estimated version of silhouette score
    :param random_state: seed forwarded to ``KMeans``; the default ``None`` keeps runs unseeded
    :return: number of clusters that maximizes the silhouette score
    :raises ValueError: if the range [min_cl, max_cl) is empty
    '''

    if min_cl >= max_cl:
        # Fail before any clustering is run instead of on the final argmax
        raise ValueError(f'No k to evaluate: min_cl={min_cl} must be smaller than max_cl={max_cl}')

    print(f'SILHOUETTE EVALUATION')

    # Whether explicit centroids were supplied does not depend on k
    custom_init = initialization not in ['k-means++', 'random']

    silhouette_avg = []
    for num_clusters in range(min_cl, max_cl):

        print(f'Clustering for k={num_clusters}...')

        if custom_init:
            init = initialization[0][:num_clusters,:initialization[1]] #select first k components as initializing centroids
        else:
            init = initialization

        kmeans = KMeans(init=init, n_clusters=num_clusters, n_init='auto',
                        random_state=random_state)
        kmeans.fit(data)

        if not estimated:
            score = silhouette_score(data, kmeans.labels_)
        else:
            # Never ask for more samples than there are rows
            score = sampling_silhouette(data, kmeans.labels_, ns=min(1000, len(data)))
        print(f'---- Score for k={num_clusters} --> {score}')
        silhouette_avg.append(score)

        print(f'DONE')

    # Position in the score list -> corresponding value of k
    best_k  = np.argmax(silhouette_avg)+min_cl

    plt.figure(figsize=(12, 8))
    plt.plot(np.arange(min_cl, max_cl), silhouette_avg, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel('Silhouette score')
    plt.title('Silhouette Analysis for Optimal k')
    _ = plt.xticks(np.arange(min_cl, max_cl))
    plt.grid(True)
    plt.savefig(f'Silhouette_Scores.png')
    plt.show(block=False)

    return best_k

def kmeans_helbow(data, max_cl, initialization="k-means++", random_state=None):

    '''
    Fit k-means for every k in [1, max_cl) and plot the inertia of each fit (elbow method).

    :param data: input for clustering (N samples x M Features)
    :param max_cl: upper bound of the number of clusters to test (exclusive)
    :param initialization: centroids initialization for k means algorithm: 'k-means++', 'random',
        or a pair ``(array, n_columns)`` whose first k rows and first n_columns columns are used
        as initial centroids. Check scikit-learn ref for more info.
    :param random_state: seed forwarded to ``KMeans``; the default ``None`` keeps runs unseeded
    :return: the matplotlib axes holding the elbow plot (also saved to ``Elbow_Rule.png``)
    '''

    print(f'HELBOW RULE')

    # Whether explicit centroids were supplied does not depend on k
    custom_init = initialization not in ['k-means++', 'random']

    distortions = []
    K = range(1, max_cl)
    for k in K:
        print(f'Clustering for k={k}...')

        if custom_init:
            init = initialization[0][:k,:initialization[1]] #select first k components as initializing centroids
        else:
            init = initialization

        kmeanModel = KMeans(init=init, n_clusters=k, n_init='auto',
                            random_state=random_state)
        kmeanModel.fit(data)
        distortions.append(kmeanModel.inertia_)
        print(f'DONE')

    fig, ax = plt.subplots(1, 1)
    ax.plot(K, distortions, 'bx-')
    plt.xlabel(r'k')
    plt.ylabel(r'Inertia')
    plt.title('Elbow Method for Optimal k')
    ax.grid(True)
    plt.savefig(f'Elbow_Rule.png')
    plt.show(block=False)

    return ax

def compute_unsupervised_performance(k, gt, test_data, pred_labels, ns):
    '''
    Print the clustering quality metrics of a k-means labelling.

    :param k: number of clusters, only used in the printout
    :param gt: ground-truth labels
    :param test_data: clustered data (N samples x M Features)
    :param pred_labels: cluster labels assigned to test_data (N samples)
    :param ns: subsample size used for the estimated silhouette score
    :return: None; the scores are printed
    '''

    # All scores are computed before anything is printed
    sscore = sampling_silhouette(test_data, pred_labels, ns=ns)
    report = [
        f'Performance for clustering: k-means with k={k}',
        f'Estimated Silhouette Score --> {sscore}',
        f'Completeness --> {completeness_score(gt, pred_labels)}',
        f'Homogeneity --> {homogeneity_score(gt, pred_labels)}',
    ]

    frame = 20 * '#'
    print(frame)
    for line in report:
        print(line)
    print(frame)

#Load and Preprocess/Visualize Train Data

In [None]:
# Column headers of the dataset, in order
columns_list = [
    # identifiers and slice configuration
    "Timestamp",
    "IMSI",
    "slice_id",
    "slice_prb",
    "scheduling_policy",
    # columns named as downlink (dl_* / tx_*)
    "dl_mcs",
    "dl_n_samples",
    "dl_buffer [bytes]",
    "tx_brate downlink [Mbps]",
    "tx_pkts downlink",
    "dl_cqi",
    # columns named as uplink (ul_* / rx_*)
    "ul_mcs",
    "ul_n_samples",
    "ul_buffer [bytes]",
    "rx_brate uplink [Mbps]",
    "rx_pkts uplink",
    "rx_errors uplink (%)",
    "ul_sinr",
    # PRB totals
    "sum_requested_prbs",
    "sum_granted_prbs",
]

# Pickle file used for each split
dataset_filenames = dict(
    training="dataset_restart_training.pkl",
    testing="dataset_restart_testing.pkl",
)

# Seed shared by the random splits and the seeded estimators
rs = 42

In [None]:
 # Configurations
# Pick which of the two pickles listed in dataset_filenames is loaded.
# NOTE(review): this cell loads the "testing" file although the section is titled
# "Train Data"; switch the key to "training" to work on the training split.
dataset_filename = dataset_filenames["testing"] # use "training"

# Load dataset
dataset = load_dataset(dataset_filename)

# Ensure the Timestamp column is in datetime format
# (errors='coerce' converts unparseable values to NaT instead of raising)
dataset['Timestamp'] = pd.to_datetime(dataset['Timestamp'], errors='coerce')

Dataset Loaded with 1565185 rows!


In [None]:
# IMSI whose rows are selected for the per-IMSI plots
selected_imsi = 1010123456004  # Example IMSI, update as needed
# KPI drawn by plot_kpi; it is used as a column name of the dataset
selected_kpi = 'tx_pkts downlink'  # Example KPI, update as needed

In [None]:
# Plotting

# Filter the data for the selected IMSI. An explicit copy is taken because
# plot_kpi adds an 'Adjusted_Timestamp' column to the frame it receives, and
# writing to a filtered selection of `dataset` triggers SettingWithCopyWarning.
imsi_data = dataset[dataset['IMSI'] == selected_imsi].copy()

# Plot the KPI over time
plot_kpi(imsi_data, selected_kpi, selected_imsi)

# Plot the correlation matrix
plot_correlation_matrix(imsi_data)

In [None]:
# Prepare data for training, validation and test evaluation.
# Features: every column except Timestamp, IMSI and the label; label: 'slice_id'.
# 20% of the rows are held out as the test split.
X, X_test, y, y_test = train_test_split(
    dataset.drop(columns=["Timestamp", "IMSI", "slice_id"]),
    dataset['slice_id'],
    test_size=0.2,
    random_state=rs,
)

# 20% of the remaining rows become the validation split
# (a test_size of 0.25 here would give a 60/20/20 split overall).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=rs,
)

# Validation data is scaled with the statistics of X_train, test data with those of X
X_train_norm, X_val_norm, stats_val = normalize_dataset(X_train, X_val)
X_norm, X_test_norm, stats_test = normalize_dataset(X, X_test)

#Supervised Learning

In [None]:
# Select Classifiers
# The keys are display names: they index the hyper-parameter grids and are
# embedded in the names of the files written by the training loop.
# NOTE(review): the "Linear Regression" entry is a RidgeClassifier; the key is
# kept as-is because the output filenames are derived from it.

classifiers = {
    "Linear Regression": RidgeClassifier(solver='svd'),
    "Random Forest": RandomForestClassifier(random_state=rs)
}

In [None]:
# Hyper-parameter grids, keyed by classifier display name
ridge_alphas = list(np.logspace(-2, 2, num=10))  # 10 log-spaced values from 1e-2 to 1e2
params = {
    'Linear Regression': {'alpha': ridge_alphas},
    'Random Forest': {'n_estimators': [50, 200]},
}

# Validation bookkeeping: one (initially empty) results table per classifier
perf_cols = ['f1', 'accuracy', 'gscv', 'bestpars']
perf_avg = {
    clf_label: pd.DataFrame(columns=perf_cols)
    for clf_label in ("Random Forest", 'Linear Regression')
}

In [None]:
# Accumulators filled by the training loop (one entry per classifier)
f1_list, acc_list = [], []
# Bar colors for the comparison plots
col = ['blue', 'red', 'green']
# Number of folds of the inner cross-validation
cross_val_k = 3

In [None]:
# For every classifier: tune its hyper-parameters on the train/validation
# split, record the validation scores, then refit on the full training data
# with the best setting and store the test-set predictions.
for i, (clf_name, clf) in enumerate(classifiers.items()):

    print(10 * '-')

    # Validation

    print(f'Start GS procedure for {clf_name}...')

    gscv, y_pred = grid_search(clf, params[clf_name],
                                X_train_norm, y_train, X_val_norm,
                                cross_val=cross_val_k)

    print('... completed.')

    # NOTE(review): with average='micro' on single-label predictions the F1
    # score coincides with accuracy; confirm a macro/weighted average is not
    # what was intended.
    f1 = f1_score (y_val, y_pred, average='micro')
    acc = accuracy_score(y_val,y_pred)
    bestpar = gscv.best_params_

    print(f'Best HP Set for {clf_name}: {bestpar}')

    # Update Validation Performance (one new row per execution of this loop)
    vp = np.asarray([f1, acc, gscv, bestpar]).reshape(1, -1)
    perf_avg[clf_name] = pd.concat([perf_avg[clf_name],
                                    pd.DataFrame(columns=perf_cols, data=vp)])

    # Save performance of cross-validation procedure

    perf_avg[clf_name].to_pickle(f'{clf_name}_validation_supervised.pkl')

    # Record the scores of THIS run. Reading row 0 of the accumulated table
    # would return the first run's scores again whenever the cell is re-executed.
    f1_list.append(f1)
    acc_list.append(acc)


    # Testing

    print(f'First train the classifier with the optimized set of HP values...')
    # set_params updates the estimator stored in `classifiers` in place
    clf_tuned = classifiers[clf_name].set_params(**bestpar)
    clf_tuned.fit(X_norm, y)

    print(f'... and secondly perform prediction on test set...')
    output = clf_tuned.predict(X_test_norm)

    with open(f'{clf_name}_test_prediction_supervised.npy', 'wb') as f:
        np.save(f, output)

    print('... Done!')

----------
Start GS procedure for Linear Regression...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
... completed.
Best HP Set for Linear Regression: {'alpha': 12.915496650148826}
First train the classifier with the optimized set of HP values...
... and secondly perform prediction on test set...
... Done!
----------
Start GS procedure for Random Forest...
Fitting 3 folds for each of 2 candidates, totalling 6 fits




KeyboardInterrupt: 

#Unsupervised Learning

In [None]:
# NOTE(review): the clustering cell is gated on `k_opt`, not on `k`; confirm
# which of the two variables is meant to be edited.
k = None # Set to None for tuning of k, otherwise to int greater than 1 to perform clustering.

In [None]:
# Use the elbow rule to inspect candidate values of k
# min_cl_km / max_cl_km bound the numbers of clusters explored while tuning;
# only max_cl_km is passed to the elbow helper, min_cl_km is used by the
# silhouette cell.
min_cl_km = 2
max_cl_km = 8
kmeans_helbow(X_norm, max_cl_km, initialization="k-means++")

In [None]:
# Check how the silhouette score varies with k.
# estimated=True selects the sampled estimate of the score instead of the
# score computed on the whole dataset.
k_silhouette = kmeans_silhouette(X_norm, min_cl_km, max_cl_km,
                                 initialization = 'k-means++', estimated=True)

print(f"Best K Silhouette: {k_silhouette}") # k with the best silhouette coefficient

In [None]:
# Number of clusters for the final k-means fit. The clustering cell runs only
# when this is not None, so set it (e.g. to the k chosen above) to produce the
# test-set cluster labels.
k_opt = None

In [None]:
if k_opt is not None:

    # Once k is tuned, use it to perform clustering and generate labels on the Test Set.
    # The model is seeded with the notebook-wide seed `rs` so that the saved
    # cluster labels are reproducible between runs.
    kmeans_model = KMeans(n_clusters=k_opt, init="k-means++", n_init='auto',
                          random_state=rs)
    kmeans_model.fit(X_norm)
    cluster_labels = kmeans_model.predict(X_test_norm)

    with open(f'labels_test_prediction_k={k_opt}_unsupervised.npy', 'wb') as f:
        np.save(f, cluster_labels)

#Unsupervised Learning: Explore Principal Component Analysis

In [None]:
### PCA on the normalised training features, to inspect clusters in the PC plane

# Largest number of components scikit-learn's PCA accepts: min(n_samples, n_features)
n = min(X_norm.shape)

# Fit the PCA and project the data onto all n principal components
pca = PCA(n_components=n, random_state=42)
X_norm_pca = pd.DataFrame(pca.fit_transform(X_norm))


In [None]:
# The fraction of variance explained by each principal component
exp_var = pca.explained_variance_ratio_

# Visualize the cumulative explained variance against the number of components.
# The x range is derived from exp_var itself: the PCA keeps
# min(n_samples, n_features) components, which need not equal the feature count.
plt.figure(figsize=(10, 10))
plt.plot(range(1, len(exp_var) + 1), exp_var.cumsum(),
          marker='o', linestyle='--')
plt.title("Explained Variances by Components")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.grid(True)
plt.show(block=False)

In [None]:
# Find the least number of components that explain more than xvar% of the variance
xvar = 90

# Running total kept in its own name: calling it `sum` would shadow the
# builtin for every cell executed afterwards.
cumulative_var = 0.0
for ix, ratio in enumerate(exp_var):
    cumulative_var += ratio
    if 100 * cumulative_var > xvar:
        print(f"Number of PCs that explain at least {xvar}% variance: ", ix + 1)
        nx = ix + 1
        break

In [None]:
# Keep only the first nx principal components
X_norm_pca_nx = X_norm_pca.iloc[:, :nx]

fig, ax = plt.subplots(1, 1, figsize=(10, 6))

# Scatter of the data in the plane of the first two components
pc1, pc2 = X_norm_pca_nx.iloc[:, 0], X_norm_pca_nx.iloc[:, 1]
ax.scatter(pc1, pc2, marker='o', s=4)

# Same fixed ticks on both axes
ticks = [-10, -5, 0, 5, 10]
tick_labels = list(map(str, ticks))
ax.set_xticks(ticks, labels=tick_labels)
ax.set_yticks(ticks, labels=tick_labels)
ax.set_xlabel(r'PC1')
ax.set_ylabel(r'PC2')
ax.grid(True)
plt.show()

In [None]:
# Plot the training data in the PC1/PC2 plane, coloured by k-means cluster,
# with the projected centroids on top. `kmeans_model` only exists when the
# clustering cell ran, i.e. when k_opt is set, so this cell uses the same guard.
if k_opt is not None:
    centroids = pca.transform(kmeans_model.cluster_centers_)
    train_labels = kmeans_model.labels_
    colors_clusters = ['red','blue','green','black','purple','orange']

    fig, ax = plt.subplots(1, 1, figsize=(10,6))

    for i, tl in enumerate(np.unique(train_labels)):

        # Cycle through the palette so more clusters than colors do not fail
        cluster_color = colors_clusters[i % len(colors_clusters)]

        idxs = train_labels == tl
        cl_data = X_norm_pca[idxs]

        ax.scatter(cl_data.iloc[:, 0], cl_data.iloc[:, 1],
                    color=cluster_color, marker='o', s=4)

        # The original call passed both linewidths=4 and linewidth=3; these are
        # aliases of the same property and recent Matplotlib may reject the
        # pair, so a single value is kept.
        # NOTE(review): 3 is assumed to be the intended width; confirm.
        ax.scatter(
            centroids[tl, 0],
            centroids[tl, 1],
            marker="o",
            s=169,
            linewidths=3,
            facecolor=cluster_color,
            zorder=10, edgecolor='white')
    ax.set_xlabel(r'PC1')
    ax.set_ylabel(r'PC2')
    plt.grid(True)
    # Name the figure after the number of clusters of the fitted model
    plt.savefig(f'k={kmeans_model.n_clusters}_PC1_vs_PC2.png', bbox_inches='tight')
    plt.show()

print('')