Name   : **`feature_ae_loss_scorer`**

Purpose: Compare all models with default hyperparameters (Uncontrolled Experiments with Defaults)

# ============== `Setup` ======================================

## **Environment**

### `Colab Default`

In [None]:
import platform
import sklearn
import tensorflow as tf
import tensorboard as tb
import importlib
import numpy as np
import pandas as pd

def check_env_setup():
    """Print the OS, Python and core library versions of the current runtime.

    Reads the versions from the modules imported at the top of this cell
    (platform, tf, tb, np, pd, sklearn) and writes one line per entry to stdout.
    Returns nothing.
    """
    def _version_rows():
        # Yielded lazily so each version is looked up right before its line is printed.
        yield "OS                   :", platform.platform()
        yield "Python Version       :", platform.python_version()
        yield "TensorFlow           :", tf.__version__
        yield "TensorBoard:         :", tb.__version__
        yield "NumPy                :", np.__version__
        yield "Pandas               :", pd.__version__
        yield "Scikit-learn         :", sklearn.__version__

    print("-------------------- Setup completed! --------------------\n")
    for label, value in _version_rows():
        print(label, value)
    # Optuna / optuna-integration versions are not reported here.
check_env_setup()

## =============================

# CPU / GPU
def check_cpu_gpu():
    """Report GPU availability and enable TensorFlow GPU memory growth.

    Memory growth is configured BEFORE `tf.test.gpu_device_name()` is called:
    that call touches the GPU runtime, and memory growth has to be set before
    the GPUs are initialized to take effect. The status message is buffered so
    the printed output keeps its original order (availability first, then the
    memory-growth status).
    """
    # 1) Configure memory growth first (prevents TensorFlow from allocating all GPU memory at once).
    physical_devices = tf.config.list_physical_devices("GPU")
    if physical_devices:
        try:
            for device in physical_devices:
                tf.config.experimental.set_memory_growth(device, True)
            growth_status = "Memory growth enabled for GPU."
        except RuntimeError as e:
            growth_status = f"Memory growth error: {e}"
    else:
        growth_status = "No GPU devices found. Ensure GPU runtime is enabled in Colab."

    # 2) Only now query the device name.
    gpu_name = tf.test.gpu_device_name()
    if gpu_name:
        print(f"\nGPU is available: {gpu_name}")
    else:
        print("\nGPU is not available. Ensure you selected GPU runtime in Colab.")

    print(growth_status)
check_cpu_gpu()

-------------------- Setup completed! --------------------

OS                   : Linux-6.6.97+-x86_64-with-glibc2.35
Python Version       : 3.12.11
TensorFlow           : 2.19.0
TensorBoard:         : 2.19.0
NumPy                : 2.0.2
Pandas               : 2.2.2
Scikit-learn         : 1.6.1

GPU is not available. Ensure you selected GPU runtime in Colab.
No GPU devices found. Ensure GPU runtime is enabled in Colab.


EXPERIMENT:

feawad      --> Python 3.6 environment

devnet      --> Python 3.6 environment

deepsad     --> Python 3.8 environment

dagmm       --> Python 3.8 environment

vae_feawad  --> Python 3.6 environment

---

### `Import`

In [1]:
import os
import sys
import numpy as np

import random
import subprocess
import pathlib
import importlib
import shutil
import pprint
import json

import torch
import tensorflow as tf

from copy import deepcopy
from datetime import datetime
from dotenv import load_dotenv

### `Global Seed`

In [None]:

# Deep-learning backend whose random generators set_global_seeds() will seed.
FRAMEWORK   = 'TENSORFLOW'   # Options: ['TENSORFLOW', 'PYTORCH']
# Seed value handed to set_global_seeds() for this notebook.
GLOBAL_SEED = 42

def set_global_seeds(seed = 42, framework = 'TENSORFLOW'):
    """Seed Python, NumPy and the selected deep-learning framework.

    Args:
        seed:      Integer seed applied to every random source.
        framework: 'TENSORFLOW' or 'PYTORCH' (case-insensitive).

    Raises:
        ValueError: If `framework` is neither of the two supported names.
                    Python and NumPy have already been seeded at that point.
    """
    # Framework-independent sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    backend = framework.upper()
    if backend not in ('TENSORFLOW', 'PYTORCH'):
        raise ValueError(f"Unknown framework: {framework}")

    if backend == 'TENSORFLOW':
        tf.random.set_seed(seed)
        # Ask TensorFlow / cuDNN for deterministic kernels via environment flags.
        for env_flag in ('TF_DETERMINISTIC_OPS', 'TF_CUDNN_DETERMINISTIC'):
            os.environ[env_flag] = '1'
    else:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark     = False

    print(f"Global seed set to {seed} for {framework}")

## SEED
# Seed Python, NumPy and the selected framework once for this notebook session.
set_global_seeds(GLOBAL_SEED, FRAMEWORK)

Global seed set to 42 for TENSORFLOW


### `Mount`

In [2]:
### GOOGLE DRIVE
# from google.colab import drive
# drive.mount('/content/drive')

def connect_google_drive(force_mount = False):
    """Mount Google Drive at '/content/drive' (Colab only).

    Does nothing but print a notice when '/content/drive/MyDrive' already
    exists and `force_mount` is False. Otherwise it first attempts a
    flush-and-unmount, then mounts with `force_remount = True`.

    Args:
        force_mount: When True, re-mount even if Drive already appears mounted.
    """
    if os.path.exists('/content/drive/MyDrive') and not force_mount:
        print("Google Drive already mounted.")
        return

    # Imported lazily: the module only exists inside a Colab runtime.
    from google.colab import drive

    # Try unmounting first to ensure a clean mount; a ValueError from this attempt is ignored.
    try:
        drive.flush_and_unmount()
        print('\nAll changes made in this colab session should now be visible in Drive.')
    except ValueError:
        pass

    if os.path.exists('/content/drive'):
        # !rm -rf /content/drive  # Remove the directory and its contents
        # The directory is NOT removed here (the line above is commented out),
        # so only the hint is printed; no "removed" message is emitted any more.
        print("'/content/drive' directory found, uncomment to remove & re-mount!!!")

    drive.mount('/content/drive', force_remount = True)
connect_google_drive(force_mount = False)

Drive not mounted, so nothing to flush and unmount.

All changes made in this colab session should now be visible in Drive.
Mounted at /content/drive


## **Helpers**

### `Common Utils`

In [None]:
## JSON SERIALIZER
class NpEncoder(json.JSONEncoder):
    """JSON encoder that also handles NumPy scalars/arrays, dates and Keras optimizers.

    Use as `json.dumps(obj, cls = NpEncoder)`. Objects that match none of the
    cases below are passed to `json.JSONEncoder.default`, which raises TypeError.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of `obj`."""
        # Only `datetime` is imported at file level; `date` and `timedelta` are
        # imported here so the checks below cannot fail with a NameError.
        from datetime import date, timedelta

        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, (np.floating, np.complexfloating)):
            # NOTE(review): a float cannot carry the imaginary part of a complex
            # scalar - confirm complex values are never expected here.
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bytes_):
            # Decode to text; str(bytes) would produce the repr "b'...'" instead.
            return obj.decode("utf-8", errors = "replace")
        if isinstance(obj, (datetime, date)):
            return obj.isoformat()
        if isinstance(obj, timedelta):
            return str(obj)
        if isinstance(obj, tf.keras.optimizers.Optimizer): # Handle Optimizer objects
            return obj.get_config() # Serialize the configuration instead of the object
        return super().default(obj)

### `Dev Utils`

In [3]:
######################################################################################################################################
if True: ##  FUNCTIONS  (DRIVE)
    def get_drive_helper_paths(
            dr_utils_dir_path = "/content/drive/MyDrive/MSc_AAiT/project_setup_helper/utils",
            path_configs_file    = "drive_path_configs.json"
        ):
        """Load the Drive path-configuration JSON and return its parsed content.

        Mounts Google Drive first when '/content/drive/MyDrive' is not present.

        Args:
            dr_utils_dir_path: Directory that holds the configuration file.
            path_configs_file: File name of the JSON configuration.

        Returns:
            The object produced by `json.load` for that file.
        """
        drive_mounted = os.path.exists('/content/drive/MyDrive')
        if not drive_mounted:
            connect_google_drive(force_mount = False)

        config_file = os.path.join(dr_utils_dir_path, path_configs_file)
        with open(config_file, "r") as handle:
            return json.load(handle)
    def drive_setup_helper_syspath(
            dr_setup_helper_path    = "/content/drive/MyDrive/MSc_AAiT/project_setup_helper",
            util_modules_dir        = None, ## "utils",
            util_module_name        = None  ## "git_ssh_utils", "xxxxx"
        ):
        """Put the Drive helper directory on `sys.path` and optionally import a module from it.

        Args:
            dr_setup_helper_path: Root directory of the setup helpers on Drive.
            util_modules_dir:     Optional sub-directory joined onto the root.
            util_module_name:     Optional module name to import and reload.

        Returns:
            The imported module, or None when `util_module_name` is None.

        Raises:
            FileNotFoundError: If the resolved helper directory does not exist.
        """
        if not os.path.exists('/content/drive/MyDrive'):
            connect_google_drive(force_mount = False)

        if util_modules_dir is None:
            utils_path = dr_setup_helper_path
        else:
            utils_path = os.path.join(dr_setup_helper_path, util_modules_dir)

        if not os.path.exists(utils_path):
            raise FileNotFoundError(f"Path not found: {utils_path}")

        # Append only once so repeated cell runs do not grow sys.path.
        if utils_path not in sys.path:
            sys.path.append(utils_path)
        print(f"Util modules path '{utils_path}' added to sys.path")

        if util_module_name is None:
            return None

        # Import, then reload so edits made to the helper on Drive are picked up.
        helper_module = importlib.import_module(util_module_name)
        importlib.reload(helper_module)
        print(f"Imported {util_module_name} from {utils_path}")
        return helper_module
    def manage_gitssh_colab_github(
            project_envs,
            git_ssh_helper,
            start_ssh_session   = False,
            show_pub_key        = False,
            force_new_keys_gen  = False
        ):
        """Configure the global git user and set up the Colab <-> GitHub SSH key/session.

        Args:
            project_envs:       Object exposing the project settings read below
                                (GIT_USER_NAME, GIT_USER_EMAIL, VERBOSE, SSH_* values).
            git_ssh_helper:     Object providing `git_global_user_config` and
                                `setup_ssh_colab_github`.
            start_ssh_session:  Start a session using the settings from `project_envs`.
            show_pub_key:       Run the SSH setup with `ssh_added_to_github = False`.
            force_new_keys_gen: Run the SSH setup with key regeneration forced.

        Raises:
            ValueError: If `project_envs` or `git_ssh_helper` is falsy, or if none
                        of the three flags is set.
        """
        if not project_envs:
            raise ValueError("Enviroment variable key-pair 'project_envs' is not defined!")
        if not git_ssh_helper:
            raise ValueError("Git & SSH helper script 'git_ssh_helper' is not defined!")
        if not (start_ssh_session or show_pub_key or force_new_keys_gen):
            raise ValueError("At least one of the following flags must be True: ['start_ssh_session', 'show_pub_key', 'force_new_keys_gen']")

        bar = '=' * 20

        # ---- Git identity -------------------------------------------------
        print(f"\n{bar} Git global config (In colab) {bar}")
        git_ssh_helper.git_global_user_config(
            project_envs.GIT_USER_NAME,
            project_envs.GIT_USER_EMAIL,
            user_scope  = '--global',
            verbose     = project_envs.VERBOSE
        )

        # ---- SSH ----------------------------------------------------------
        # A truthy project-level setting overrides the caller's flag.
        if project_envs.SSH_FORCE_GENERATION:
            force_new_keys_gen = project_envs.SSH_FORCE_GENERATION

        if force_new_keys_gen:
            title = "New ssh key-pair generation"
        elif show_pub_key:
            title = "Show ssh public key"
        elif start_ssh_session:
            title = "Start new ssh session (colab-github)"
        else:
            title = None
        if title is not None:
            print(f"\n{bar} {title} {bar}")

        if force_new_keys_gen or show_pub_key:
            # Manual mode: values come from the call flags, not from 'project_envs'.
            added_to_github = False
            regenerate_keys = force_new_keys_gen
        elif start_ssh_session:
            # Session mode: values come from 'project_envs'.
            added_to_github = project_envs.SSH_ADDED_TO_GITHUB
            regenerate_keys = project_envs.SSH_FORCE_GENERATION
        else:
            raise ValueError(f"Invalid args!")

        git_ssh_helper.setup_ssh_colab_github(
            ssh_priv_key_path   = project_envs.SSH_KEY_STORE_PATH,
            git_user_email      = project_envs.GIT_USER_EMAIL,
            ssh_added_to_github = added_to_github,
            force_generate_keys = regenerate_keys,
            verbose             = project_envs.VERBOSE
        )

    def clone_repo_from_github(    ### Clone the project repository (GitHub --> Colab).
            project_envs,
            git_ssh_helper,
            colab_repo_path     = None,     # Path to the cloned GitHub repo in Colab
            verbose             = None,
            force_clone         = False,
        ):
        """Clone the GitHub repository into the Colab file system and chdir into it.

        Args:
            project_envs:    Object exposing the project settings read below
                             (COLAB_HOME_PATH, GITHUB_*, SSH_*, VERBOSE).
            git_ssh_helper:  Object providing `get_github_repo_url`,
                             `clone_github_repo` and `is_git_repo`.
            colab_repo_path: Target directory; defaults to
                             COLAB_HOME_PATH/GITHUB_REPO_NAME.
            verbose:         Verbosity; None means "use project_envs.VERBOSE".
                             An explicit falsy value is now respected.
            force_clone:     Remove an existing clone and clone again.

        Returns:
            The repository URL resolved by `get_github_repo_url`. It is returned
            on the "already cloned" path too, so callers always get a usable URL.

        Raises:
            RuntimeError:     If an existing clone could not be removed.
            EnvironmentError: If the cloned directory is not a git repository.
        """
        m_colab_repo_path = colab_repo_path or os.path.join(project_envs.COLAB_HOME_PATH, project_envs.GITHUB_REPO_NAME)
        m_verbose         = project_envs.VERBOSE if verbose is None else verbose

        def _resolve_repo_url():
            # Authenticated repo URL; auth mode options: [None, 'PAT', 'SSH'].
            return git_ssh_helper.get_github_repo_url(
                gh_repo_auth = project_envs.GITHUB_REPO_AUTH,
                gh_pat_token = project_envs.GITHUB_PAT_TOKEN,
                gh_username  = project_envs.GITHUB_USERNAME,
                gh_repo_name = project_envs.GITHUB_REPO_NAME
            )

        if force_clone:
            if os.path.exists(m_colab_repo_path):
                print(f"\n=> *** Repo is cloned before & found at: {m_colab_repo_path}, removing .... ***")
                shutil.rmtree(m_colab_repo_path)
                if os.path.exists(m_colab_repo_path): raise RuntimeError(f"Failed to remove existing repo: {m_colab_repo_path}")
            else: print(f"\n=> Repo is not cloned before, procede to cloning ...")
        elif os.path.exists(m_colab_repo_path):
            print(f"\n=> **** Repo is cloned before & 'force cloning' is not set. Existed repo found at: {m_colab_repo_path} ***")
            return _resolve_repo_url()

        # ==================================================================================================
        github_repo_url = _resolve_repo_url()
        print(f"\n=> github_repo_url: {github_repo_url}")

        # ==================================================================================================
        print(f"\n{'=' * 20} Clone the repository (github --> colab) {'=' * 20}")
        # NOTE(review): the helper receives the project-level GITHUB_FORCE_CLONE,
        # not this function's `force_clone` - confirm that is intended.
        git_ssh_helper.clone_github_repo(
            gh_repo_url     = github_repo_url,
            gh_repo_name    = project_envs.GITHUB_REPO_NAME,
            local_root_dir  = project_envs.COLAB_HOME_PATH,
            ssh_temp_dir    = project_envs.SSH_SESS_PATH,
            explicit_ssh    = project_envs.SSH_EXPLICIT_CLONE,
            force_clone     = project_envs.GITHUB_FORCE_CLONE,
            verbose         = m_verbose
        )

        # `is_git_repo()` is called without a path, so switch into the clone first.
        os.chdir(m_colab_repo_path)
        if m_verbose:
            git_config_path = os.path.join(m_colab_repo_path, ".git")
            print("\n=> Cloned repo is git-repo (.git file):")
            # Plain subprocess instead of the IPython-only `!ls`, so the function
            # also works outside a notebook; the path is passed as one argument.
            listing = subprocess.run(["ls", "-al", git_config_path], capture_output = True, text = True)
            print(listing.stdout or listing.stderr)
        if not git_ssh_helper.is_git_repo(): raise EnvironmentError(f"{m_colab_repo_path} is not a Git repository ('.git' missing)")
        print(f"\n=> Current working directory (cloned repo): {os.getcwd()}")

        return github_repo_url
    def git_remote_and_branch_config(
            project_envs,
            git_ssh_helper,
            github_repo_url,
            branch_name         = "main",
            br_remote_name      = "origin",
            base_branch         = "main",       # Base branch if creating a new branch
            auto_commit         = True,
            auto_push           = True,
            verbose             = None,
            commit_msg          = "Update repo files"
        ):
        """Configure the git remote, switch to (or create) the branch, optionally commit & push.

        Args:
            project_envs:    Object exposing `VERBOSE` (used when `verbose` is None).
            git_ssh_helper:  Object providing `git_remote_repo_config`,
                             `checkout_or_create_branch` and `git_commit_push_changes`.
            github_repo_url: Remote repository URL.
            branch_name:     Branch to work on.
            br_remote_name:  Name of the git remote.
            base_branch:     Base branch if a new branch has to be created.
            auto_commit:     Forwarded to `git_remote_repo_config`.
            auto_push:       When truthy, commit and push via `git_commit_push_changes`.
            verbose:         Verbosity; None means "use project_envs.VERBOSE".
                             An explicit falsy value is now respected.
            commit_msg:      Commit message used when pushing.

        Returns:
            Tuple `(branch_name, br_remote_name, base_branch)`.
        """
        if verbose is None:
            verbose = project_envs.VERBOSE

        print(f"\n{'=' * 20} Git remote repository config {'=' * 20}")
        git_ssh_helper.git_remote_repo_config(   ### Git remote repository config
            gh_repo_url     = github_repo_url,
            branch_name     = branch_name,    ### Branch to push scaffold
            git_push        = False,          ### Skip now, push after syncing files
            auto_commit     = auto_commit,
            verbose         = verbose
        )

        git_ssh_helper.checkout_or_create_branch(
            branch_name     = branch_name,       # Branch to push scaffold
            br_remote_name  = br_remote_name,
            base_branch     = base_branch,       # Base branch if creating a new branch
            verbose         = verbose,
        )

        if auto_push:
            git_ssh_helper.git_commit_push_changes(
                commit_msg          = commit_msg,
                branch_name         = branch_name,
                remote_name         = br_remote_name,
                git_push            = auto_push,
                verbose             = verbose,
                check               = False,
                resolve_conflicts   = True,  ### auto handle conflicts
            )

        print("\n=> Remote && branch configuration completed.")
        return branch_name, br_remote_name, base_branch

    #================================================================================================================================
    def setup_experiment_path(
            model_name,
            repo_name,
            exper_id    = 1,
            base_path   = "/content/drive/MyDrive/Colab_Notebooks/Thesis/Code/FEAWAD_Reproducing2/experiments/baseline"
        ):
        """Create (if missing) and return the model and result directories of an experiment.

        Args:
            model_name: Model sub-directory name.
            repo_name:  Repository sub-directory name.
            exper_id:   Currently not part of the generated paths.
            base_path:  Root directory of the experiments on Drive.

        Returns:
            Tuple `(model_path, result_path)` as strings.
        """
        if not os.path.exists("/content/drive/MyDrive"):
            connect_google_drive()

        experiment_root = f"{base_path}/{repo_name}/{model_name}"
        model_path      = f"{experiment_root}/model"
        result_path     = f"{experiment_root}/result"

        # Root first, then the two leaf directories.
        for directory in (base_path, model_path, result_path):
            if not os.path.exists(directory):
                os.makedirs(directory)

        return model_path, result_path
    def writeResults(name, n_samples_trn,  n_outliers, n_samples_test,test_outliers ,test_inliers, avg_AUC_ROC, avg_AUC_PR, std_AUC_ROC,std_AUC_PR, path):
        """Append one comma-separated result row to the file at `path`.

        The ten value arguments are written in the order given, separated by
        commas and followed by a newline. Each value is converted with `str()`,
        so numbers can be passed directly as well as pre-formatted strings.
        The file is opened in append mode and closed before returning.
        """
        fields = (
            name, n_samples_trn, n_outliers, n_samples_test, test_outliers,
            test_inliers, avg_AUC_ROC, avg_AUC_PR, std_AUC_ROC, std_AUC_PR,
        )
        row = ",".join(str(field) for field in fields) + "\n"
        # Context manager guarantees the handle is flushed and closed.
        with open(path, 'a') as csv_file:
            csv_file.write(row)
    def save_experiment(                ### Save experiment configs + checkpoints in Drive
            exp_data,
            dataset,
            model,
            variant,
            drive_experiments,
            checkpoint_path     = None,
            uid                 = None
        ):
        """Write an experiment record (JSON + optional checkpoint copy) to its own directory.

        The target directory is `drive_experiments/model/dataset/variant/uid`
        and is created if missing.

        Args:
            exp_data:          JSON-serializable experiment description.
            dataset:           Dataset name (path component).
            model:             Model name (path component).
            variant:           Variant name (path component).
            drive_experiments: Root directory for saved experiments.
            checkpoint_path:   Optional file to copy next to the JSON; skipped
                               when falsy or when the file does not exist.
            uid:               Run identifier; defaults to a timestamp
                               formatted as "%Y%m%d_%H%M%S".

        Returns:
            The target directory as a string.
        """
        uid      = uid or datetime.now().strftime("%Y%m%d_%H%M%S")
        # `pathlib.Path`: the file imports the `pathlib` module, not the bare name `Path`.
        save_dir = pathlib.Path(drive_experiments) / model / dataset / variant / uid
        save_dir.mkdir(parents = True, exist_ok = True)

        ### Save JSON
        save_json = save_dir / "experiment.json"
        with open(save_json, "w") as f:
            json.dump(exp_data, f, indent = 2)
        print(f"Experiment JSON saved: {save_json}")

        ### Save checkpoint
        if checkpoint_path and os.path.exists(checkpoint_path):
            ckpt_dst = save_dir / pathlib.Path(checkpoint_path).name
            shutil.copy2(checkpoint_path, ckpt_dst)
            print(f"Checkpoint copied: {ckpt_dst}")
        return str(save_dir)
######################################################################################################################################

## ===================================================================================================================================
if True: ##  DRIVE PATHS
    # Parsed content of the Drive path-configuration JSON; read below for
    # 'helpers_path' and 'thesis_proj_root_path'.
    _drive_paths_config  = get_drive_helper_paths(
        dr_utils_dir_path = "/content/drive/MyDrive/MSc_AAiT/project_setup_helper/utils",
        path_configs_file = "drive_path_configs.json"
    )

## ===================================================================================================================================
if True: ##  IMPORT HELPERS

    # Put the helper directory from the path config on sys.path so the
    # `utils.*` imports below can be resolved.
    _ = drive_setup_helper_syspath(dr_setup_helper_path = _drive_paths_config.get('helpers_path'))  ## SYS.PATHS

    # Each helper is reloaded right after import so that edits made to the
    # module file are picked up when this cell is re-run.
    import utils.project_env as m_proj_env
    importlib.reload(m_proj_env)

    import utils.project_setup_utils as m_proj_setup_utils
    importlib.reload(m_proj_setup_utils)

    import utils.git_ssh_utils as m_git_ssh_tools
    importlib.reload(m_git_ssh_tools)

    # Fail early when the project environment did not provide the Colab home path.
    assert m_proj_env.COLAB_HOME_PATH is not None, "proj_configs.COLAB_HOME_PATH is not set. Check .env or load_dotenv path."
    print(f"\nproj_configs.COLAB_HOME_PATH: {m_proj_env.COLAB_HOME_PATH}")

## ===================================================================================================================================
if True: ## VARIABLES  (GLOBAL)

    # Identity of this notebook inside the repository layout.
    m_current_exper_nbook_dir = '2_comparisons'
    m_current_exper_name      = 'feature_ae_loss_scorer'
    m_current_nbook_subpath   = os.path.join('notebooks', m_current_exper_nbook_dir, f"{m_current_exper_name}.ipynb"  )  ## 'notebooks/2_comparisons/feature_ae_loss_scorer.ipynb'
    ## /content/drive/MyDrive/MSc_AAiT/experiments/ids-msc-thesis/notebooks/2_comparisons/feature_ae_loss_scorer.ipynb

    # Repository roots: the copy on Drive and the clone in the Colab file system.
    m_drive_paths_json  = _drive_paths_config
    m_drive_repo_path   = os.path.join(m_drive_paths_json['thesis_proj_root_path'], 'experiments', m_proj_env.GITHUB_REPO_NAME)
    m_colab_repo_path   = os.path.join(m_proj_env.COLAB_HOME_PATH, m_proj_env.GITHUB_REPO_NAME)         ## Path to the cloned GitHub repo in Colab

    # Full path of this notebook in each of the two repository copies.
    m_drive_exper_nb_path = os.path.join(m_drive_repo_path, m_current_nbook_subpath)
    m_colab_exper_nb_path = os.path.join(m_colab_repo_path, m_current_nbook_subpath)

    ## GIT
    # The working branch is named after the experiment.
    m_branch_name    = m_current_exper_name
    m_base_branch    = "main"
    m_br_remote_name = "origin"


Util modules path '/content/drive/MyDrive/MSc_AAiT/project_setup_helper' added to sys.path

DRIVE_PROJ_PATH in 'project_setup_env.py': '/content/drive/MyDrive/MSc_AAiT/experiments/ids_msc_thesis'

DRIVE_PROJ_PATH in 'project_setup_env.py': '/content/drive/MyDrive/MSc_AAiT/experiments/ids_msc_thesis'

proj_configs.COLAB_HOME_PATH: /content


### `FeaWAD Dataset Utils`

In [None]:
# %%writefile feawad_loader.py

# feawad_loader.py
import csv
import urllib.request
import requests
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

#=================================================================================================================================================
def check_remote_file(url, timeout = 10):
    """Return True when a HEAD request to `url` answers with HTTP status 200.

    Args:
        url:     Address to probe.
        timeout: Seconds to wait for the server before giving up. Without a
                 timeout the request could block indefinitely.

    Returns:
        True for a 200 response; False for any other status or when the request
        fails with a `requests.RequestException` (the error is printed).
    """
    try:
        response = requests.head(url, timeout = timeout)
        return response.status_code == 200
    except requests.RequestException as e:
        print(f"Error checking URL: {e}")
        return False
def get_weakly_supervised_dataset_by_feawad(data_params = None):
    """Load a FEAWAD-style CSV dataset and build a weakly-supervised train/test split.

    Steps: locate (or download) the CSV, split it with a stratified
    `train_test_split`, keep at most `known_outliers` labelled outliers in the
    training set, then append synthetic "noise" rows (built from outlier rows
    of the full loaded dataset) to the training set with label 0.

    Args:
        data_params: Optional dict of settings; missing keys use these defaults:
            'repo_data_path'  : base URL of the FEAWAD dataset folder
            'data_name'       : "nslkdd_normalization"
            'data_ext'        : 'csv'
            'local_data_path' : './data/'
            'data_dim'        : 122   (number of feature columns; the label is the next column)
            'test_size'       : 0.2
            'random_seed'     : 42
            'cont_rate'       : 0.02  (contamination rate used to size the noise)
            'known_outliers'  : 30
            'verbose'         : 1
            None is treated as `{'data_dim': 122}`.

    Returns:
        Tuple `(train_x, train_label, test_x, test_label)` of NumPy arrays.

    Raises:
        ValueError: If the dataset is found neither locally nor remotely, or if
                    `data_dim` does not match the loaded feature dimension.
    """
    # Avoid a shared mutable default argument.
    if data_params is None:
        data_params = {'data_dim' : 122}

    ### Default
    feawad_data_path  = "https://raw.githubusercontent.com/yj-zhou/Feature_Encoding_with_AutoEncoders_for_Weakly-supervised_Anomaly_Detection/main/dataset/"

    repo_data_path  = data_params.get('repo_data_path', feawad_data_path)
    data_name       = data_params.get('data_name', "nslkdd_normalization").strip()
    data_ext        = data_params.get('data_ext', 'csv')
    local_data_path = data_params.get('local_data_path', './data/')

    # Build both locations so they work with or without a trailing slash.
    data_file_name  = data_name + '.' + data_ext
    data_local_file = os.path.join(local_data_path, data_file_name)
    data_repo_file  = repo_data_path.rstrip('/') + '/' + data_file_name

    if not os.path.isfile(data_local_file):
        print(f"\nDataset file is not found at local path: '{data_local_file}', try to downloading......\n")
        if check_remote_file(data_repo_file):
            # The download target directory may not exist yet (e.g. a fresh runtime).
            os.makedirs(os.path.dirname(data_local_file) or '.', exist_ok = True)
            urllib.request.urlretrieve(data_repo_file, data_local_file)
            # Check again
            if not os.path.isfile(data_local_file):
                raise ValueError(f"\nDataset file is not found at: \nlocal path: {data_local_file} & at \nremote url: {data_repo_file}")
        else: raise ValueError(f"\nDataset file is not found at remote repo url: '{data_repo_file}'")
    else:
        print(f"\n'{data_name}' dataset found at local path: '{data_local_file}'.")

    data_dim        = data_params.get('data_dim', 122)
    test_size       = data_params.get('test_size', 0.2)
    random_seed     = data_params.get('random_seed', 42)
    cont_rate       = data_params.get('cont_rate', 0.02)
    known_outliers  = data_params.get('known_outliers', 30)
    verbose         = data_params.get('verbose', 1)

    def _dataLoading(path, byte_num):
        """Read the CSV: the first `byte_num` columns are features, the next one is the label."""
        with open(path, 'r', newline = '') as data_from:
            rows = list(csv.reader(data_from))

        # Labels first, so a row that is too short fails on the missing label column.
        labels = np.array([float(row[byte_num]) for row in rows])
        x      = np.array([[float(value) for value in row[0:byte_num]] for row in rows])
        return x, labels

    def _inject_noise(seed, n_out, random_seed):
        '''
        add anomalies to training data to replicate anomaly contaminated data sets.
        we randomly swap 5% features of anomalies to avoid duplicate contaminated anomalies.
        this is for dense data

        seed        : 2-D array of outlier rows used as the source material
        n_out       : number of noise rows to generate
        random_seed : seed of the local RandomState
        '''
        # e.g. 'n_swap_feat' = 6 when 'swap_ratio' = 0.05 & 'dim' = 122
        rng           = np.random.RandomState(random_seed)
        n_sample, dim = seed.shape
        swap_ratio    = 0.05
        n_swap_feat   = int(swap_ratio * dim)
        noise         = np.empty((n_out, dim))

        for i in range(n_out):
            # Pick two distinct outlier rows and copy a few features of the second into the first.
            outlier_idx = rng.choice(n_sample, 2, replace = False)
            o1 = seed[outlier_idx[0]]
            o2 = seed[outlier_idx[1]]
            swap_feats = rng.choice(dim, n_swap_feat, replace = False)
            noise[i] = o1.copy()
            noise[i, swap_feats] = o2[swap_feats]
        return noise

    x, labels = _dataLoading(data_local_file, byte_num = data_dim)

    if verbose: # [Loaded Dataset]
        print(f"\n[Loaded Dataset]")
        print(f" - Name    : {data_name}")
        print(f" - x       : {x.shape}")
        print(f" - labels  : {labels.shape}")
        print(f" - Normal  : {len(x[labels == 0])}")
        print(f" - Outlier : {len(x[labels == 1])}")

    # Outlier rows of the FULL loaded dataset (before the split); used as noise source below.
    outlier_indices = np.where(labels == 1)[0]
    outliers        = x[outlier_indices]

    # Per Runs
    train_x, test_x, train_label, test_label = train_test_split(x, labels, test_size = test_size, random_state = random_seed, stratify = labels)

    if data_dim != train_x.shape[1]:
        raise ValueError(f"\n[After Split] Invalid input dimension {data_dim} args, current dataset dimension is {train_x.shape[1]}!")

    rng             = np.random.RandomState(random_seed)
    outlier_indices = np.where(train_label == 1)[0]
    n_outliers      = len(outlier_indices)
    inlier_indices  = np.where(train_label == 0)[0]
    n_inliers       = len(inlier_indices)
    n_noise         = int(n_inliers * cont_rate / (1. - cont_rate))

    if verbose: # [Splited Info]
        print(f"\n[After Datasets Splited]:\n")
        print(f" Train:")
        print(f" - train_x     : {train_x.shape}")
        print(f" - train_label : {train_label.shape}")
        print(f" - Normal      : {len(train_x[train_label == 0])}")
        print(f" - Outlier     : {len(train_x[train_label == 1])}")
        print(f" Test:")
        print(f" - test_x      : {test_x.shape}")
        print(f" - test_label  : {test_label.shape}")
        print(f" - Normal      : {len(test_x[test_label == 0])}")
        print(f" - Outlier     : {len(test_x[test_label == 1])}")
    if verbose: # [Weakly Supervised Config Info]
        print(f"\n{'-' * 110}\n")

        print(f"[Weakly Supervised Setting]:\n")
        print(f" Training Dataset With:")
        print(f" - Outliers : Limited (known size & Identified)")
        print(f" - Normal   : Noisy   (known size but Unidentified)")
        print(f" Config:")
        print(f" - 1. Known Outliers Allowed   : {known_outliers}")
        print(f" - 2. Noise Contamination Rate : {cont_rate} ({int(cont_rate * 100)}%)")

        print(f"\n{'-' * 50}\n")

        print(f" Before Outliers Removed (Remove outliers if number of outliers in dataset is exceeds allowed-known-outliers):")
        print(f" - Normal  : {len(train_x[train_label == 0])}")
        print(f" - Outlier : {len(train_x[train_label == 1])}")
        print(f" - Total   : {train_x.shape[0]}")
        print(f"   {'-' * 28}")
        print(f" - Outliers to Remove : {n_outliers - known_outliers} = {n_outliers} - {known_outliers} ----> (n_outliers - n_known_outliers)\n")

    # Keep only `known_outliers` labelled outliers in the training set.
    if n_outliers > known_outliers:
        mn          = n_outliers - known_outliers
        remove_idx  = rng.choice(outlier_indices, mn, replace = False)
        train_x     = np.delete(train_x,     remove_idx, axis = 0)
        train_label = np.delete(train_label, remove_idx, axis = 0)
    else: print(f"\n - [Info-Outliers] Number of outliers are NOT GREATER THAN from known outliers.")

    if verbose: # [After Outliers Removed]
        print(f" After Outliers Removed:")
        print(f" - Normal  : {len(train_x[train_label == 0])}")
        print(f" - Outlier : {len(train_x[train_label == 1])}")
        print(f" - Total   : {train_x.shape[0]}")
        print(f"   {'-' * 28}")
        print(f" - Outliers Removed : {n_outliers - known_outliers}")
    if verbose: # [Noise Adding]
        print(f"\n{'-' * 100}\n")
        print(f" Before Noise Added:")
        print(f" - Normal (Clean)   : {len(train_x[train_label == 0])}")
        print(f" - Outlier          : {len(train_x[train_label == 1])}")
        print(f" - Total            : {train_x.shape[0]}")
        print(f"   {'-' * 28}")
        print(f" - Noises to Inject : {n_noise} -----------> [Computed: x_normal * cont_rate / (1-cont_rate)]\n")

    # Contaminate the training set: noise rows are appended with label 0 ("normal").
    noises      = _inject_noise(outliers, n_noise, random_seed)
    train_x     = np.append(train_x, noises, axis = 0)
    train_label = np.append(train_label, np.zeros(noises.shape[0]))    # Label Noising

    if verbose: # [Noise Added]
        print(f" After Noise Added (Generated based on outliers data):")
        print(f" - Normal (Noisy)  : {len(train_x[train_label == 0])}")
        print(f" - Outlier         : {len(train_x[train_label == 1])}")
        print(f" - Total           : {train_x.shape[0]}")
        print(f"   {'-' * 28}")
        print(f" - Noises Injected : {n_noise}")

    if data_dim != train_x.shape[1]:
        raise ValueError(f"\n[Before Training Loop] Invalid input dimension {data_dim} args, current dataset dimension is {train_x.shape[1]}!")

    if verbose: # [Processed Info]
        outlier_indices      = np.where(train_label == 1)[0]
        inlier_indices       = np.where(train_label == 0)[0]
        test_outlier_indices = np.where(test_label == 1)[0]
        test_inlier_indices  = np.where(test_label == 0)[0]

        print(f"\n{'_' * 100}")
        print(f"\n[Dataset] All Processed {'*' * 80}\n")
        print(f" Train:")
        print(f" - Mixed   train_y : {train_label.shape}")
        print(f" - Mixed   train_x : {train_x.shape}")
        print(f"   {'-' * 28}")
        print(f" - Normal  train_x : {inlier_indices.shape[0]}")
        print(f" - Outlier train_x : {outlier_indices.shape[0]}\n")
        print(f" Test:")
        print(f" - Mixed   test_y  : {test_label.shape}")
        print(f" - Mixed   test_x  : {test_x.shape}")
        print(f"   {'-' * 28}")
        print(f" - Normal  test_x  : {test_inlier_indices.shape[0]}")
        print(f" - Outlier test_x  : {test_outlier_indices.shape[0]}")

    return train_x, train_label, test_x, test_label


## **GitHub Setup**

### `SSH Session`

In [4]:
### Configure SSH & GitHub auth for this Colab runtime.
###   start_ssh_session  = True  -> start a session with the stored key
###   show_pub_key       = False -> do not show the public key
###   force_new_keys_gen = False -> do not regenerate the keys
### NOTE(review): the recorded output of this cell reports exit 127 from the
### trailing `ssh -T git@github.com || 0` ("0: command not found") although
### authentication itself succeeded; the helper presumably wants `|| true`
### there -- confirm in the helper's source.
manage_gitssh_colab_github(
    project_envs=m_proj_env,
    git_ssh_helper=m_git_ssh_tools,
    start_ssh_session=True,
    show_pub_key=False,
    force_new_keys_gen=False,
)



[CMD] : git config --global user.name Abate, Zelalem

[CODE]  : 0

[CMD] : git config --global user.email phatzolo@gmail.com

[CODE]  : 0


-> SSH Key-Pair already generated before & found at path: /content/drive/MyDrive/MSc_AAiT/project_setup_helper/id_rsa.

-> SSH public key generated before & assumed to be added to GitHub. SSH Key-Pair Path: /content/drive/MyDrive/MSc_AAiT/project_setup_helper/id_rsa

-> Setting up ssh session in: /root/.ssh

-> Adding GitHub to known_hosts...

-> Starting ssh-agent & adding private key...

[ERROR]: Command failed (exit 127): ['bash', '-c', '\n    eval "$(ssh-agent -s)"\n    ssh-add /root/.ssh/id_rsa\n    ssh -T git@github.com || 0\n    ']

[STDERR]: Identity added: /root/.ssh/id_rsa (phatzolo@gmail.com)
Hi zelalemteferi! You've successfully authenticated, but GitHub does not provide shell access.
bash: line 4: 0: command not found

-> SSH session to gitHub established successfully!!!


### `Clone Repo`

In [5]:
## GitHub ---> Drive
## Clone the repo; the third positional argument is the path to the cloned
## GitHub repo in Colab. The returned repo URL is stored because the
## "Git-Remote & Branch Configs" cell passes it on as `github_repo_url`.
m_github_repo_url = clone_repo_from_github(
    m_proj_env,
    m_git_ssh_tools,
    m_colab_repo_path,
    verbose=m_proj_env.VERBOSE,
    force_clone=True,
)


=> Repo is not cloned before, procede to cloning ...

=> github_repo_url: git@github.com:zelalemteferi/ids-msc-thesis.git


=> Cloning 'ids-msc-thesis' to '/content/ids-msc-thesis'

=> Cloning 'ids-msc-thesis' to '/content/ids-msc-thesis'

[CMD] : git clone git@github.com:zelalemteferi/ids-msc-thesis.git

[STDERR]: Cloning into 'ids-msc-thesis'...

[CODE]  : 0

-> Git clone is SUCCESSFUL!

-> Current dir  : /content/ids-msc-thesis

-> Repo contents:

=> Cloned repo is git-repo (.git file):
total 240
drwxr-xr-x  8 root root   4096 Sep 29 22:22 .
drwxr-xr-x 11 root root   4096 Sep 29 22:22 ..
drwxr-xr-x  2 root root   4096 Sep 29 22:22 branches
-rw-r--r--  1 root root    268 Sep 29 22:22 config
-rw-r--r--  1 root root     73 Sep 29 22:22 description
-rw-r--r--  1 root root     21 Sep 29 22:22 HEAD
drwxr-xr-x  2 root root   4096 Sep 29 22:22 hooks
-rw-r--r--  1 root root 196030 Sep 29 22:22 index
drwxr-xr-x  2 root root   4096 Sep 29 22:22 info
drwxr-xr-x  3 root root   4096 Sep 29 22:22

### `Clean Commit`

In [27]:
## BACKUP GITHUB --> COLAB --> DRIVE
## Replace the commit history with one fresh commit. This is destructive: the
## recorded run force-pushes and deletes the remote feature branch.
## NOTE(review): in the recorded output `git commit -m ...` failed with
## `Syntax error: "(" unexpected`, i.e. the message reached /bin/sh unquoted,
## yet the following steps still ran (including deleting the remote feature
## branch). Presumably the helper has to quote `commit_msg` or avoid the
## shell -- confirm before re-running this cell.
if True:
    ## Optional manual backup before cleaning the history:
    # !cp -r /content/ids-msc-thesis /content/drive/MyDrive/MSc_AAiT/scaffold_FIRST_PUSHED_BACKUP2
    m_git_ssh_tools.delete_commit_history(
        branch_name=m_branch_name,        ## feature branch, "feature_ae_loss_scorer"
        main_branch=m_base_branch,        ## "main"
        remote_name=m_br_remote_name,     ## "origin" (DEFAULT)
        commit_msg="Initial commit (commit history cleaned)",
        backup_main_br=False,
        backup_feat_br=False,
        verbose=True,
    )


[CMD] : git checkout --orphan main_last_commit

[STDERR]: Switched to a new branch 'main_last_commit'

[CODE]  : 0

[CMD] : git add -A

[CODE]  : 0

[ERROR]: Command failed (exit 2): git commit -m Initial commit (commit history cleaned)

[STDERR]: /bin/sh: 1: Syntax error: "(" unexpected

[ERROR]: Command failed (exit 1): git branch -D main

[STDERR]: error: branch 'main' not found.

[CMD] : git branch -m main

[CODE]  : 0

[ERROR]: Command failed (exit 1): git push -f origin main

[STDERR]: error: src refspec main does not match any
error: failed to push some refs to 'github.com:zelalemteferi/ids-msc-thesis.git'
'main' commit history deleted, new created & pushed

[CMD] : git push origin --delete feature_ae_loss_scorer

[STDERR]: To github.com:zelalemteferi/ids-msc-thesis.git
 - [deleted]         feature_ae_loss_scorer

[CODE]  : 0
Old remote feature branch 'feature_ae_loss_scorer' deleted

[ERROR]: Command failed (exit 128): git checkout -b feature_ae_loss_scorer

[STDERR]: fatal: A

In [15]:
# !git status
# !git branch -a                ## Confirm that the branch exists both locally & remotely
# !git remote -v                ## Show the remote origin configuration
# !git remote show origin       ## Show whether the local branch tracks the remote branch correctly
# !git log --oneline            ## Check the commit history

### Check commit history (should be one commit)
### NOTE(review): the recorded output below lists 11 commits, so this cell was
### presumably executed before the history clean-up -- confirm the run order.
!git log --oneline


[33mc439d74[m[33m ([m[1;36mHEAD -> [m[1;32mfeature_ae_loss_scorer[m[33m, [m[1;31morigin/feature_ae_loss_scorer[m[33m)[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33mc89397a[m Merge branch 'feature_ae_loss_scorer'
[33md4d643f[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33mc9a4cff[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33m9841c51[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33m73bfdc8[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33m8d6d905[m Update experiment notebook feature_ae_loss_scorer
[33m53d55a5[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33mfe59c00[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33m3373dc8[m Add scaffold from Drive
[33mfab0aa4[m Initial commit


In [28]:
## List local and remote-tracking branches
!git branch -a

  feature_ae_loss_scorer[m
  [31mremotes/origin/HEAD[m -> origin/main
  [31mremotes/origin/feature_ae_loss_scorer[m
  [31mremotes/origin/main[m


In [29]:
## Show the commit history of the current branch, one line per commit
!git log --oneline


fatal: your current branch 'main' does not have any commits yet


### `Git-Remote & Branch Configs`

In [6]:
## Configure the Git remote and check out the working branch. The three
## returned names overwrite the variables that were passed in.
## In the recorded run: branch "feature_ae_loss_scorer", remote "origin",
## base branch "main".
m_branch_name, m_br_remote_name, m_base_branch = git_remote_and_branch_config(
    project_envs=m_proj_env,
    git_ssh_helper=m_git_ssh_tools,
    github_repo_url=m_github_repo_url,
    branch_name=m_branch_name,
    br_remote_name=m_br_remote_name,
    base_branch=m_base_branch,
    auto_commit=True,
    verbose=True,
    commit_msg="Git-Remote & Branch Configs",
)




Configuring Git remote for: git@github.com:zelalemteferi/ids-msc-thesis.git

[CMD] : git checkout feature_ae_loss_scorer

[STDOUT]: Branch 'feature_ae_loss_scorer' set up to track remote branch 'feature_ae_loss_scorer' from 'origin'.

[STDERR]: Switched to a new branch 'feature_ae_loss_scorer'

[CODE]  : 0
-> Updating/Resetting existing remote URL

[CMD] : git remote set-url origin git@github.com:zelalemteferi/ids-msc-thesis.git

[CODE]  : 0
-> Auto git-committed...
status

-> *** No changes to git commit & push ***

=> Verify remote configs:

[CMD] : git remote -v

[STDOUT]: origin	git@github.com:zelalemteferi/ids-msc-thesis.git (fetch)
origin	git@github.com:zelalemteferi/ids-msc-thesis.git (push)

[CODE]  : 0

=> Current branch status:

[CMD] : git status -sb

[STDOUT]: ## feature_ae_loss_scorer...origin/feature_ae_loss_scorer

[CODE]  : 0

REMOTE CONFIGURATION COMPLETED SUCCESSFULLY!

Fetching all remote branches...

Local branches : ['feature_ae_loss_scorer', 'main'] 
Remote bran

### `.... Check Git`

In [None]:
# !git status
# !git branch -a                ## Confirm that the branch exists both locally & remotely
# !git remote -v                ## Show the remote origin configuration
# !git remote show origin       ## Show whether the local branch tracks the remote branch correctly
# !git log --oneline            ## Check commit history

In [7]:
## Show the remote's URLs, HEAD branch and the local/remote branch tracking setup
!git remote show origin

* remote origin
  Fetch URL: git@github.com:zelalemteferi/ids-msc-thesis.git
  Push  URL: git@github.com:zelalemteferi/ids-msc-thesis.git
  HEAD branch: main
  Remote branches:
    feature_ae_loss_scorer tracked
    main                   tracked
  Local branches configured for 'git pull':
    feature_ae_loss_scorer merges with remote feature_ae_loss_scorer
    main                   merges with remote main
  Local refs configured for 'git push':
    feature_ae_loss_scorer pushes to feature_ae_loss_scorer (up to date)
    main                   pushes to main                   (up to date)


In [8]:
## List local and remote-tracking branches
!git branch -a

* [32mfeature_ae_loss_scorer[m
  main[m
  [31mremotes/origin/HEAD[m -> origin/main
  [31mremotes/origin/feature_ae_loss_scorer[m
  [31mremotes/origin/main[m


In [9]:
## Show the commit history of the current branch, one line per commit
!git log --oneline

[33mc439d74[m[33m ([m[1;36mHEAD -> [m[1;32mfeature_ae_loss_scorer[m[33m, [m[1;31morigin/feature_ae_loss_scorer[m[33m)[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33mc89397a[m Merge branch 'feature_ae_loss_scorer'
[33md4d643f[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33mc9a4cff[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33m9841c51[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33m73bfdc8[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33m8d6d905[m Update experiment notebook feature_ae_loss_scorer
[33m53d55a5[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33mfe59c00[m Push updated experiment nootbook ('feature_ae_loss_scorer')
[33m3373dc8[m Add scaffold from Drive
[33mfab0aa4[m Initial commit


### `Commit & Push (This Notebook)`

In [12]:
## ********************** REMEMBER  (SAVE CHANGES (THIS NOTEBOOK)) ******************************

## Flag checked by the "COMMIT + PUSH" cell; reset to False on every run of this cell.
nb_updated = False

## 1. COPY: DRIVE --> COLAB
## Copies the notebook file from its Drive path to its path inside the Colab
## clone, then marks it as updated. The flag stays False if the copy raises.
if not nb_updated:
    shutil.copy2(m_drive_exper_nb_path, m_colab_exper_nb_path)
    print(f"Copied {m_drive_exper_nb_path} ---> {m_colab_exper_nb_path}")
    nb_updated = True

Copied /content/drive/MyDrive/MSc_AAiT/experiments/ids-msc-thesis/notebooks/2_comparisons/feature_ae_loss_scorer.ipynb ---> /content/ids-msc-thesis/notebooks/2_comparisons/feature_ae_loss_scorer.ipynb


In [13]:
## 2. COMMIT + PUSH
## Runs only when `nb_updated` is truthy (set by the copy step). Commits the
## changes on the working branch and, because `git_push = True`, pushes them
## to the remote. The commit message embeds the branch name.
## Fix: the message previously misspelled "notebook" as "nootbook", which
## ended up in the permanent commit history.
if nb_updated:
    m_git_ssh_tools.git_commit_push_changes(
        commit_msg  = f"Push updated experiment notebook ('{m_branch_name}')",
        branch_name = m_branch_name,        ## 'main',
        remote_name = m_br_remote_name,     ## "origin",
        git_push    = True,
        verbose     = False
    )

status
M notebooks/2_comparisons/feature_ae_loss_scorer.ipynb
-> Changes pushed to origin/feature_ae_loss_scorer


### `Final Merge & Push`

In [14]:
## Merge the feature branch into the base branch.
## With `use_rebase=True` the recorded run reports "Using merge strategy:
## 'rebase'" and rebases the feature branch onto main.
## In the recorded run: feature branch "feature_ae_loss_scorer", base branch
## "main", remote "origin".
m_git_ssh_tools.merge_feature_branch(
    feature_branch=m_branch_name,
    main_branch=m_base_branch,
    remote_name=m_br_remote_name,
    use_rebase=True,
    verbose=True,
)

Using merge strategy: 'rebase', merging: feature_ae_loss_scorer ----> main
-> *** No local changes to save (stash) ***

[CMD] : git fetch origin

[CODE]  : 0

[CMD] : git checkout main

[STDOUT]: Your branch is up to date with 'origin/main'.

[STDERR]: Switched to branch 'main'

[CODE]  : 0

[CMD] : git pull origin main

[STDOUT]: Already up to date.

[STDERR]: From github.com:zelalemteferi/ids-msc-thesis
 * branch            main       -> FETCH_HEAD

[CODE]  : 0

[CMD] : git checkout feature_ae_loss_scorer

[STDOUT]: Your branch is up to date with 'origin/feature_ae_loss_scorer'.

[STDERR]: Switched to branch 'feature_ae_loss_scorer'

[CODE]  : 0

[CMD] : git pull origin feature_ae_loss_scorer

[STDOUT]: Already up to date.

[STDERR]: From github.com:zelalemteferi/ids-msc-thesis
 * branch            feature_ae_loss_scorer -> FETCH_HEAD

[CODE]  : 0
Rebasing feature branch onto main ...

[CMD] : git rebase main

[STDERR]: Rebasing (1/1)

[KSuccessfully rebased and updated refs/heads/f

# ============== `Experiments` ======================================