In [1]:
# Cell 1: Install dependencies
!pip install --quiet faiss-cpu sentence-transformers transformers torch tensorflow scikit-learn


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Cell 2: Imports & Configuration
import numpy as np
import pandas as pd
import faiss
import torch

import tensorflow as tf
import tensorflow.keras.backend as K

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Reproducibility
np.random.seed(0)
tf.random.set_seed(0)


In [3]:
# Cell 3: VAE Custom Layers & Builder

@tf.keras.utils.register_keras_serializable()
def sampling(args):
    mean, log_var = args
    log_var = tf.clip_by_value(log_var, -5.0, 5.0)
    eps = K.random_normal(tf.shape(mean))
    return mean + tf.exp(0.5 * log_var) * eps

@tf.keras.utils.register_keras_serializable(package="Custom")
class VAELossLayer(tf.keras.layers.Layer):
    def call(self, inputs):
        orig, recon, mean, log_var = inputs
        recon_loss = tf.reduce_sum(tf.square(orig - recon), axis=1)
        kl_loss    = -0.5 * tf.reduce_sum(1 + log_var - tf.square(mean) - tf.exp(log_var), axis=1)
        self.add_loss(tf.reduce_mean(recon_loss + kl_loss))
        return recon

def build_vae(input_dim):
    inp = tf.keras.Input((input_dim,))
    x = tf.keras.layers.Dense(64, activation="relu")(inp)
    x = tf.keras.layers.Dense(32, activation="relu")(x)
    z_mean   = tf.keras.layers.Dense(4)(x)
    z_logvar = tf.keras.layers.Dense(4)(x)
    z = tf.keras.layers.Lambda(sampling)([z_mean, z_logvar])

    latent = tf.keras.Input((4,))
    y = tf.keras.layers.Dense(32, activation="relu")(latent)
    y = tf.keras.layers.Dense(64, activation="relu")(y)
    out = tf.keras.layers.Dense(input_dim, activation="linear")(y)
    decoder = tf.keras.Model(latent, out)

    recon = decoder(z)
    loss_out = VAELossLayer()([inp, recon, z_mean, z_logvar])
    vae = tf.keras.Model(inp, loss_out)
    vae.compile(optimizer=tf.keras.optimizers.Adam(1e-3))
    return vae

def load_vae(path, dim):
    try:
        m = tf.keras.models.load_model(
            path,
            custom_objects={"sampling": sampling, "VAELossLayer": VAELossLayer}
        )
        if m.input_shape[1] != dim:
            return None
        print("✅ Loaded VAE from disk")
        return m
    except:
        return None


In [4]:
# Cell 4: Load & Preprocess Dataset

# Upload your CSV as 'linux_memory.csv'
df = pd.read_csv("sampled_data1.csv", low_memory=False)

# Define numeric features and label
features = ['ts','PID','MINFLT','MAJFLT','VSTEXT','VSIZE','RSIZE','VGROW','RGROW','MEM']
df = df.dropna(subset=features + ['type']).reset_index(drop=True)

# Prepare feature matrix and labels
X_df = df[features].astype(float)
y    = df['type'].astype(int).values

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X_df).astype(np.float32)

print("Total samples:", X.shape[0], "Features:", X.shape[1])


Total samples: 1000 Features: 10


In [5]:
# Cell 5: Train or Load VAE on Normal Data

normal_mask = (y == 0)
X_norm = X[normal_mask]

vae = load_vae("vae_model.keras", X.shape[1])
if vae is None:
    vae = build_vae(X.shape[1])
    vae.fit(
        X_norm, X_norm,
        epochs=20, batch_size=32,
        validation_split=0.1,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
            tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2)
        ],
        verbose=1
    )
    vae.save("vae_model.keras")
    print("✅ Trained & saved VAE")


✅ Loaded VAE from disk


In [6]:
# Cell 6: VAE Reconstruction, Threshold & Flagging

# Reconstruct inputs and compute mean-squared errors
recon = vae.predict(X)
errs  = np.mean((X - recon)**2, axis=1)

# Threshold at 95th percentile of normal errors
thr = np.percentile(errs[normal_mask], 95)
flags_vae = (errs > thr).astype(int)

print(f"Threshold = {thr:.4f}, VAE flagged {flags_vae.sum()} candidates")


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step
Threshold = 0.5816, VAE flagged 53 candidates


In [7]:
# Cell 7: Prepare Text Corpus & FAISS Indices

def to_text(vals):
    d = dict(zip(features, vals))
    return (
        f"Time {int(d['ts'])}: PID {int(d['PID'])}, "
        f"{int(d['MINFLT'])} minor faults, {int(d['MAJFLT'])} major faults, "
        f"{d['MEM']*100:.1f}% memory."
    )

# Build list of textified records
texts = [to_text(row) for row in X_df.values]

# Embed all records
embed_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embs = embed_model.encode(texts, batch_size=64, show_progress_bar=True).astype('float32')

# Build FAISS indices: one for normal, one for anomalies
dim = embs.shape[1]
idx_norm = faiss.IndexFlatL2(dim)
idx_anom = faiss.IndexFlatL2(dim)

norm_idxs = np.where(y == 0)[0]
anom_idxs = np.where(y == 1)[0]

idx_norm.add(embs[norm_idxs])
idx_anom.add(embs[anom_idxs])

print(f"FAISS: indexed {len(norm_idxs)} normals and {len(anom_idxs)} anomalies")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

FAISS: indexed 868 normals and 132 anomalies


In [8]:
from transformers import pipeline

# Cell 8: Retrieval-Augmented LLM Classification with GPT-Neo (fixed max_new_tokens)
device = 0 if torch.cuda.is_available() else -1

rag_llm = pipeline(
    "text-generation",
    model="EleutherAI/gpt-neo-2.7B",
    tokenizer="EleutherAI/gpt-neo-2.7B",
    device=device
)

k = 5
candidates = np.where(flags_vae == 1)[0]
final_flags = np.zeros(len(y), dtype=int)
rationales   = [""] * len(y)

for idx_i, i in enumerate(candidates):
    # 1) Retrieve neighbors
    q = embs[i].reshape(1, -1)
    _, nn = idx_norm.search(q, k)
    _, na = idx_anom.search(q, k)
    normals   = [texts[norm_idxs[j]]   for j in nn[0]]
    anomalies = [texts[anom_idxs[j]] for j in na[0]]

    # 2) Build GPT-Neo-friendly prompt
    prompt = (
        "Here are 5 NORMAL logs:\n" +
        "\n".join(normals) +
        "\n\nHere are 5 ANOMALY logs:\n" +
        "\n".join(anomalies) +
        f"\n\nSuspect log:\n{texts[i]}\n\n"
        "Classify with exactly 'Anomaly' or 'Normal'."
    )

    # 3) Generate with max_new_tokens and truncation
    out = rag_llm(
        prompt,
        max_new_tokens=20,    # generate up to 20 new tokens
        truncation=True,      # truncate the prompt if it exceeds model max length
        do_sample=False,
        pad_token_id=rag_llm.tokenizer.eos_token_id
    )[0]["generated_text"].strip()

    is_anom = out.lower().startswith("anomaly")
    final_flags[i] = int(is_anom)
    rationales[i] = out

    if idx_i % 50 == 0 or idx_i == len(candidates) - 1:
        print(f"RAG classified {idx_i+1}/{len(candidates)} candidates")

print(f"RAG confirmed {final_flags[candidates].sum()}/{len(candidates)} VAE candidates")


config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Device set to use cuda:0


RAG classified 1/53 candidates


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


RAG classified 51/53 candidates
RAG classified 53/53 candidates
RAG confirmed 0/53 VAE candidates


In [9]:
# Cell 9: Evaluate Full Pipeline

# All unflagged = Normal, flagged with final_flags==1 = Anomaly
y_pred = final_flags

print("Overall accuracy:", accuracy_score(y, y_pred))
print("Confusion matrix:\n", confusion_matrix(y, y_pred))
print("Classification report:\n", classification_report(y, y_pred))


Overall accuracy: 0.868
Confusion matrix:
 [[868   0]
 [132   0]]
Classification report:
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       868
           1       0.00      0.00      0.00       132

    accuracy                           0.87      1000
   macro avg       0.43      0.50      0.46      1000
weighted avg       0.75      0.87      0.81      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
