# In [None]:
import glob
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

import matplotlib.pyplot as plt

# --------------------------------------------------
# 1) Load Data & Preprocessing
# --------------------------------------------------
# The provided "training" and "testing" parquet files are pooled into a single
# frame and re-split below with a stratified 80/20 split.
train_files = glob.glob("../data/*training.parquet")
test_files  = glob.glob("../data/*testing.parquet")

parquet_files = train_files + test_files
if not parquet_files:
    # Without this guard pd.concat fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        "No '*training.parquet' or '*testing.parquet' files found in ../data/"
    )

df_list = [pd.read_parquet(f) for f in parquet_files]
df_all = pd.concat(df_list, ignore_index=True)

# Normalize label spelling so case/whitespace variants map to one class.
df_all['Label'] = df_all['Label'].astype(str).str.upper().str.strip()

# Clean BEFORE splitting:
#  * dropping duplicates on the pooled frame prevents the same row from ending
#    up in both the train and the test split (duplicate leakage);
#  * it avoids in-place mutation of the frames returned by train_test_split.
# Infinite values are treated as missing because MinMaxScaler rejects them;
# this is a no-op when the data contains none.
df_all = df_all.replace([np.inf, -np.inf], np.nan)
df_all = df_all.dropna().drop_duplicates().reset_index(drop=True)

df_train, df_test = train_test_split(
    df_all, test_size=0.2,
    stratify=df_all['Label'], random_state=42
)

print("Train size:", df_train.shape)
print("Test size: ", df_test.shape)

# Separate Features & Label
def select_features(df):
    """Return ``(X, y)`` for a data frame.

    ``X`` is ``df`` restricted to its numeric columns, minus the identifier
    columns listed below; ``y`` is the ``'Label'`` column.
    """
    # Adjust if necessary
    excluded = {'Source IP', 'Destination IP', 'Timestamp'}
    numeric_frame = df.select_dtypes(include=[np.number])
    kept = [name for name in numeric_frame.columns if name not in excluded]
    return df[kept], df['Label']

# Feature/label extraction for both splits.
(X_train_raw, y_train_raw), (X_test_raw, y_test_raw) = (
    select_features(df_train),
    select_features(df_test),
)

# LabelEncoder: classes are learned from the training labels only, then the
# same mapping is applied to the test labels.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train_raw)
y_test_enc = le.transform(y_test_raw)
print("Classes:", le.classes_)

# MinMaxScaler: min/max are fitted on the training features only and reused
# for the test features.
scaler = MinMaxScaler().fit(X_train_raw)
X_train_scaled, X_test_scaled = [
    scaler.transform(part) for part in (X_train_raw, X_test_raw)
]

# --------------------------------------------------
# 2) Split Train into (Train part + Validation part)
# --------------------------------------------------
# Stratified 80/20 split of the already-scaled training data; the validation
# part is used for early stopping and for choosing the AE threshold.
split_parts = train_test_split(
    X_train_scaled,
    y_train_enc,
    test_size=0.2,
    random_state=999,
    stratify=y_train_enc,
)
X_train_part, X_val_part, y_train_part, y_val_part = split_parts

print("Train part size:", X_train_part.shape)
print("Val part size:", X_val_part.shape)

# --------------------------------------------------
# 3) Autoencoder (AE) Data: 'BENIGN' Only (Train part)
# --------------------------------------------------
# Locate the encoded id of the BENIGN class; fail with a readable message
# instead of a bare IndexError when the label is absent.
benign_matches = np.flatnonzero(le.classes_ == "BENIGN")
if benign_matches.size == 0:
    raise ValueError(
        f"'BENIGN' class not found among labels: {list(le.classes_)}"
    )
benign_idx = benign_matches[0]

# The AE is trained on benign traffic only.
benign_mask_train = (y_train_part == benign_idx)
X_train_benign = X_train_part[benign_mask_train]

# For monitoring AE performance during validation,
# extract BENIGN samples from Validation -> AE Validation
benign_mask_val = (y_val_part == benign_idx)
X_val_benign = X_val_part[benign_mask_val]

# An empty subset would otherwise only surface later as a ZeroDivisionError
# when the AE training loop averages its losses.
if len(X_train_benign) == 0 or len(X_val_benign) == 0:
    raise ValueError(
        "No BENIGN samples in the train or validation part; "
        "cannot train/validate the autoencoder"
    )

# --------------------------------------------------
# PyTorch Dataset Classes
# --------------------------------------------------
class AEDataset(Dataset):
    """Unlabelled dataset for autoencoder training.

    Each item is a single float32 feature row; the training loop uses the
    row as both input and reconstruction target.
    """

    def __init__(self, data):
        super().__init__()
        # torch.tensor copies `data` into a new float32 tensor.
        self.data = torch.tensor(data, dtype=torch.float32)

    def __len__(self):
        # Number of rows along the first dimension.
        return self.data.shape[0]

    def __getitem__(self, idx):
        sample = self.data[idx]
        return sample

class ClassifierDataset(Dataset):
    """Labelled dataset yielding ``(features, label)`` pairs.

    Features are stored as float32 and labels as int64 (``torch.long``), the
    dtypes expected by ``nn.CrossEntropyLoss``.

    Raises:
        ValueError: if ``data`` and ``labels`` do not have the same number of
            rows.
    """

    def __init__(self, data, labels):
        self.data   = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)
        # Catch misaligned inputs up front: otherwise extra labels are
        # silently ignored and missing labels only fail at fetch time.
        if len(self.data) != len(self.labels):
            raise ValueError(
                f"data has {len(self.data)} rows but labels has "
                f"{len(self.labels)} entries"
            )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# --------------------------------------------------
# 4) Model Implementation
# --------------------------------------------------
# (4-1) Autoencoder (AE)
class Autoencoder(nn.Module):
    """Symmetric fully-connected autoencoder.

    Encoder: input_dim -> hidden_dim -> latent_dim (ReLU after each layer).
    Decoder: latent_dim -> hidden_dim -> input_dim (final layer is linear,
    no activation).

    Args:
        input_dim: number of input features.
        latent_dim: size of the bottleneck representation.
        hidden_dim: width of the single hidden layer on each side
            (defaults to the previously hard-coded 32).
        dropout: dropout probability applied after each hidden layer
            (defaults to the previously hard-coded 0.2).
    """

    def __init__(self, input_dim, latent_dim=8, hidden_dim=32, dropout=0.2):
        super().__init__()
        # Layer order (and therefore state_dict keys and parameter
        # initialization order) is the same as with the fixed-size version.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (same shape as ``x``)."""
        z = self.encoder(x)
        recon = self.decoder(z)
        return recon

# (4-2) Classifier: Multi-layer Perceptron Model
class Classifier(nn.Module):
    """MLP classifier: input_dim -> 128 -> 64 -> num_classes.

    Each hidden layer is followed by ReLU and dropout (p=0.3). The output is
    raw logits (no softmax).
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        stack = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        logits = self.model(x)
        return logits

# --------------------------------------------------
# 5) EarlyStopping Implementation (for Validation Loss Monitoring)
# --------------------------------------------------
class EarlyStopping:
    """Track a validation loss and flag when it has stopped improving.

    Call the instance once per epoch with the current validation loss. A
    loss counts as an improvement only if it beats the best loss so far by
    more than ``min_delta``. After ``patience`` consecutive calls without an
    improvement, ``early_stop`` becomes True (and is never reset).
    """

    def __init__(self, patience=3, min_delta=1e-4):
        self.patience, self.min_delta = patience, min_delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        # First observation just establishes the baseline.
        if self.best_loss is None:
            self.best_loss = val_loss
            return

        gain = self.best_loss - val_loss
        if gain > self.min_delta:
            self.best_loss, self.counter = val_loss, 0
            return

        # No sufficient improvement this epoch.
        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

# --------------------------------------------------
# 6) Training Functions
# --------------------------------------------------
def train_autoencoder(model, train_loader, val_loader, 
                      num_epochs=20, lr=1e-3, device="cpu"):
    """Train an autoencoder with MSE reconstruction loss and early stopping.

    Each batch from the loaders is a plain feature tensor that serves as
    both input and target. After every epoch the sample-weighted mean loss
    is computed on ``val_loader``; training stops once that loss has not
    improved by more than 1e-5 for 3 consecutive epochs.

    Returns:
        The same ``model`` instance, moved to ``device``, with the weights
        of the last epoch that ran.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    criterion = nn.MSELoss()
    stopper   = EarlyStopping(patience=3, min_delta=1e-5)

    def _train_pass():
        # One optimization pass over train_loader; returns the summed loss.
        model.train()
        running = 0
        for features in train_loader:
            features = features.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(features), features)
            batch_loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch average is per-sample.
            running += batch_loss.item() * features.size(0)
        return running

    def _eval_pass():
        # Gradient-free pass over val_loader; returns the summed loss.
        model.eval()
        running = 0
        with torch.no_grad():
            for features in val_loader:
                features = features.to(device)
                batch_loss = criterion(model(features), features)
                running += batch_loss.item() * features.size(0)
        return running

    model.to(device)
    for epoch in range(num_epochs):
        train_loss = _train_pass() / len(train_loader.dataset)

        # Measure Validation Loss (using benign validation data)
        val_loss = _eval_pass() / len(val_loader.dataset)

        print(f"[AE] Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")

        # Check EarlyStopping
        stopper(val_loss)
        if stopper.early_stop:
            print("Early stopping triggered!")
            break
    return model

def train_classifier(model, train_loader, val_data=None, 
                     num_epochs=20, lr=5e-4, device="cpu"):
    """Train a classifier with cross-entropy loss.

    Args:
        model: module mapping a feature batch to class logits.
        train_loader: loader yielding ``(features, labels)`` batches.
        val_data: optional ``(X_val, y_val)`` pair. When given, the
            validation loss is computed after every epoch (on the whole set
            in a single forward pass) and drives early stopping
            (patience=3, min_delta=1e-4). When ``None``, validation and
            early stopping are skipped and all ``num_epochs`` epochs run.
        num_epochs: maximum number of epochs.
        lr: Adam learning rate.
        device: device to train on.

    Returns:
        The same ``model`` instance, moved to ``device`` and left in eval
        mode.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Validation is genuinely optional, as the original docstring promised;
    # previously passing None crashed on tuple unpacking.
    has_val = val_data is not None
    if has_val:
        X_val, y_val = val_data
        X_val_t = torch.tensor(X_val, dtype=torch.float32).to(device)
        y_val_t = torch.tensor(y_val, dtype=torch.long).to(device)
        stopper = EarlyStopping(patience=3, min_delta=1e-4)

    model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            optimizer.zero_grad()
            out = model(batch_x)
            loss = criterion(out, batch_y)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch average is per-sample.
            total_loss += loss.item() * batch_x.size(0)
        avg_loss = total_loss / len(train_loader.dataset)

        if not has_val:
            print(f"[Classifier] Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.6f}")
            continue

        # Monitor validation loss
        model.eval()
        with torch.no_grad():
            val_out = model(X_val_t)
            val_loss = criterion(val_out, y_val_t).item()  # Evaluate on the entire validation set
        print(f"[Classifier] Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_loss:.6f}, Val Loss: {val_loss:.6f}")

        stopper(val_loss)
        if stopper.early_stop:
            print("Early stopping triggered!")
            break

    # Leave the model in eval mode on every path (the validated path already
    # ended each epoch in eval mode).
    model.eval()
    return model

# --------------------------------------------------
# 7) Prepare for AE Training
# --------------------------------------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_train_part.shape[1]

# Benign-only datasets: training rows are shuffled each epoch, validation
# rows are read in fixed order.
ae_train_dataset, ae_val_dataset = AEDataset(X_train_benign), AEDataset(X_val_benign)
ae_train_loader = DataLoader(ae_train_dataset, shuffle=True, batch_size=64)
ae_val_loader = DataLoader(ae_val_dataset, shuffle=False, batch_size=64)

autoencoder = Autoencoder(input_dim, latent_dim=8)

print("\n=== [Phase 1] Training Autoencoder (Benign + Validation Monitoring) ===")
autoencoder = train_autoencoder(
    autoencoder,
    ae_train_loader,
    ae_val_loader,
    num_epochs=20,
    lr=1e-3,
    device=device,
)

# --------------------------------------------------
# 8) Find AE Threshold on Validation Data
# --------------------------------------------------
# Per-sample reconstruction error (mean squared error over features) for the
# whole validation part, benign and attack rows alike.
X_val_t = torch.tensor(X_val_part, dtype=torch.float32).to(device)
autoencoder.eval()
with torch.no_grad():
    recon_val = autoencoder(X_val_t)
    val_errors = torch.mean((X_val_t - recon_val)**2, dim=1).cpu().numpy()

# Binary true labels for attack/benign (1 = attack, 0 = benign)
y_val_bin = (y_val_part != benign_idx).astype(int)

# Threshold candidates, family 1: percentiles of the validation errors
cand_percentiles = np.linspace(0, 100, 21)  # 21 values between 0 and 100 (5% intervals)

# Threshold candidates, family 2: mean + k * std of the validation errors.
# These were previously computed but never evaluated.
mean_val = val_errors.mean()
std_val  = val_errors.std()
cand_stats = [mean_val + k * std_val for k in (1.0, 2.0, 3.0)]

# Percentile candidates come first, so a statistical candidate is selected
# only when it is strictly better than every percentile candidate.
candidates = [np.percentile(val_errors, p) for p in cand_percentiles] + cand_stats

best_f1 = -1
best_threshold = None

for thres in candidates:
    y_pred_bin = (val_errors > thres).astype(int)
    # zero_division=0: a threshold that flags nothing scores 0 without
    # emitting an UndefinedMetricWarning (e.g. the 100th percentile).
    f1 = f1_score(y_val_bin, y_pred_bin, average='binary', zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = thres

# Final best threshold
print(f"[Val] best threshold = {best_threshold:.6f}, F1 = {best_f1:.4f}")

# --------------------------------------------------
# 9) Train Classifier (for All Classes) & Handle Imbalance
# --------------------------------------------------
classifier_dataset = ClassifierDataset(X_train_part, y_train_part)

# Count per class
class_counts = np.bincount(y_train_part)
print("Class counts:", class_counts)

# Inverse-frequency weights: every sample of a class gets 1 / class size, so
# sampling with replacement draws the classes at roughly equal rates. The
# small epsilon avoids division by zero for ids with no samples.
class_weights = np.divide(1.0, class_counts + 1e-6)
sample_weights = np.take(class_weights, y_train_part)

sampler = WeightedRandomSampler(
    sample_weights,
    len(sample_weights),
    replacement=True,
)
classifier_loader = DataLoader(classifier_dataset, sampler=sampler, batch_size=64)

n_classes = len(le.classes_)
classifier = Classifier(input_dim=input_dim, num_classes=n_classes)

print("\n=== [Phase 2] Training Classifier (WeightedRandomSampler + EarlyStopping) ===")
classifier = train_classifier(
    classifier,
    classifier_loader,
    (X_val_part, y_val_part),
    num_epochs=20,
    lr=5e-4,
    device=device,
)

# --------------------------------------------------
# 10) Final Test Evaluation
# --------------------------------------------------
# Two-stage prediction: the autoencoder gates each test row by reconstruction
# error; rows above the threshold are passed to the classifier, all others
# are labelled BENIGN directly.
X_test_t = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

autoencoder.eval()
classifier.eval()

with torch.no_grad():
    recon_test = autoencoder(X_test_t)
    test_errors = torch.mean((X_test_t - recon_test)**2, dim=1).cpu().numpy()

anomaly_mask = test_errors > best_threshold
normal_indices = np.flatnonzero(~anomaly_mask)
attack_indices = np.flatnonzero(anomaly_mask)

# Start from "everything is BENIGN", then overwrite the flagged rows with the
# classifier's decision.
final_preds = np.full(len(X_test_scaled), benign_idx, dtype=int)

if attack_indices.size > 0:
    flagged_batch = torch.tensor(
        X_test_scaled[attack_indices], dtype=torch.float32
    ).to(device)
    with torch.no_grad():
        preds_attack = classifier(flagged_batch).argmax(dim=1).cpu().numpy()
    final_preds[attack_indices] = preds_attack

class_ids = range(len(le.classes_))
cm = confusion_matrix(y_test_enc, final_preds, labels=class_ids)
print("\n=== Final Test Results ===")
print("Confusion Matrix:\n", cm)

print("\nClassification Report:")
report = classification_report(
    y_test_enc,
    final_preds,
    labels=class_ids,
    target_names=le.classes_,
)
print(report)

def _plot_confusion_matrix(matrix, class_names, title, cell_text, white_above):
    """Draw one annotated confusion-matrix heatmap on a new figure.

    Args:
        matrix: 2-D array, rows = actual class, columns = predicted class.
        class_names: tick labels for both axes.
        title: figure title.
        cell_text: callable mapping a cell value to its annotation string.
        white_above: cells with a value above this are annotated in white,
            all others in black.
    """
    plt.figure(figsize=(10,8))
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")

    for i, j in np.ndindex(matrix.shape):
        plt.text(j, i, cell_text(matrix[i, j]),
                 horizontalalignment="center",
                 color="white" if matrix[i, j] > white_above else "black")

    # Applied per figure; previously only the second figure got a layout
    # pass because the first call was commented out.
    plt.tight_layout()


# Raw-count confusion matrix
thresh_val = cm.max() / 2.
_plot_confusion_matrix(
    cm, le.classes_,
    "Confusion Matrix (Improved AE + WeightedSampler + More epochs)",
    lambda v: format(v, 'd'),
    thresh_val,
)

# (1) Normalize each row (row sum -> 1.0).
# Rows with no samples (a class absent from the test split) stay all-zero
# instead of becoming NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm.astype(float), row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# (2) Check normalized values (rounded to 3 decimals)
print("\nNormalized Confusion Matrix (row-wise):")
print(np.round(cm_norm, 3))

# (3) Visualization (Normalized Matrix), (4) with percentages in each cell
_plot_confusion_matrix(
    cm_norm, le.classes_,
    "Normalized Confusion Matrix (Row-wise %)",
    lambda v: f"{v*100:.1f}%",
    0.5,
)

# Both figures are displayed together.
plt.show()
