In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from utils.data_preprocessing import load_data, preprocess_data
import os
import numpy as np

save_data = "save_data_2017/"

if not os.path.exists(save_data):
    os.mkdir(save_data)
    
def load_data(file_path):
    return pd.read_csv(file_path, encoding='cp1252', engine='python')
cicids2017_path = 'data/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv'

if(not os.path.isfile(save_data + "web_attacks.csv")):
    data_2017 = load_data(cicids2017_path)
    preprocess_data(data_2017, save_data)

In [3]:
df = pd.read_csv(save_data + 'web_attacks.csv')
df['Label'] = df['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)


normal_df = df[df['Label'] == 0]
attack_df = df[df['Label'] != 0]

num_attack = len(attack_df)

num_normal = 2 * num_attack

In [4]:
normal_df = normal_df.sample(n=num_normal, random_state=42)

# Combine and shuffle the dataset
balanced_df = pd.concat([normal_df, attack_df]).sample(frac=1, random_state=42).reset_index(drop=True)

# Split the balanced dataset into training (60%), validation (20%), and testing sets (20%)
train_df, temp_df = train_test_split(balanced_df, test_size=0.4, random_state=42, stratify=balanced_df['Label'])
val_df, test_df = train_test_split(temp_df, test_size=0.7, random_state=42, stratify=temp_df['Label'])

# Within the training set, ensure normal instances are twice the number of attack instances
train_normal_df = train_df[train_df['Label'] == 0]
train_attack_df = train_df[train_df['Label'] == 1]
num_train_attack = len(train_attack_df)
num_train_normal = 2 * num_train_attack
train_normal_df = train_normal_df.sample(n=num_train_normal, random_state=42)
train_df = pd.concat([train_normal_df, train_attack_df]).sample(frac=1, random_state=42).reset_index(drop=True)

In [5]:
# Print the count of each class in the training, validation, and testing sets
for name, df in zip(["Training", "Validation", "Testing"], [train_df, val_df, test_df]):
    unique, counts = np.unique(df['Label'], return_counts=True)
    print(f"{name} set class distribution:", dict(zip(unique, counts)))

Training set class distribution: {0: 2616, 1: 1308}
Validation set class distribution: {0: 523, 1: 261}
Testing set class distribution: {0: 1221, 1: 611}


In [6]:
# Save the train, validation, and test sets to CSV files
train_df.to_csv(save_data +  'train_set.csv', index=False)
val_df.to_csv(save_data +  'val_set.csv', index=False)
test_df.to_csv(save_data + 'test_set.csv', index=False)

In [7]:
# Define feature columns (excluding the label column)
feature_cols = [col for col in df.columns if col != 'Label']

# Extract features and labels for training, validation, and testing sets
X_train = train_df[feature_cols]
y_train = train_df['Label']
X_val = val_df[feature_cols]
y_val = val_df['Label']
X_test = test_df[feature_cols]
y_test = test_df['Label']

In [8]:
# Define classifiers
from models.decision_tree import train_decision_tree
from models.random_forest import train_random_forest
from models.lda import train_lda
from models.naive_bayes import train_naive_bayes
from models.logistic_regression import train_logistic_regression
from models.knn import train_knn
from models.svm import train_svm
from models.extra_trees import train_extra_trees
from models.bagging import train_bagging
from models.mlp import train_mlp

classifiers = {
    'Decision Tree': train_decision_tree,
    'Random Forest': train_random_forest,
    'Linear Discriminant Analysis': train_lda,
    'Naive Bayes': train_naive_bayes,
    'Logistic Regression': train_logistic_regression,
    'K-Nearest Neighbors': train_knn,
    'Support Vector Machine': train_svm,
    'Extra Trees Classifier': train_extra_trees,
    'Bagging Classifier': train_bagging,
    'Multi-layer Perceptron': train_mlp
}

In [8]:
def model_10Classifier():
    results = []
    for name, train_function in classifiers.items():
        model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1, time = train_function(X_train, y_train, X_val, y_val, X_test, y_test, save_data)
        results.append({
            'Model': name,
            'Validation Accuracy': val_acc,
            'Validation Precision': val_pre,
            'Validation Recall': val_rec,
            'Validation F1 Score': val_f1,
            "Test Accuracy": test_acc,
            "Test Precision": test_pre,
            "Test Recall": test_rec,
            "Test F1 Score": test_f1,
            'Time': time
        })

    # Save results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv(save_data + '10Classsifer_results_2017.csv', index=False)
    
model_10Classifier()

Decision Tree Validation - Accuracy: 0.9744897959183674 Precision: 0.9479553903345725 Recall: 0.9770114942528736 F1 Score: 0.9622641509433962
Decision Tree Test - Accuracy: 0.9770742358078602 Precision: 0.9686985172981878 Recall: 0.9623567921440261 F1 Score: 0.9655172413793104
Testing time: 0.0080 seconds
Random Forest Validation - Accuracy: 0.9744897959183674 Precision: 0.9547169811320755 Recall: 0.9693486590038314 F1 Score: 0.9619771863117871
Random Forest Test - Accuracy: 0.9776200873362445 Precision: 0.9702970297029703 Recall: 0.9623567921440261 F1 Score: 0.9663105998356615
Testing time: 0.0130 seconds
LDA Validation - Accuracy: 0.9553571428571429 Precision: 0.937984496124031 Recall: 0.9272030651340997 F1 Score: 0.9325626204238922
LDA Test - Accuracy: 0.9530567685589519 Precision: 0.9581151832460733 Recall: 0.8985270049099836 F1 Score: 0.9273648648648649
Testing time: 0.0060 seconds
Navi_bayes Validation - Accuracy: 0.8048469387755102 Precision: 0.6304347826086957 Recall: 1.0 F1 Sc

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic_regression Validation - Accuracy: 0.9706632653061225 Precision: 0.976 Recall: 0.9348659003831418 F1 Score: 0.9549902152641878
Logistic_regression Test - Accuracy: 0.9617903930131004 Precision: 0.9838998211091234 Recall: 0.900163666121113 F1 Score: 0.9401709401709402
Testing time: 0.0050 seconds
KNN Validation - Accuracy: 0.9604591836734694 Precision: 0.9166666666666666 Recall: 0.9693486590038314 F1 Score: 0.9422718808193669
KNN Test - Accuracy: 0.9743449781659389 Precision: 0.9490445859872612 Recall: 0.9754500818330606 F1 Score: 0.9620661824051655
Testing time: 0.0957 seconds
SVM Validation - Accuracy: 0.9362244897959183 Precision: 0.8836363636363637 Recall: 0.9310344827586207 F1 Score: 0.9067164179104478
SVM Test - Accuracy: 0.9470524017467249 Precision: 0.9431034482758621 Recall: 0.8952536824877251 F1 Score: 0.9185558354324097
Testing time: 0.0831 seconds
Extra_trees Validation - Accuracy: 0.9795918367346939 Precision: 0.9622641509433962 Recall: 0.9770114942528736 F1 Score: 

In [9]:
import pandas as pd
from sklearn.decomposition import PCA

# Assume the classifiers dictionary and train functions are already defined
# Define a function to apply PCA and train classifiers

def train_with_pca(X_train, y_train, X_val, y_val, X_test, y_test, n_components=20):
    X_combined = np.concatenate((X_train, X_val, X_test), axis=0)
    y_combined = np.concatenate((y_train, y_val, y_test), axis=0)

    pca = PCA(n_components=n_components)
    X_combined_pca = pca.fit_transform(X_combined)

    # Bước 3: Tách lại dữ liệu thành các tập train, validation và test
    X_train_pca = X_combined_pca[:len(X_train)]
    X_val_pca = X_combined_pca[len(X_train):len(X_train)+len(X_val)]
    X_test_pca = X_combined_pca[len(X_train)+len(X_val):]

    # Kiểm tra kích thước của các tập dữ liệu sau khi PCA
    print("X_train_pca shape:", X_train_pca.shape)
    print("X_val_pca shape:", X_val_pca.shape)
    print("X_test_pca shape:", X_test_pca.shape)
    
    results = []
    for name, train_function in classifiers.items():
        model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1, time = train_function(X_train_pca, y_train, X_val_pca, y_val, X_test_pca, y_test, save_data)
        results.append({
            'Model': name,
            'Validation Accuracy': val_acc,
            'Validation Precision': val_pre,
            'Validation Recall': val_rec,
            'Validation F1 Score': val_f1,
            'Test Accuracy': test_acc,
            'Test Precision': test_pre,
            'Test Recall': test_rec,
            'Test F1 Score': test_f1,
            'Time': time
        })
    
    return results

# Call the function and save results
pca_results = train_with_pca(X_train, y_train, X_val, y_val, X_test, y_test)
pca_results_df = pd.DataFrame(pca_results)
pca_results_df.to_csv(save_data + 'pca_model_results_2017.csv', index=False)


X_train_pca shape: (3924, 20)
X_val_pca shape: (784, 20)
X_test_pca shape: (1832, 20)
Decision Tree Validation - Accuracy: 0.9681122448979592 Precision: 0.937037037037037 Recall: 0.9693486590038314 F1 Score: 0.9529190207156308
Decision Tree Test - Accuracy: 0.9776200873362445 Precision: 0.9718543046357616 Recall: 0.9607201309328969 F1 Score: 0.9662551440329218
Testing time: 0.0040 seconds
Random Forest Validation - Accuracy: 0.9744897959183674 Precision: 0.9547169811320755 Recall: 0.9693486590038314 F1 Score: 0.9619771863117871
Random Forest Test - Accuracy: 0.9776200873362445 Precision: 0.9702970297029703 Recall: 0.9623567921440261 F1 Score: 0.9663105998356615
Testing time: 0.0121 seconds
LDA Validation - Accuracy: 0.9311224489795918 Precision: 0.9330543933054394 Recall: 0.8544061302681992 F1 Score: 0.892
LDA Test - Accuracy: 0.9263100436681223 Precision: 0.9630350194552529 Recall: 0.8101472995090017 F1 Score: 0.88
Testing time: 0.0050 seconds
Navi_bayes Validation - Accuracy: 0.62244

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic_regression Validation - Accuracy: 0.9362244897959183 Precision: 0.8921933085501859 Recall: 0.9195402298850575 F1 Score: 0.9056603773584906
Logistic_regression Test - Accuracy: 0.9432314410480349 Precision: 0.9408695652173913 Recall: 0.8854337152209493 F1 Score: 0.9123102866779089
Testing time: 0.0050 seconds
KNN Validation - Accuracy: 0.9553571428571429 Precision: 0.9094202898550725 Recall: 0.9616858237547893 F1 Score: 0.9348230912476723
KNN Test - Accuracy: 0.9737991266375546 Precision: 0.9504 Recall: 0.972176759410802 F1 Score: 0.9611650485436893
Testing time: 0.0240 seconds
SVM Validation - Accuracy: 0.9362244897959183 Precision: 0.8836363636363637 Recall: 0.9310344827586207 F1 Score: 0.9067164179104478
SVM Test - Accuracy: 0.9475982532751092 Precision: 0.9447322970639033 Recall: 0.8952536824877251 F1 Score: 0.9193277310924369
Testing time: 0.0871 seconds
Extra_trees Validation - Accuracy: 0.9770408163265306 Precision: 0.9619771863117871 Recall: 0.9693486590038314 F1 Score:

In [10]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Định nghĩa Autoencoder
# Định nghĩa Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        encoding_dim1 = 52
        encoding_dim2 = 26
        encoding_dim3 = int(np.sqrt(input_dim)) + 1

        # Lớp mã hóa
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim1),
            nn.BatchNorm1d(encoding_dim1),  # Thêm BatchNorm1d
            nn.ReLU(),
            nn.Linear(encoding_dim1, encoding_dim2),
            nn.BatchNorm1d(encoding_dim2),  # Thêm BatchNorm1d
            nn.ReLU(),
            nn.Linear(encoding_dim2, encoding_dim3),
            nn.BatchNorm1d(encoding_dim3),  # Thêm BatchNorm1d
            nn.ReLU()
        )
        # Lớp giải mã
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim3, encoding_dim2),
            nn.BatchNorm1d(encoding_dim2),  # Thêm BatchNorm1d
            nn.ReLU(),
            nn.Linear(encoding_dim2, encoding_dim1),
            nn.BatchNorm1d(encoding_dim1),  # Thêm BatchNorm1d
            nn.ReLU(),
            nn.Linear(encoding_dim1, input_dim),
            nn.Sigmoid()  # Giữ sigmoid cho đầu ra
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded, encoded

# Hàm để áp dụng deep autoencoder và train classifiers
def train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test, epochs=10, batch_size=256, alpha1=0.1):
    input_dim = X_train.shape[1]
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Khởi tạo mô hình và các thành phần cần thiết
    autoencoder = Autoencoder(input_dim).to(device)
    optimizer = optim.AdamW(autoencoder.parameters(), lr=0.0001)
    criterion = nn.MSELoss()

    X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val.to_numpy(), dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.float32)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Training loop
    for epoch in range(epochs):
        autoencoder.train()
        train_loss = 0.0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            decode, encode = autoencoder(inputs)
            loss = alpha1 * criterion(decode, inputs)  # Chỉ sử dụng MSE loss
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(autoencoder.parameters(), max_norm=1.0)  # Gradient Clipping
            optimizer.step()
            train_loss += loss.item()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {train_loss/len(train_loader)}')

    autoencoder.eval()
    with torch.no_grad():
        X_train_enc = autoencoder.encoder(X_train_tensor.to(device)).cpu().numpy()
        X_val_enc = autoencoder.encoder(X_val_tensor.to(device)).cpu().numpy()
        X_test_enc = autoencoder.encoder(torch.tensor(X_test.to_numpy(), dtype=torch.float32).to(device)).cpu().numpy()
    
    # Train và đánh giá classifiers
    results = []
    for name, train_function in classifiers.items():
        model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1, time = train_function(X_train_enc, y_train, X_val_enc, y_val, X_test_enc, y_test, save_data)
        results.append({
            'Model': name,
            'Validation Accuracy': val_acc,
            'Validation Precision': val_pre,
            'Validation Recall': val_rec,
            'Validation F1 Score': val_f1,
            'Test Accuracy': test_acc,
            'Test Precision': test_pre,
            'Test Recall': test_rec,
            'Test F1 Score': test_f1,
            'Time': time
        })
    
    return results

# Gọi hàm và lưu kết quả
autoencoder_results = train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test)
autoencoder_results_df = pd.DataFrame(autoencoder_results)
autoencoder_results_df.to_csv(save_data + 'autoencoder_model_results.csv', index=False)


Epoch [1/10], Loss: 4892750905344.0
Epoch [2/10], Loss: 4931642621952.0
Epoch [3/10], Loss: 4792329936896.0
Epoch [4/10], Loss: 5149251444736.0
Epoch [5/10], Loss: 5001683779584.0
Epoch [6/10], Loss: 4854211543040.0
Epoch [7/10], Loss: 4848670015488.0
Epoch [8/10], Loss: 5220874027008.0
Epoch [9/10], Loss: 4788360822784.0
Epoch [10/10], Loss: 4934459621376.0
Decision Tree Validation - Accuracy: 0.9681122448979592 Precision: 0.937037037037037 Recall: 0.9693486590038314 F1 Score: 0.9529190207156308
Decision Tree Test - Accuracy: 0.9732532751091703 Precision: 0.9652317880794702 Recall: 0.9541734860883797 F1 Score: 0.9596707818930041
Testing time: 0.0080 seconds
Random Forest Validation - Accuracy: 0.9668367346938775 Precision: 0.9304029304029304 Recall: 0.9731800766283525 F1 Score: 0.951310861423221
Random Forest Test - Accuracy: 0.9743449781659389 Precision: 0.9622950819672131 Recall: 0.9607201309328969 F1 Score: 0.9615069615069615
Testing time: 0.0129 seconds
LDA Validation - Accuracy: 

In [10]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import random

# Định nghĩa Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        encoding_dim1 = 52
        encoding_dim2 = 26
        encoding_dim3 = int(np.sqrt(input_dim)) + 1

        # Lớp mã hóa
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim1),
            nn.BatchNorm1d(encoding_dim1),  # Thêm BatchNorm1d

            nn.ReLU(),
            nn.Linear(encoding_dim1, encoding_dim2),
            nn.BatchNorm1d(encoding_dim2),  # Thêm BatchNorm1d
            nn.ReLU(),
            nn.Linear(encoding_dim2, encoding_dim3),
            nn.BatchNorm1d(encoding_dim3),  # Thêm BatchNorm1d
            nn.ReLU()
        )
        # Lớp giải mã
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim3, encoding_dim2),
            nn.BatchNorm1d(encoding_dim2),  # Thêm BatchNorm1d
            nn.ReLU(),
            nn.Linear(encoding_dim2, encoding_dim1),
            nn.BatchNorm1d(encoding_dim1),  # Thêm BatchNorm1d
            nn.ReLU(),
            nn.Linear(encoding_dim1, input_dim),
            nn.Sigmoid()  # Giữ sigmoid cho đầu ra
        )
    
    def _initialize_weights(self):
        for layer in self.encoder:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.zeros_(layer.bias)
        for layer in self.decoder:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded, encoded

# Định nghĩa BinaryMagnetLoss
class BinaryMagnetLoss(nn.Module):
    def __init__(self, D=10, M=2, alpha=7.18, epsilon=1e-6):
        super(BinaryMagnetLoss, self).__init__()
        self.D = D
        self.M = M
        self.alpha = alpha
        self.epsilon = epsilon

    def forward(self, outputs, indices, assignment):
        outputs = outputs.float()
        unique_clusters = torch.unique(assignment[indices])
        batch_clusters = {cluster.item(): (assignment[indices] == cluster).nonzero(as_tuple=True)[0] for cluster in unique_clusters}

        num_instances = sum(len(batch_clusters[cluster.item()]) for cluster in unique_clusters)
        c_means = torch.stack([outputs[batch_clusters[cluster.item()]].mean(dim=0) for cluster in unique_clusters])

        stdev = sum((outputs[i] - c_means[unique_clusters.tolist().index(assignment[indices[i]])]).norm(p=2).pow(2) 
                    for i in range(len(indices))) / max(num_instances - 1, 1)
        stdev = -2.0 * (stdev + self.epsilon)

        if torch.isnan(stdev).any():
            stdev = torch.tensor(1.0, device=outputs.device)  # Gán giá trị hợp lý nếu phát hiện NaN

        loss = torch.zeros(1).to(outputs.device)
        denom = [torch.zeros(1).to(outputs.device) for _ in range(len(indices))]

        for m, cluster in enumerate(unique_clusters):
            cluster_indices = batch_clusters[cluster.item()]
            for d in range(min(self.D, len(cluster_indices))):
                ind = cluster_indices[d].item()
                for other_cluster in unique_clusters:
                    if other_cluster != cluster:
                        denom[ind] += ((outputs[ind] - c_means[unique_clusters.tolist().index(other_cluster)]).norm().pow(2) / stdev).exp()

                loss_term = (((outputs[ind] - c_means[m]).norm().pow(2) / stdev - self.alpha).exp() / (denom[ind] + self.epsilon)).log().clamp(max=0.0)
                if torch.isnan(loss_term).any() or torch.isinf(loss_term).any():
                    continue
                loss -= loss_term

        loss /= num_instances
        return loss

# Custom Loss Function combining Cross-Entropy and Binary Magnet Loss
def custom_loss(y_true, y_pred, embeddings, inputs, alpha1=1.0, alpha2=2.0):
    mse_loss = F.mse_loss(inputs, y_pred) 
    magnet_loss_fn = BinaryMagnetLoss()
    magnet_loss_value = magnet_loss_fn(embeddings, torch.arange(len(y_true)), y_true)
    return alpha1 * mse_loss + alpha2 * magnet_loss_value

# Hàm để áp dụng deep autoencoder và train classifiers
def train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test, classifiers, save_data, epochs=10, batch_size=256, alpha1=1.0, alpha2=2.0):
    input_dim = X_train.shape[1]
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    autoencoder = Autoencoder(input_dim).to(device)
    optimizer = optim.AdamW(autoencoder.parameters(), lr=0.0001)

    X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val.to_numpy(), dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.float32)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    for epoch in range(epochs):
        autoencoder.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            decoded, encoded = autoencoder(inputs)
            loss = custom_loss(labels, decoded, encoded, inputs, alpha1=alpha1, alpha2=alpha2)
            if torch.isnan(loss).any() or torch.isinf(loss).any():
                print(f'Invalid loss detected at epoch {epoch+1}. Skipping this batch.')
                continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(autoencoder.parameters(), max_norm=1.0)  # Gradient Clipping
            optimizer.step()
            train_loss += loss.item()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {train_loss/len(train_loader)}')

    autoencoder.eval()
    with torch.no_grad():
        X_train_enc = autoencoder.encoder(X_train_tensor.to(device)).cpu().numpy()
        X_val_enc = autoencoder.encoder(X_val_tensor.to(device)).cpu().numpy()
        X_test_enc = autoencoder.encoder(torch.tensor(X_test.to_numpy(), dtype=torch.float32).to(device)).cpu().numpy()

    results = []
    try:
        for name, train_function in classifiers.items():
            model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1, time = train_function(X_train_enc, y_train, X_val_enc, y_val, X_test_enc, y_test, save_data)
            results.append({
                'Model': name,
                'Validation Accuracy': val_acc,
                'Validation Precision': val_pre,
                'Validation Recall': val_rec,
                'Validation F1 Score': val_f1,
                'Test Accuracy': test_acc,
                'Test Precision': test_pre,
                'Test Recall': test_rec,
                'Test F1 Score': test_f1,
                'epochs': epochs,
                'alpha1': alpha1,
                'alpha2': alpha2,
                'Time': time
            })
    except Exception as e:
        print(f"Error: {e}")

    return results

def generate_random_choice():
    num1 = random.randint(0, 10)
    num2 = random.randint(0, 10)
    return num1, num2

def generate_random_choice1():
    values = [i/100 for i in range(1, 11)]
    num1 = random.choice(values)
    num2 = random.choice(values)
    return num1, num2

def generate_random_choice2():
    values = [i/10 for i in range(1, 11)]
    num1 = random.choice(values)
    num2 = random.choice(values)
    return num1, num2

def generate_random_choice3():
    values1 = [i/10 for i in range(1, 11)]
    values2 = [i/100 for i in range(1, 11)]
    num1 = random.choice(values1)
    num2 = random.choice(values2)
    return num1, num2

def generate_random_choice4():
    values1 = [i/100 for i in range(1, 11)]
    values2 = [i/10 for i in range(1, 11)]
    num1 = random.choice(values1)
    num2 = random.choice(values2)
    return num1, num2

maxx  = 50
for rd in range(1, maxx):
    if rd < maxx/5:
        alpha1, alpha2 = generate_random_choice()
    elif rd < 2*maxx/5:
        alpha1, alpha2 = generate_random_choice1()
    elif rd < 3*maxx/5:
        alpha1, alpha2 = generate_random_choice2()
    elif rd < 4*maxx/5:
        alpha1, alpha2 = generate_random_choice3()
    else:
        alpha1, alpha2 = generate_random_choice4()

    # Gọi hàm và lưu kết quả
    autoencoder_results = train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test, classifiers, save_data, epochs=10, alpha1=alpha1, alpha2=alpha2)
    autoencoder_results_df = pd.DataFrame(autoencoder_results)

    csv_file = save_data + 'autoencoder_Binary_Magnet_Loss_results_2017.csv'

    with open(csv_file, mode='a+') as file:
        file.write('\n')  # Thêm một dòng trống
    autoencoder_results_df.to_csv(csv_file, mode='a+', header=True, index=False)


Epoch [1/10], Loss: 383705075417088.0
Epoch [2/10], Loss: 400948888338432.0
Epoch [3/10], Loss: 388454260146176.0
Epoch [4/10], Loss: 385589371207680.0
Epoch [5/10], Loss: 400566192701440.0
Epoch [6/10], Loss: 386521366528000.0
Epoch [7/10], Loss: 387149478232064.0
Epoch [8/10], Loss: 393766915538944.0
Epoch [9/10], Loss: 386012384591872.0
Epoch [10/10], Loss: 384830630526976.0
Decision Tree Validation - Accuracy: 0.9642857142857143 Precision: 0.9363295880149812 Recall: 0.9578544061302682 F1 Score: 0.946969696969697
Decision Tree Test - Accuracy: 0.9710698689956332 Precision: 0.9603960396039604 Recall: 0.9525368248772504 F1 Score: 0.9564502875924404
Testing time: 0.0040 seconds
Random Forest Validation - Accuracy: 0.9732142857142857 Precision: 0.9477611940298507 Recall: 0.9731800766283525 F1 Score: 0.9603024574669187
Random Forest Test - Accuracy: 0.9759825327510917 Precision: 0.968595041322314 Recall: 0.9590834697217676 F1 Score: 0.9638157894736842
Testing time: 0.0119 seconds
LDA Val

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch [1/10], Loss: 48881611833344.0
Epoch [2/10], Loss: 48582888390656.0
Epoch [3/10], Loss: 52364676169728.0
Epoch [4/10], Loss: 49569207025664.0
Epoch [5/10], Loss: 49305435504640.0
Epoch [6/10], Loss: 49678803795968.0
Epoch [7/10], Loss: 48405089615872.0
Epoch [8/10], Loss: 49417761193984.0
Epoch [9/10], Loss: 49046659792896.0
Epoch [10/10], Loss: 48384137363456.0
Decision Tree Validation - Accuracy: 0.9668367346938775 Precision: 0.9304029304029304 Recall: 0.9731800766283525 F1 Score: 0.951310861423221
Decision Tree Test - Accuracy: 0.9705240174672489 Precision: 0.9588138385502472 Recall: 0.9525368248772504 F1 Score: 0.9556650246305419
Testing time: 0.0041 seconds
Random Forest Validation - Accuracy: 0.9668367346938775 Precision: 0.9304029304029304 Recall: 0.9731800766283525 F1 Score: 0.951310861423221
Random Forest Test - Accuracy: 0.9732532751091703 Precision: 0.9591503267973857 Recall: 0.9607201309328969 F1 Score: 0.9599345870809485
Testing time: 0.0130 seconds
LDA Validation - 

In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import random

# Định nghĩa Autoencoder
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        encoding_dim1 = 52
        encoding_dim2 = 26
        encoding_dim3 = int(np.sqrt(input_dim)) + 1

        # Lớp mã hóa
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim1),
            nn.BatchNorm1d(encoding_dim1),  # Thêm BatchNorm1d

            nn.Tanh(),
            nn.Linear(encoding_dim1, encoding_dim2),
            nn.BatchNorm1d(encoding_dim2),  # Thêm BatchNorm1d
            nn.Tanh(),
            nn.Linear(encoding_dim2, encoding_dim3),
            nn.BatchNorm1d(encoding_dim3),  # Thêm BatchNorm1d
            nn.Tanh(),
        )
        # Lớp giải mã
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim3, encoding_dim2),
            nn.BatchNorm1d(encoding_dim2),  # Thêm BatchNorm1d
            nn.Tanh(),
            nn.Linear(encoding_dim2, encoding_dim1),
            nn.BatchNorm1d(encoding_dim1),  # Thêm BatchNorm1d
            nn.Tanh(),
            nn.Linear(encoding_dim1, input_dim),
            nn.Tanh(),  # Giữ sigmoid cho đầu ra
        )
    
    def _initialize_weights(self):
        for layer in self.encoder:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.zeros_(layer.bias)
        for layer in self.decoder:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded, encoded

# Định nghĩa BinaryMagnetLoss
class BinaryMagnetLoss(nn.Module):
    def __init__(self, D=10, M=2, alpha=7.18, epsilon=1e-6):
        super(BinaryMagnetLoss, self).__init__()
        self.D = D
        self.M = M
        self.alpha = alpha
        self.epsilon = epsilon

    def forward(self, outputs, indices, assignment):
        outputs = outputs.float()
        unique_clusters = torch.unique(assignment[indices])
        batch_clusters = {cluster.item(): (assignment[indices] == cluster).nonzero(as_tuple=True)[0] for cluster in unique_clusters}

        num_instances = sum(len(batch_clusters[cluster.item()]) for cluster in unique_clusters)
        c_means = torch.stack([outputs[batch_clusters[cluster.item()]].mean(dim=0) for cluster in unique_clusters])

        stdev = sum((outputs[i] - c_means[unique_clusters.tolist().index(assignment[indices[i]])]).norm(p=2).pow(2) 
                    for i in range(len(indices))) / max(num_instances - 1, 1)
        stdev = -2.0 * (stdev + self.epsilon)

        if torch.isnan(stdev).any():
            stdev = torch.tensor(1.0, device=outputs.device)  # Gán giá trị hợp lý nếu phát hiện NaN

        loss = torch.zeros(1).to(outputs.device)
        denom = [torch.zeros(1).to(outputs.device) for _ in range(len(indices))]

        for m, cluster in enumerate(unique_clusters):
            cluster_indices = batch_clusters[cluster.item()]
            for d in range(min(self.D, len(cluster_indices))):
                ind = cluster_indices[d].item()
                for other_cluster in unique_clusters:
                    if other_cluster != cluster:
                        denom[ind] += ((outputs[ind] - c_means[unique_clusters.tolist().index(other_cluster)]).norm().pow(2) / stdev).exp()

                loss_term = (((outputs[ind] - c_means[m]).norm().pow(2) / stdev - self.alpha).exp() / (denom[ind] + self.epsilon)).log().clamp(max=0.0)
                if torch.isnan(loss_term).any() or torch.isinf(loss_term).any():
                    continue
                loss -= loss_term

        loss /= num_instances
        return loss

# Custom Loss Function combining Cross-Entropy and Binary Magnet Loss
def custom_loss(y_true, y_pred, embeddings, inputs, alpha1=1.0, alpha2=2.0):
    mse_loss = F.mse_loss(inputs, y_pred) 
    magnet_loss_fn = BinaryMagnetLoss()
    magnet_loss_value = magnet_loss_fn(embeddings, torch.arange(len(y_true)), y_true)
    return alpha1 * mse_loss + alpha2 * magnet_loss_value

# Hàm để áp dụng deep autoencoder và train classifiers
def train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test, classifiers, save_data, epochs=10, batch_size=256, alpha1=1.0, alpha2=2.0):
    input_dim = X_train.shape[1]
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    autoencoder = Autoencoder(input_dim).to(device)
    optimizer = optim.Adadelta(autoencoder.parameters(), lr=0.0001)

    X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val.to_numpy(), dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.float32)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    for epoch in range(epochs):
        autoencoder.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            decoded, encoded = autoencoder(inputs)
            loss = custom_loss(labels, decoded, encoded, inputs, alpha1=alpha1, alpha2=alpha2)
            if torch.isnan(loss).any() or torch.isinf(loss).any():
                print(f'Invalid loss detected at epoch {epoch+1}. Skipping this batch.')
                continue
            loss.backward()
            torch.nn.utils.clip_grad_norm_(autoencoder.parameters(), max_norm=1.0)  # Gradient Clipping
            optimizer.step()
            train_loss += loss.item()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {train_loss/len(train_loader)}')

    autoencoder.eval()
    with torch.no_grad():
        X_train_enc = autoencoder.encoder(X_train_tensor.to(device)).cpu().numpy()
        X_val_enc = autoencoder.encoder(X_val_tensor.to(device)).cpu().numpy()
        X_test_enc = autoencoder.encoder(torch.tensor(X_test.to_numpy(), dtype=torch.float32).to(device)).cpu().numpy()

    results = []
    try:
        for name, train_function in classifiers.items():
            model, val_acc, val_pre, val_rec, val_f1, test_acc, test_pre, test_rec, test_f1, time = train_function(X_train_enc, y_train, X_val_enc, y_val, X_test_enc, y_test, save_data)
            results.append({
                'Model': name,
                'Validation Accuracy': val_acc,
                'Validation Precision': val_pre,
                'Validation Recall': val_rec,
                'Validation F1 Score': val_f1,
                'Test Accuracy': test_acc,
                'Test Precision': test_pre,
                'Test Recall': test_rec,
                'Test F1 Score': test_f1,
                'epochs': epochs,
                'alpha1': alpha1,
                'alpha2': alpha2,
                'Time': time
            })
    except Exception as e:
        print(f"Error: {e}")

    return results

def generate_random_choice():
    num1 = random.randint(0, 10)
    num2 = random.randint(0, 10)
    return num1, num2

def generate_random_choice1():
    values = [i/100 for i in range(1, 11)]
    num1 = random.choice(values)
    num2 = random.choice(values)
    return num1, num2

def generate_random_choice2():
    values = [i/10 for i in range(1, 11)]
    num1 = random.choice(values)
    num2 = random.choice(values)
    return num1, num2

def generate_random_choice3():
    values1 = [i/10 for i in range(1, 11)]
    values2 = [i/100 for i in range(1, 11)]
    num1 = random.choice(values1)
    num2 = random.choice(values2)
    return num1, num2

def generate_random_choice4():
    values1 = [i/100 for i in range(1, 11)]
    values2 = [i/10 for i in range(1, 11)]
    num1 = random.choice(values1)
    num2 = random.choice(values2)
    return num1, num2

maxx  = 30
for rd in range(1, maxx):
    # if rd < maxx/5:
    #     alpha1, alpha2 = generate_random_choice()
    # elif rd < 2*maxx/5:
    #     alpha1, alpha2 = generate_random_choice1()
    if rd < 1*maxx/3:
        alpha1, alpha2 = generate_random_choice2()
    elif rd < 2*maxx/3:
        alpha1, alpha2 = generate_random_choice3()
    else:
        alpha1, alpha2 = generate_random_choice4()

    # Gọi hàm và lưu kết quả
    autoencoder_results = train_with_autoencoder(X_train, y_train, X_val, y_val, X_test, y_test, classifiers, save_data, epochs=10, alpha1=alpha1, alpha2=alpha2)
    autoencoder_results_df = pd.DataFrame(autoencoder_results)

    csv_file = save_data + 'autoencoder_Binary_Magnet_Loss_results_2017_14_8_v2.csv'

    with open(csv_file, mode='a+') as file:
        file.write('\n')  # Thêm một dòng trống
    autoencoder_results_df.to_csv(csv_file, mode='a+', header=True, index=False)

Epoch [1/10], Loss: 42187400806400.0
Epoch [2/10], Loss: 41364344274944.0
Epoch [3/10], Loss: 42059339137024.0
Epoch [4/10], Loss: 41115529183232.0
Epoch [5/10], Loss: 42231066787840.0
Epoch [6/10], Loss: 42470956335104.0
Epoch [7/10], Loss: 43133771972608.0
Epoch [8/10], Loss: 41698789687296.0
Epoch [9/10], Loss: 41819891433472.0
Epoch [10/10], Loss: 41889292484608.0
Decision Tree Validation - Accuracy: 0.9668367346938775 Precision: 0.9368029739776952 Recall: 0.9655172413793104 F1 Score: 0.9509433962264151
Decision Tree Test - Accuracy: 0.9737991266375546 Precision: 0.9652892561983472 Recall: 0.955810147299509 F1 Score: 0.9605263157894737
Testing time: 0.0065 seconds
Random Forest Validation - Accuracy: 0.9706632653061225 Precision: 0.9407407407407408 Recall: 0.9731800766283525 F1 Score: 0.9566854990583804
Random Forest Test - Accuracy: 0.9759825327510917 Precision: 0.9670510708401977 Recall: 0.9607201309328969 F1 Score: 0.9638752052545156
Testing time: 0.0125 seconds
LDA Validation -