In [14]:
# Nhập thư viện
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import time

import warnings
warnings.filterwarnings('ignore')

# Đọc bộ dữ liệu
corpus = pd.read_csv('data/cleaned_mhc.csv')

# Chuẩn bị dữ liệu
X_train, X_test, y_train, y_test = train_test_split(
    corpus['text'],
    corpus['label'],
    test_size=0.2,
    random_state=42
)

tfidf = TfidfVectorizer(max_features=3500)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

n_components = 100

lsa = TruncatedSVD(n_components=n_components, random_state=42)
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

print(f"TF-IDF - Train shape: {X_train_tfidf.shape}, Test Shape: {X_test_tfidf.shape}")
print(f"LSA - Train shape: {X_train_lsa.shape}, Test Shape: {X_test_lsa.shape}\n")

import torch

if torch.cuda.is_available():
    print("CUDA is available. Running on GPU.")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    print(f"GPU Capability: {torch.cuda.get_device_capability(0)}")
    print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("CUDA is not available. Running on CPU.")
    
print(f"CPU: {torch.get_num_threads()} threads available")
print(f"PyTorch Version: {torch.__version__}")

TF-IDF - Train shape: (18592, 3500), Test Shape: (4648, 3500)
LSA - Train shape: (18592, 100), Test Shape: (4648, 100)

CUDA is available. Running on GPU.
GPU Name: NVIDIA GeForce RTX 3060
CUDA Version: 12.1
GPU Capability: (8, 6)
Total GPU Memory: 12.88 GB
CPU: 6 threads available
PyTorch Version: 2.5.1


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import time
import pandas as pd
import numpy as np

# Define the device (GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class MLP(nn.Module):
    def __init__(self, input_size, architecture='tfidf', custom_layers=None):
        super(MLP, self).__init__()
        
        # Default architectures
        architectures = {
            'tfidf': {
                'hidden_layers': [1024, 512, 256],
                'dropout_rates': [0.5, 0.4, 0.3]
            },
            'lsa': {
                'hidden_layers': [64, 32, 16, 8],
                'dropout_rates': [0.4, 0.3, 0.3, 0.2]
            }
        }
        
        # Use custom layers if provided, otherwise use default architecture
        if custom_layers:
            hidden_layers = custom_layers['hidden_layers']
            dropout_rates = custom_layers['dropout_rates']
        else:
            hidden_layers = architectures[architecture]['hidden_layers']
            dropout_rates = architectures[architecture]['dropout_rates']
        
        # Build layers
        layers = []
        prev_size = input_size
        
        for i, (hidden_size, dropout_rate) in enumerate(zip(hidden_layers, dropout_rates)):
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Dropout(dropout_rate)
            ])
            prev_size = hidden_size
        
        # Output layer
        layers.append(nn.Linear(prev_size, 1))
        
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

def prepare_data(X_train, y_train, X_test, y_test, batch_size):
    # Convert to dense if sparse
    if hasattr(X_train, "toarray"):
        X_train = X_train.toarray()
    if hasattr(X_test, "toarray"):
        X_test = X_test.toarray()
        
    # Move data to the selected device (GPU/CPU)
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
    return train_loader, test_loader

def train_evaluate_mlp(X_train, y_train, X_test, y_test, model_params):
    # Unpack parameters
    architecture = model_params.get('architecture', 'tfidf')
    custom_layers = model_params.get('custom_layers', None)
    learning_rate = model_params.get('learning_rate', 0.001)
    batch_size = model_params.get('batch_size', 64)
    epochs = model_params.get('epochs', 10)
    model_name = model_params.get('model_name', f'MLP-{architecture.upper()}')
    
    # Prepare data
    train_loader, test_loader = prepare_data(X_train, y_train, X_test, y_test, batch_size)
    
    # Initialize model, loss function, and optimizer
    input_size = X_train.shape[1]
    model = MLP(input_size=input_size, architecture=architecture, custom_layers=custom_layers).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    
    # Training
    start_fit = time.time()
    for epoch in range(epochs):
        model.train()  # Enable dropout during training
        epoch_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch).squeeze()
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss/len(train_loader):.4f}')
            
    fit_time = time.time() - start_fit

    # Evaluation (with dropout disabled)
    model.eval()
    start_test = time.time()
    with torch.no_grad():
        # Training set evaluation (without dropout)
        train_outputs = []
        train_true = []
        for X_batch, y_batch in train_loader:
            outputs = model(X_batch).squeeze()
            train_outputs.extend(torch.round(torch.sigmoid(outputs)).cpu().numpy())
            train_true.extend(y_batch.cpu().numpy())
        
        # Test set evaluation
        test_outputs = []
        test_true = []
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch).squeeze()
            test_outputs.extend(torch.round(torch.sigmoid(outputs)).cpu().numpy())
            test_true.extend(y_batch.cpu().numpy())
    
    test_time = time.time() - start_test

    # Calculate metrics
    metrics = {
        'Model': model_name,
        'Train Accuracy (with dropout)': accuracy_score(y_train, train_outputs),
        'Train Accuracy (no dropout)': accuracy_score(train_true, train_outputs),
        'Test Accuracy': accuracy_score(test_true, test_outputs),
        'Train F1 Score': f1_score(train_true, train_outputs),
        'Test F1 Score': f1_score(test_true, test_outputs),
        'Train Precision': precision_score(train_true, train_outputs),
        'Test Precision': precision_score(test_true, test_outputs),
        'Train Recall': recall_score(train_true, train_outputs),
        'Test Recall': recall_score(test_true, test_outputs),
        'Fit Time (s)': fit_time,
        'Test Time (s)': test_time,
        'Train Confusion Matrix': confusion_matrix(train_true, train_outputs),
        'Test Confusion Matrix': confusion_matrix(test_true, test_outputs)
    }
    
    return metrics, model

# Example usage:
tfidf_params = {
    'architecture': 'tfidf',
    'learning_rate': 0.001,
    'batch_size': 64,
    'epochs': 10,
    'model_name': 'MLP-TF-IDF'
}

lsa_params = {
    'architecture': 'lsa',
    'learning_rate': 0.001,
    'batch_size': 64,
    'epochs': 10,
    'model_name': 'MLP-LSA'
}

# Train and evaluate models
metrics_list = []

# Train TF-IDF model
tfidf_metrics, tfidf_model = train_evaluate_mlp(X_train_tfidf, y_train, X_test_tfidf, y_test, tfidf_params)
metrics_list.append(tfidf_metrics)

# Train LSA model
lsa_metrics, lsa_model = train_evaluate_mlp(X_train_lsa, y_train, X_test_lsa, y_test, lsa_params)
metrics_list.append(lsa_metrics)

# Display results
results_df = pd.DataFrame(metrics_list)
display_columns = ['Model', 'Train Accuracy (no dropout)', 'Train Accuracy (with dropout)', 
                  'Test Accuracy', 'Train F1 Score', 'Test F1 Score', 
                  'Train Precision', 'Test Precision', 'Train Recall', 
                  'Test Recall', 'Fit Time (s)', 'Test Time (s)']
print("\nResults:")
print(results_df[display_columns])

# Print confusion matrices
print("\nConfusion Matrices:")
for metrics in metrics_list:
    print(f"\n{metrics['Model']} Train Confusion Matrix:")
    print(metrics['Train Confusion Matrix'])
    print(f"\n{metrics['Model']} Test Confusion Matrix:")
    print(metrics['Test Confusion Matrix'])

Epoch [1/10], Loss: 0.2521
Epoch [2/10], Loss: 0.1588
Epoch [3/10], Loss: 0.1070
Epoch [4/10], Loss: 0.0681
Epoch [5/10], Loss: 0.0510
Epoch [6/10], Loss: 0.0474
Epoch [7/10], Loss: 0.0415
Epoch [8/10], Loss: 0.0335
Epoch [9/10], Loss: 0.0262
Epoch [10/10], Loss: 0.0283
Epoch [1/10], Loss: 0.4406
Epoch [2/10], Loss: 0.2994
Epoch [3/10], Loss: 0.2794
Epoch [4/10], Loss: 0.2682
Epoch [5/10], Loss: 0.2666
Epoch [6/10], Loss: 0.2663
Epoch [7/10], Loss: 0.2571
Epoch [8/10], Loss: 0.2539
Epoch [9/10], Loss: 0.2515
Epoch [10/10], Loss: 0.2496

Results:
        Model  Train Accuracy (no dropout)  Train Accuracy (with dropout)  \
0  MLP-TF-IDF                     0.999462                       0.506992   
1     MLP-LSA                     0.920288                       0.504303   

   Test Accuracy  Train F1 Score  Test F1 Score  Train Precision  \
0       0.916308        0.999505       0.924393         0.999703   
1       0.909639        0.926713       0.918224         0.925615   

   Test Pre