## 📦 Bước 1: Cài đặt thư viện

In [None]:
# Install TensorFlow (pinned to 2.15.0) and the data/metrics/plotting libraries imported below.
!pip install tensorflow==2.15.0 scikit-learn pandas numpy matplotlib seaborn -q

## 📂 Bước 2: Clone repository hoặc upload files

In [None]:
# Option 1: Clone from GitHub (if available)
# !git clone https://github.com/your-username/phishing-detection.git
# %cd phishing-detection

# Option 2: Upload files from your computer
# Click the folder icon on the left -> Upload files
# Upload the entire project structure

# Check current directory
!pwd
!ls -la

## 📊 Bước 3: Import libraries và setup

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from pathlib import Path
import logging
from datetime import datetime
import json
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Environment check: report the TensorFlow build and the GPUs it can see
print("TensorFlow version:", tf.__version__)
visible_gpus = tf.config.list_physical_devices('GPU')
print("GPU Available:", visible_gpus)

# Logging: root logger at INFO, plus a logger named after this module
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## 🏗️ Bước 4: Define Model Architectures

In [None]:
from tensorflow.keras import layers, Model

# ============================================================================
# Attention Layer
# ============================================================================
class SeqSelfAttention(layers.Layer):
    """Additive self-attention that re-weights the positions of a sequence.

    For every position a scalar score ``u . tanh(x W + b)`` is computed, the
    scores are normalised with a softmax over axis 1, and the input is
    multiplied by the resulting weights. Nothing is summed out, so the output
    has the same shape as the input.

    Args:
        attention_dim: Width of the hidden projection used to score positions.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, attention_dim=128, **kwargs):
        super().__init__(**kwargs)
        self.attention_dim = attention_dim

    def build(self, input_shape):
        """Create the projection matrix, bias and context vector."""
        feature_dim = input_shape[-1]
        projection_shape = (feature_dim, self.attention_dim)
        vector_shape = (self.attention_dim,)

        self.W = self.add_weight(
            name='attention_weight',
            shape=projection_shape,
            initializer='glorot_uniform',
            trainable=True,
        )
        self.b = self.add_weight(
            name='attention_bias',
            shape=vector_shape,
            initializer='zeros',
            trainable=True,
        )
        self.u = self.add_weight(
            name='attention_context',
            shape=vector_shape,
            initializer='glorot_uniform',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, inputs):
        """Return ``inputs`` scaled position-wise by the attention weights."""
        # assumes inputs is (batch, timesteps, features) - TODO confirm with callers
        projected = tf.tanh(tf.tensordot(inputs, self.W, axes=1) + self.b)
        scores = tf.tensordot(projected, self.u, axes=1)
        # Softmax over axis 1, then a trailing axis so the weights broadcast
        # across the last dimension of the input.
        weights = tf.expand_dims(tf.nn.softmax(scores, axis=1), axis=-1)
        return inputs * weights

    def get_config(self):
        """Return the base layer config extended with ``attention_dim``."""
        return {**super().get_config(), 'attention_dim': self.attention_dim}


print("‚úì Attention layer defined")

In [None]:
# ============================================================================
# 1. ANN Model
# ============================================================================
class ANNModel(Model):
    """Feed-forward baseline.

    Pipeline: embedding -> Dense(hidden_dim) -> dropout -> flatten ->
    Dense(128) -> dropout -> single sigmoid unit.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, dropout=0.3, **kwargs):
        super().__init__(**kwargs)
        self.embedding = layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, mask_zero=False
        )
        self.dense1 = layers.Dense(units=hidden_dim, activation='relu')
        self.dropout1 = layers.Dropout(rate=dropout)
        self.flatten = layers.Flatten()
        self.dense2 = layers.Dense(units=128, activation='relu')
        self.dropout2 = layers.Dropout(rate=dropout)
        self.output_layer = layers.Dense(units=1, activation='sigmoid')

    def call(self, inputs, training=False):
        """Run the forward pass; dropout is active only when ``training`` is true."""
        hidden = self.dense1(self.embedding(inputs))
        hidden = self.dropout1(hidden, training=training)
        hidden = self.dense2(self.flatten(hidden))
        hidden = self.dropout2(hidden, training=training)
        return self.output_layer(hidden)

# ============================================================================
# 2. ATT Model
# ============================================================================
class ATTModel(Model):
    """Attention classifier.

    Pipeline: embedding -> SeqSelfAttention -> dropout -> flatten ->
    Dense(128) -> dropout -> single sigmoid unit.
    """

    def __init__(self, vocab_size, embedding_dim=128, attention_dim=128, dropout=0.3, **kwargs):
        super().__init__(**kwargs)
        self.embedding = layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, mask_zero=False
        )
        self.attention = SeqSelfAttention(attention_dim)
        self.dropout1 = layers.Dropout(rate=dropout)
        self.flatten = layers.Flatten()
        self.dense1 = layers.Dense(units=128, activation='relu')
        self.dropout2 = layers.Dropout(rate=dropout)
        self.output_layer = layers.Dense(units=1, activation='sigmoid')

    def call(self, inputs, training=False):
        """Run the forward pass; dropout is active only when ``training`` is true."""
        attended = self.attention(self.embedding(inputs))
        attended = self.dropout1(attended, training=training)
        hidden = self.dense1(self.flatten(attended))
        hidden = self.dropout2(hidden, training=training)
        return self.output_layer(hidden)

# ============================================================================
# 3. RNN Model
# ============================================================================
class RNNModel(Model):
    """LSTM classifier.

    Pipeline: embedding -> LSTM (final state only) -> dropout ->
    Dense(128) -> dropout -> single sigmoid unit.
    """

    def __init__(self, vocab_size, embedding_dim=128, lstm_units=256, dropout=0.3, **kwargs):
        super().__init__(**kwargs)
        self.embedding = layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, mask_zero=False
        )
        self.lstm = layers.LSTM(units=lstm_units, return_sequences=False)
        self.dropout1 = layers.Dropout(rate=dropout)
        self.dense1 = layers.Dense(units=128, activation='relu')
        self.dropout2 = layers.Dropout(rate=dropout)
        self.output_layer = layers.Dense(units=1, activation='sigmoid')

    def call(self, inputs, training=False):
        """Run the forward pass; dropout is active only when ``training`` is true."""
        encoded = self.lstm(self.embedding(inputs))
        encoded = self.dropout1(encoded, training=training)
        hidden = self.dropout2(self.dense1(encoded), training=training)
        return self.output_layer(hidden)

# ============================================================================
# 4. BRNN Model
# ============================================================================
class BRNNModel(Model):
    """Bidirectional LSTM classifier.

    Pipeline: embedding -> Bidirectional(LSTM, final state only) -> dropout ->
    Dense(128) -> dropout -> single sigmoid unit.
    """

    def __init__(self, vocab_size, embedding_dim=128, lstm_units=256, dropout=0.3, **kwargs):
        super().__init__(**kwargs)
        self.embedding = layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, mask_zero=False
        )
        self.bidirectional_lstm = layers.Bidirectional(
            layers.LSTM(units=lstm_units, return_sequences=False)
        )
        self.dropout1 = layers.Dropout(rate=dropout)
        self.dense1 = layers.Dense(units=128, activation='relu')
        self.dropout2 = layers.Dropout(rate=dropout)
        self.output_layer = layers.Dense(units=1, activation='sigmoid')

    def call(self, inputs, training=False):
        """Run the forward pass; dropout is active only when ``training`` is true."""
        encoded = self.bidirectional_lstm(self.embedding(inputs))
        encoded = self.dropout1(encoded, training=training)
        hidden = self.dropout2(self.dense1(encoded), training=training)
        return self.output_layer(hidden)

# ============================================================================
# 5. CNN Model
# ============================================================================
class CNNModel(Model):
    """1-D convolutional classifier.

    Pipeline: embedding -> Conv1D ('same' padding) -> global max pooling ->
    dropout -> Dense(128) -> dropout -> single sigmoid unit.
    """

    def __init__(self, vocab_size, embedding_dim=128, num_filters=256, kernel_size=3, dropout=0.3, **kwargs):
        super().__init__(**kwargs)
        self.embedding = layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, mask_zero=False
        )
        self.conv1d = layers.Conv1D(
            filters=num_filters,
            kernel_size=kernel_size,
            activation='relu',
            padding='same',
        )
        self.pooling = layers.GlobalMaxPooling1D()
        self.dropout1 = layers.Dropout(rate=dropout)
        self.dense1 = layers.Dense(units=128, activation='relu')
        self.dropout2 = layers.Dropout(rate=dropout)
        self.output_layer = layers.Dense(units=1, activation='sigmoid')

    def call(self, inputs, training=False):
        """Run the forward pass; dropout is active only when ``training`` is true."""
        feature_maps = self.conv1d(self.embedding(inputs))
        pooled = self.dropout1(self.pooling(feature_maps), training=training)
        hidden = self.dropout2(self.dense1(pooled), training=training)
        return self.output_layer(hidden)


print("‚úì All 5 models defined: ANN, ATT, RNN, BRNN, CNN")

## 📥 Bước 5: Load Data

**Lưu ý**: Bạn cần upload các file sau vào Colab:
- `data/datasets/char_X_train.npy`
- `data/datasets/char_X_val.npy`
- `data/datasets/char_X_test.npy`
- `data/datasets/char_y_train.npy`
- `data/datasets/char_y_val.npy`
- `data/datasets/char_y_test.npy`

In [None]:
# Create data directory if not exists
!mkdir -p data/datasets

# Load preprocessed data
data_dir = Path("data/datasets")

X_train = np.load(data_dir / 'char_X_train.npy')
y_train = np.load(data_dir / 'char_y_train.npy')
X_val = np.load(data_dir / 'char_X_val.npy')
y_val = np.load(data_dir / 'char_y_val.npy')
X_test = np.load(data_dir / 'char_X_test.npy')
y_test = np.load(data_dir / 'char_y_test.npy')

print(f"Train: X={X_train.shape}, y={y_train.shape}")
print(f"Val:   X={X_val.shape}, y={y_val.shape}")
print(f"Test:  X={X_test.shape}, y={y_test.shape}")

# Calculate vocab size
vocab_size = int(X_train.max()) + 1
print(f"\nVocabulary size: {vocab_size}")

## 🏋️ Bước 6: Build và Compile Models

In [None]:
# Hyperparameters shared by every architecture
_shared_config = {'vocab_size': vocab_size, 'embedding_dim': 128}

# Per-architecture constructor arguments
configs = {
    'ann': {**_shared_config, 'hidden_dim': 256, 'dropout': 0.3},
    'att': {**_shared_config, 'attention_dim': 128, 'dropout': 0.3},
    'rnn': {**_shared_config, 'lstm_units': 256, 'dropout': 0.3},
    'brnn': {**_shared_config, 'lstm_units': 256, 'dropout': 0.3},
    'cnn': {**_shared_config, 'num_filters': 256, 'kernel_size': 3, 'dropout': 0.3},
}

# Instantiate one model per architecture, in the same order as `configs`
_architectures = {
    'ann': ANNModel,
    'att': ATTModel,
    'rnn': RNNModel,
    'brnn': BRNNModel,
    'cnn': CNNModel,
}
models = {
    name: architecture(**configs[name])
    for name, architecture in _architectures.items()
}

# Compile: each model gets its own optimizer and its own metric objects
for model_name, model in models.items():
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    tracked_metrics = [
        'accuracy',
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
    ]
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=tracked_metrics,
    )
    print(f"‚úì {model_name.upper()} model built and compiled")

print("\n‚úì All models ready for training!")

## 🎯 Bước 7: Train Models

Chọn model muốn train (hoặc train tất cả)

In [None]:
# Training hyperparameters
EPOCHS = 50
BATCH_SIZE = 128

# Switch individual architectures on (True) or off (False)
train_config = dict(ann=True, att=True, rnn=True, brnn=True, cnn=True)

# History object returned by fit(), keyed by model name
histories = {}

for model_name, should_train in train_config.items():
    if not should_train:
        print(f"‚è≠Ô∏è  Skipping {model_name.upper()}")
        continue

    separator = '=' * 80
    print(f"\n{separator}")
    print(f"üöÄ Training {model_name.upper()} Model")
    print(separator)

    # Fresh callbacks for every model so no state carries over between runs
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1,
    )
    lr_on_plateau = keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1,
    )
    callbacks = [early_stopping, lr_on_plateau]

    history = models[model_name].fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        verbose=1,
    )
    histories[model_name] = history

    print(f"\n‚úÖ {model_name.upper()} training completed!")

print("\nüéâ All training completed!")

## 📊 Bước 8: Evaluate Models

In [None]:
# Score every trained model on the held-out test split
from sklearn.metrics import f1_score

results = {}
rule = "=" * 80

print("\n" + rule)
print("üìä EVALUATING ALL MODELS")
print(rule + "\n")

for model_name, model in models.items():
    # Only models that went through fit() have an entry in `histories`
    if model_name not in histories:
        continue

    print(f"\nüîç Evaluating {model_name.upper()}...")

    # Loss and compiled metrics, looked up by name
    scores = model.evaluate(X_test, y_test, verbose=0, return_dict=True)

    # Hard 0/1 predictions at a 0.5 threshold, needed for F1
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = (y_pred_proba > 0.5).astype(int)
    f1 = f1_score(y_test, y_pred)

    results[model_name] = {
        'accuracy': scores['accuracy'],
        'precision': scores['precision'],
        'recall': scores['recall'],
        'f1_score': f1,
        'auc': scores['auc'],
        'loss': scores['loss'],
    }

    print(f"Accuracy:  {scores['accuracy']:.4f}")
    print(f"Precision: {scores['precision']:.4f}")
    print(f"Recall:    {scores['recall']:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"AUC:       {scores['auc']:.4f}")

print("\n" + rule)
print("‚úÖ Evaluation completed!")
print(rule)

## 📈 Bước 9: Visualize Results

In [None]:
# Tabulate and plot the test metrics of every evaluated model
if results:
    # One row per model, best accuracy first
    df_results = pd.DataFrame(results).T.sort_values('accuracy', ascending=False)

    print("\nüìä Model Comparison:")
    print(df_results.to_string())

    # One horizontal bar chart per metric on a 2x3 grid
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle('Model Comparison - All Metrics', fontsize=16, fontweight='bold')

    metrics = ['accuracy', 'precision', 'recall', 'f1_score', 'auc', 'loss']

    for ax, metric in zip(axes.flat, metrics):
        pretty_name = metric.replace('_', ' ').title()
        # Ascending for every metric except loss, which is sorted descending
        data = df_results[metric].sort_values(ascending=(metric != 'loss'))

        bars = ax.barh(data.index, data.values)

        # Give each bar its own colour from the viridis map
        palette = plt.cm.viridis(np.linspace(0, 1, len(data)))
        for bar, shade in zip(bars, palette):
            bar.set_color(shade)

        ax.set_xlabel(pretty_name)
        ax.set_title(f'{pretty_name}')
        ax.grid(axis='x', alpha=0.3)

        # Print the numeric value at the end of each bar
        for row, value in enumerate(data.values):
            ax.text(value, row, f' {value:.4f}', va='center')

    plt.tight_layout()
    plt.show()

    # Report the model with the highest test accuracy
    best_model = df_results['accuracy'].idxmax()
    best_row = df_results.loc[best_model]
    print(f"\nüèÜ Best Model: {best_model.upper()}")
    print(f"   Accuracy: {best_row['accuracy']:.4f}")
    print(f"   F1 Score: {best_row['f1_score']:.4f}")

## 📉 Bước 10: Plot Training History

In [None]:
# One figure per trained model: accuracy on the left, loss on the right
for model_name, history in histories.items():
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    fig.suptitle(f'{model_name.upper()} Training History', fontsize=14, fontweight='bold')

    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
    for ax, (key, label) in zip(axes, panels):
        # Training curve and its validation counterpart ('val_' + key)
        ax.plot(history.history[key], label=f'Train {label}', linewidth=2)
        ax.plot(history.history[f'val_{key}'], label=f'Val {label}', linewidth=2)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(label)
        ax.set_title(f'Model {label}')
        ax.legend()
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()

## 🔍 Bước 11: Confusion Matrix

In [None]:
# Plot confusion matrix for each trained model
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Confusion Matrices - All Models', fontsize=16, fontweight='bold')

# Only models that went through fit() have an entry in `histories`.
trained_models = [
    (name, trained) for name, trained in models.items() if name in histories
]
flat_axes = axes.flatten()

# Assign subplot slots by position among the *trained* models, so skipping a
# model does not leave a blank panel in the middle of the grid.
for ax, (model_name, model) in zip(flat_axes, trained_models):
    # Predictions: hard 0/1 labels at a 0.5 threshold
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = (y_pred_proba > 0.5).astype(int).ravel()

    # Confusion matrix. Fixing labels=[0, 1] keeps the matrix 2x2 (matching
    # the two tick labels below) even if one class is absent from the data.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Plot
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=['Legitimate', 'Phishing'],
                yticklabels=['Legitimate', 'Phishing'])
    ax.set_title(f'{model_name.upper()}')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

# Remove every unused subplot, not just the last one
for unused_ax in flat_axes[len(trained_models):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

## 💾 Bước 12: Save Models

In [None]:
# Save trained models
!mkdir -p trained_models

for model_name, model in models.items():
    if model_name not in histories:
        continue
    
    save_path = f"trained_models/{model_name}_model.h5"
    model.save(save_path)
    print(f"‚úì Saved {model_name.upper()} to {save_path}")

# Save results to CSV
if results:
    df_results.to_csv('model_comparison_results.csv')
    print("\n‚úì Results saved to model_comparison_results.csv")

print("\n‚úÖ All models saved!")

## 🎯 Bước 13: Test Prediction (Optional)

Test với một vài URL mẫu

In [None]:
# Example: Simple character tokenization for testing
def simple_tokenize(url, max_len=200, vocab_size=None):
    """Encode a URL as a fixed-length row of character codes.

    Each character of the lower-cased URL is mapped to its Unicode code point
    (``ord``), e.g. ``'a' -> 97``. This is a stand-in for demo purposes and is
    not the vocabulary mapping of a trained tokenizer.

    Args:
        url: The URL string to encode.
        max_len: Length of the output row. Longer inputs are truncated,
            shorter ones are right-padded with 0.
        vocab_size: Optional exclusive upper bound for the indices. Code
            points greater than or equal to it are replaced by 0 so the result
            can be fed to an embedding table of that size. ``None`` (default)
            leaves the code points untouched.

    Returns:
        An integer array of shape ``(1, max_len)``.
    """
    # Truncate first, then pad; the slice is a no-op for short inputs.
    indices = [ord(char) for char in url.lower()[:max_len]]

    if vocab_size is not None:
        indices = [index if index < vocab_size else 0 for index in indices]

    indices += [0] * (max_len - len(indices))
    return np.array([indices])

# Test URLs
test_urls = [
    "https://www.google.com",
    "http://suspicious-site-login-verify.tk",
    "https://github.com",
]

# The models were trained on rows of X_train, so the demo input must use the
# same sequence length; the Flatten+Dense models reject any other length.
# assumes X_train is (samples, sequence_length) - TODO confirm
demo_seq_len = X_train.shape[1]

print("\nüîç Testing Predictions:\n")

for url in test_urls:
    print(f"URL: {url}")

    # Tokenize
    X = simple_tokenize(url, max_len=demo_seq_len)
    # Raw character codes can exceed the embedding table size; map anything
    # out of range to 0 so the Embedding lookup stays within bounds.
    X = np.where(X < vocab_size, X, 0)

    # Predict with all trained models
    print("Predictions:")
    for model_name, model in models.items():
        if model_name not in histories:
            continue

        pred_proba = model.predict(X, verbose=0)[0][0]
        pred_label = "Phishing" if pred_proba > 0.5 else "Legitimate"
        print(f"  {model_name.upper():6s}: {pred_proba:.4f} ‚Üí {pred_label}")

    print()

print("\n‚ö†Ô∏è  Note: N√†y ch·ªâ l√† demo ƒë∆°n gi·∫£n. ƒê·ªÉ c√≥ k·∫øt qu·∫£ ch√≠nh x√°c, c·∫ßn d√πng tokenizer ƒë√£ train.")

## 📥 Bước 14: Download Models (Optional)

Download các model đã train về máy

In [None]:
# Zip all trained models (recursively archives the trained_models/ directory)
!zip -r trained_models.zip trained_models/

# Download the archive and the metrics table through the Colab helper.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime - confirm before running this cell elsewhere.
from google.colab import files
files.download('trained_models.zip')
# This CSV is written by the save step only when `results` is non-empty.
files.download('model_comparison_results.csv')

print("‚úÖ Files ready for download!")

---

## 📝 Tóm tắt

Notebook này train và so sánh 5 models Deep Learning:

1. **ANN** - Simple feed-forward network
2. **ATT** - Attention-based network
3. **RNN** - LSTM network
4. **BRNN** - Bidirectional LSTM
5. **CNN** - Convolutional network

### Kết quả:
- Models được train và evaluate trên test set
- So sánh performance của tất cả models
- Visualization: training history, metrics, confusion matrix
- Models được save để sử dụng sau

### Để sử dụng:
1. Upload data files (.npy)
2. Run tất cả cells theo thứ tự
3. Xem kết quả comparison
4. Download models đã train

🎉 **Happy Training!**