# Detect website defacement with SVM

* Prepared by **vomtung@gmail.com**

In [None]:
%pip install pandas numpy scikit-learn matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the pre-split training and test sets from the dataset directory
train_df = pd.read_csv('dataset/train_data.csv')
test_df = pd.read_csv('dataset/test_data.csv')

# Training set overview: size, column names, class balance and a preview
print("=== TRAINING DATA ===")
print(f"Training data shape: {train_df.shape}")
print(f"Columns: {train_df.columns.tolist()}")
print("\nLabel distribution:")
print(train_df['Label'].value_counts())
print("\nFirst few samples:")
print(train_df.head())

# Test set overview: size and class balance
print("\n=== TEST DATA ===")
print(f"Test data shape: {test_df.shape}")
print("Test Label distribution:")
print(test_df['Label'].value_counts())

In [None]:
# Split each frame into targets and raw text; a missing HTML value becomes
# an empty document so the vectorizer only ever receives strings.
y_train = train_df['Label']
y_test = test_df['Label']
X_train = train_df['HTML'].fillna('')
X_test = test_df['HTML'].fillna('')

# TF-IDF vectorization settings
print("Applying TF-IDF transformation...")
tfidf_params = {
    'max_features': 5000,       # cap the vocabulary to keep the SVM tractable
    'ngram_range': (1, 2),      # unigrams and bigrams
    'min_df': 2,                # drop terms seen in fewer than 2 documents
    'max_df': 0.95,             # drop terms seen in more than 95% of documents
    'stop_words': 'english',
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

# The vocabulary and IDF weights are learned from the training set only;
# the test set is projected with the already-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

In [None]:
# Fit an RBF-kernel SVM on the TF-IDF features
print("Training SVM model...")

svm_model = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True,   # needed later for predict_proba
    random_state=42,
)
svm_model.fit(X_train_tfidf, y_train)

# Score the fitted model on the data it was trained on ...
y_train_pred = svm_model.predict(X_train_tfidf)
train_accuracy = accuracy_score(y_train, y_train_pred)

# ... and on the held-out test set
y_test_pred = svm_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print("SVM model trained successfully!")

In [None]:
# Model evaluation on the test set
#
# Pin the label order to the classes the model was trained on. Without an
# explicit `labels` argument scikit-learn derives the label set from the
# values present in y_test / y_test_pred, so a class missing from both
# would shrink the report and the matrix: `target_names` would no longer
# match and `cm[1, 1]` would raise IndexError.
class_labels = svm_model.classes_
# NOTE(review): assumes the first (smaller) label means "Normal" and the
# second means "Hacked" -- confirm against the dataset.
class_names = ['Normal', 'Hacked']

print("=== CLASSIFICATION REPORT ===")
print(classification_report(
    y_test,
    y_test_pred,
    labels=class_labels,
    target_names=class_names,
))

# Confusion matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix - SVM')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

print("\n=== CONFUSION MATRIX VALUES ===")
print(f"True Negatives: {cm[0,0]}")
print(f"False Positives: {cm[0,1]}")
print(f"False Negatives: {cm[1,0]}")
print(f"True Positives: {cm[1,1]}")

In [None]:
# ROC analysis of the SVM on the test set
from sklearn.metrics import roc_curve, auc

# Score each test document with the probability in column 1 of predict_proba
# (the second entry of svm_model.classes_; presumably the "Hacked" class).
test_probabilities = svm_model.predict_proba(X_test_tfidf)
y_test_proba = test_probabilities[:, 1]

# Sweep the decision threshold and integrate the resulting curve
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)
roc_auc = auc(fpr, tpr)

# Draw the model's curve against the diagonal of a random classifier
plt.figure(figsize=(8, 6))
plt.plot(
    fpr, tpr,
    color='darkorange', lw=2,
    label=f'ROC curve (AUC = {roc_auc:.2f})',
)
plt.plot(
    [0, 1], [0, 1],
    color='navy', lw=2, linestyle='--',
    label='Random classifier',
)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - SVM')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)
plt.show()

print(f"ROC AUC Score: {roc_auc:.4f}")

In [None]:
# Test with new samples
def predict_website(html_content, *, vectorizer=None, model=None):
    """Predict if a website is hacked or normal using SVM.

    Args:
        html_content: Raw page text/HTML to classify. ``None`` is treated as
            an empty document, mirroring the ``fillna('')`` applied to the
            training and test data.
        vectorizer: Fitted vectorizer exposing ``transform``. Defaults to the
            notebook-level ``tfidf_vectorizer``.
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``. Defaults to the notebook-level ``svm_model``.

    Returns:
        A ``(result, confidence)`` tuple. ``result`` is ``"Hacked"`` when the
        predicted label equals 1 and ``"Normal"`` otherwise; ``confidence`` is
        the probability the model assigns to the predicted label, expressed
        as a percentage.
    """
    if vectorizer is None:
        vectorizer = tfidf_vectorizer
    if model is None:
        model = svm_model
    if html_content is None:
        html_content = ''

    html_tfidf = vectorizer.transform([html_content])

    prediction = model.predict(html_tfidf)[0]
    probability = model.predict_proba(html_tfidf)[0]

    # predict_proba columns are ordered like model.classes_, so the column is
    # looked up by label. Using the label value itself as a position is only
    # correct when the labels happen to be 0..n-1 (e.g. it picks the wrong
    # column for labels {-1, 1}).
    class_index = list(model.classes_).index(prediction)

    result = "Hacked" if prediction == 1 else "Normal"
    confidence = probability[class_index] * 100

    return result, confidence

# Hand-written snippets to classify: ordinary site copy mixed with
# defacement-style messages
test_samples = [
    "Welcome to our website! We offer great products and services.",
    "HACKED BY ANONYMOUS! Your site has been compromised!",
    "Contact us for more information about our company.",
    "Hacked By Shield Iran! We Are N3TC4t - Nazila Blackhat",
    "This is a normal business website with contact information.",
]

print("=== TESTING WITH NEW SAMPLES ===")
separator = "-" * 60
for i, sample in enumerate(test_samples, start=1):
    # Classify the snippet, then show the verdict and the first 80 characters
    result, confidence = predict_website(sample)
    print(f"Sample {i}: {result} (Confidence: {confidence:.2f}%)")
    print(f"Text: {sample[:80]}...")
    print(separator)