# Exploratory Data Analysis: Phishing Brand Classification

This notebook explores the website screenshot dataset for phishing brand classification.

## Objectives
1. Understand the dataset structure and class distribution
2. Analyze image properties (dimensions, file sizes, formats)
3. Visualize sample images from each brand
4. Identify potential data quality issues
5. Understand class imbalance (especially 'others' vs brands)

## Key Considerations for Phishing Detection
- The 'others' class represents benign websites - misclassifying these as brands creates false positives
- Brand websites share visual similarities that phishing sites exploit
- Image quality and dimensions may vary significantly

In [None]:
# Import required libraries
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from collections import Counter

# Make project-local packages importable. The project root is taken to be the
# parent of the kernel's current working directory.
project_root = Path.cwd().parent
sys.path.insert(0, os.fspath(project_root))

# Shared plot appearance: matplotlib style sheet first, then the seaborn palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Input and output locations, both resolved relative to the project root.
# The figures directory is created up front so later savefig calls succeed.
DATA_DIR = project_root.joinpath('data', 'raw')
FIGURES_DIR = project_root.joinpath('outputs', 'figures')
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

print(
    f"Data directory: {DATA_DIR}",
    f"Data directory exists: {DATA_DIR.exists()}",
    sep="\n",
)

## 1. Dataset Overview

First, let's scan the dataset and create a comprehensive DataFrame with all image information.

In [None]:
from src.data.utils import scan_dataset, analyze_image_properties

# Scan DATA_DIR into a DataFrame (the schema is whatever scan_dataset returns)
df = scan_dataset(str(DATA_DIR))

# Report shape and column names, then preview the first rows as cell output
print(
    f"\nDataset shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    sep="\n",
)
df.head()

In [None]:
# Summarise the scanned frame; the result is read below by key
analysis = analyze_image_properties(df)

report_header = [
    "Dataset Analysis",
    "=" * 50,
    f"Total images: {analysis['total_images']}",
    f"Number of classes: {analysis['num_classes']}",
    "\nFile size statistics:",
]
print("\n".join(report_header))

# Every file-size statistic is divided by 1024 and printed with a KB suffix
for stat_name, stat_value in analysis['file_size_stats'].items():
    stat_kb = stat_value / 1024
    print(f"  {stat_name}: {stat_kb:.2f} KB")

## 2. Class Distribution Analysis

Understanding class distribution is crucial for:
- Identifying class imbalance
- Planning data augmentation strategies
- Setting appropriate class weights for training

In [None]:
# Images per class, sorted ascending so the largest class is the last bar drawn
class_counts = df['label'].value_counts().sort_values(ascending=True)

# 'others' is highlighted in coral; every brand class shares steelblue
colors = ['coral' if label == 'others' else 'steelblue' for label in class_counts.index]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# Left panel: horizontal bars, each annotated with its count
bars = bar_ax.barh(class_counts.index, class_counts.values, color=colors)
bar_ax.set_xlabel('Number of Images')
bar_ax.set_title('Class Distribution')

# Place each label slightly past the bar end (1% of the largest count)
label_offset = class_counts.max() * 0.01
for bar, count in zip(bars, class_counts.values):
    bar_ax.text(
        bar.get_width() + label_offset,
        bar.get_y() + bar.get_height()/2,
        f'{count}',
        va='center',
        fontsize=10,
    )

# Right panel: share of each class (every class gets its own slice)
pie_ax.pie(
    class_counts.values,
    labels=class_counts.index,
    autopct='%1.1f%%',
    colors=colors,
)
pie_ax.set_title('Class Distribution (%)')

fig.tight_layout()
fig.savefig(FIGURES_DIR / 'class_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

# Text summary: per-class counts and the largest-to-smallest class ratio
print("\nClass Statistics:")
print(class_counts.to_frame('count'))
imbalance_ratio = class_counts.max() / class_counts.min()
print(f"\nImbalance ratio (max/min): {imbalance_ratio:.2f}")

In [None]:
# Compare the number of brand images with the benign 'others' class.
# class_counts comes from the class-distribution cell above.
others_count = class_counts.get('others', 0)
brand_count = class_counts.sum() - others_count

# Format the ratio inside the braces so the ':.2f' spec is actually applied;
# fall back to plain 'N/A' when there is no 'others' class to divide by.
if others_count > 0:
    ratio_text = f"{brand_count / others_count:.2f}"
else:
    ratio_text = "N/A"

print("\nBrands vs Others Analysis:")
print(f"Total brand images: {brand_count}")
print(f"Total 'others' (benign) images: {others_count}")
print(f"Ratio (brands/others): {ratio_text}")
print("\nThis is important because:")
print("- 'Others' represents benign websites")
print("- False positives (benign classified as brand) = poor user experience")
print("- We need to be especially careful with 'others' classification")

## 3. Image Properties Analysis

Analyze image dimensions, aspect ratios, and file formats to understand:
- What preprocessing is needed
- Optimal input size for the model
- Potential quality issues

In [None]:
# Four histograms (width, height, aspect ratio, file size) on a 2x2 grid,
# each with a dashed red line at the column median.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Derived columns are stored on df because later cells read them
df['aspect_ratio'] = df['width'] / df['height']
df['file_size_kb'] = df['file_size'] / 1024

# (column, x-axis label, title, bar colour, median label format), in grid order
histogram_panels = [
    ('width', 'Width (pixels)', 'Image Width Distribution', 'steelblue', '{:.0f}'),
    ('height', 'Height (pixels)', 'Image Height Distribution', 'coral', '{:.0f}'),
    ('aspect_ratio', 'Aspect Ratio (width/height)', 'Aspect Ratio Distribution', 'seagreen', '{:.2f}'),
    ('file_size_kb', 'File Size (KB)', 'File Size Distribution', 'purple', '{:.0f} KB'),
]

for panel_ax, (column, xlabel, title, color, median_format) in zip(axes.flat, histogram_panels):
    column_median = df[column].median()
    panel_ax.hist(df[column].dropna(), bins=50, color=color, edgecolor='black', alpha=0.7)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel('Count')
    panel_ax.set_title(title)
    panel_ax.axvline(
        column_median,
        color='red',
        linestyle='--',
        label='Median: ' + median_format.format(column_median),
    )
    panel_ax.legend()

fig.tight_layout()
fig.savefig(FIGURES_DIR / 'image_properties.png', dpi=150, bbox_inches='tight')
plt.show()

# Numeric summary of the same four columns
print("\nImage Dimension Statistics:")
print(df[['width', 'height', 'aspect_ratio', 'file_size_kb']].describe())

In [None]:
# Side-by-side box plots of image width and height, grouped by class label
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column, panel title, y-axis label) for the left and right panels
boxplot_panels = [
    ('width', 'Width Distribution by Class', 'Width (pixels)'),
    ('height', 'Height Distribution by Class', 'Height (pixels)'),
]

for panel_ax, (column, title, ylabel) in zip(axes, boxplot_panels):
    df.boxplot(column=column, by='label', ax=panel_ax, rot=45)
    # Set after plotting so these replace the title pandas generates
    panel_ax.set_title(title)
    panel_ax.set_xlabel('Class')
    panel_ax.set_ylabel(ylabel)

fig.suptitle('')  # Drop the figure-level title pandas adds for grouped box plots
fig.tight_layout()
fig.savefig(FIGURES_DIR / 'dimensions_by_class.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Count images per file extension and show them as an annotated bar chart
extension_counts = df['extension'].value_counts()

fig, ax = plt.subplots(figsize=(8, 5))
extension_counts.plot.bar(ax=ax, color='steelblue', edgecolor='black')
ax.set_xlabel('File Extension')
ax.set_ylabel('Count')
ax.set_title('File Format Distribution')
plt.xticks(rotation=0)

# Write each count just above its bar (offset is 1% of the tallest bar)
text_offset = extension_counts.max() * 0.01
for position, count in enumerate(extension_counts.values):
    ax.text(position, count + text_offset, str(count), ha='center')

fig.tight_layout()
plt.show()

print(f"\nFile formats: {extension_counts.to_dict()}")

## 4. Sample Visualization

Visualize sample images from each class to understand:
- Visual characteristics of each brand
- Diversity within classes
- Quality variations

In [None]:
def display_sample_images(df, class_name, n_samples=4, figsize=(16, 4), random_state=None):
    """Display a row of randomly sampled images from one class.

    Args:
        df: DataFrame with at least the columns 'label', 'image_path',
            'domain', 'width' and 'height'.
        class_name: Value of the 'label' column to sample from.
        n_samples: Number of panels in the row. When the class holds fewer
            images, the remaining panels are left blank.
        figsize: Figure size passed to ``plt.subplots``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            same images are drawn on every run. ``None`` keeps the sampling
            non-deterministic.

    Returns:
        The matplotlib figure, or ``None`` (after printing a message) when
        the class has no images.
    """
    class_df = df[df['label'] == class_name]

    if len(class_df) == 0:
        print(f"No images found for class: {class_name}")
        return None

    samples = class_df.sample(min(n_samples, len(class_df)), random_state=random_state)

    # squeeze=False always returns a 2-D array of axes, so a single panel
    # needs no special-casing.
    fig, axes_grid = plt.subplots(1, n_samples, figsize=figsize, squeeze=False)
    axes = axes_grid[0]

    for ax, (_, row) in zip(axes, samples.iterrows()):
        try:
            # Close the file handle as soon as the pixels have been handed to imshow
            with Image.open(row['image_path']) as img:
                ax.imshow(img)
            domain = str(row['domain'])
            # Only add an ellipsis when the domain was actually cut short
            shown_domain = domain if len(domain) <= 30 else f"{domain[:30]}..."
            ax.set_title(f"{shown_domain}\n{row['width']}x{row['height']}")
        except Exception as e:
            # Best effort: one unreadable image must not abort the whole row
            ax.text(0.5, 0.5, f"Error loading\n{str(e)[:30]}",
                    ha='center', va='center', transform=ax.transAxes)
        ax.axis('off')

    # Blank out panels that received no image (class smaller than n_samples)
    for ax in axes[len(samples):]:
        ax.axis('off')

    fig.suptitle(f'Sample Images: {str(class_name).upper()}', fontsize=14, fontweight='bold')
    fig.tight_layout()
    return fig

In [None]:
# Show and save one row of sample images for every class, in sorted label order
classes = df['label'].unique()

for class_name in sorted(classes):
    fig = display_sample_images(df, class_name, n_samples=4)
    if fig is not None:
        # Save through the figure object rather than pyplot's "current figure"
        fig.savefig(FIGURES_DIR / f'samples_{class_name}.png', dpi=100, bbox_inches='tight')
        plt.show()
        # Release the figure so one-per-class figures do not pile up in memory
        plt.close(fig)
    print(f"\n{'-'*50}")

## 5. Data Quality Analysis

Check for potential data quality issues:
- Corrupted images
- Unusual dimensions
- Very small/large files

In [None]:
from src.data.utils import validate_dataset

# Run validation over every row; this can be slow on large datasets
print("Validating images...")
df_validated = validate_dataset(df)

# Tally rows by the 'is_valid' flag that validate_dataset attaches
validity_mask = df_validated['is_valid']
valid_count = validity_mask.sum()
invalid_count = (~validity_mask).sum()

# Percentages are relative to the number of rows in df
valid_pct = valid_count/len(df)*100
invalid_pct = invalid_count/len(df)*100

print("\nValidation Results:")
print(f"Valid images: {valid_count} ({valid_pct:.1f}%)")
print(f"Invalid images: {invalid_count} ({invalid_pct:.1f}%)")

# List the failing rows together with their reported error, if there are any
if invalid_count > 0:
    print("\nInvalid images:")
    display(df_validated.loc[~validity_mask, ['image_path', 'label', 'error']])

In [None]:
# Flag rows whose dimensions, aspect ratio or file size look unusual.
# Relies on the 'aspect_ratio' column added by the image-properties cell.
print("Checking for outliers...\n")

# Either side shorter than 200 px (potential quality issues)
small_images = df.loc[df['width'].lt(200) | df['height'].lt(200)]
print(f"Very small images (<200px): {small_images.shape[0]}")

# Either side longer than 3000 px (potential memory issues)
large_images = df.loc[df['width'].gt(3000) | df['height'].gt(3000)]
print(f"Very large images (>3000px): {large_images.shape[0]}")

# Width/height ratio below 0.5 or above 3.0
unusual_aspect = df.loc[df['aspect_ratio'].lt(0.5) | df['aspect_ratio'].gt(3.0)]
print(f"Unusual aspect ratios (<0.5 or >3.0): {unusual_aspect.shape[0]}")

# Files under 1000 bytes (potentially empty or corrupted)
small_files = df.loc[df['file_size'].lt(1000)]
print(f"Very small files (<1KB): {small_files.shape[0]}")

## 6. Domain Analysis

Analyze the domain names in the dataset to understand:
- URL patterns
- Potential duplicates
- Domain characteristics

In [None]:
# Look at the 'domain' column: repeated values and name length per class
print("Domain Analysis")
print("="*50)

# keep=False marks every row of a repeated domain, not only the later ones
repeated_mask = df['domain'].duplicated(keep=False)
duplicate_domains = df[repeated_mask]
print(f"Duplicate domains: {len(duplicate_domains)}")

if len(duplicate_domains) > 0:
    print("\nSample duplicates:")
    # For up to ten repeated domains, list the labels they appear under
    labels_per_domain = duplicate_domains.groupby('domain')['label'].apply(list)
    display(labels_per_domain.head(10))

# Character count of each domain, stored on df for the plot and summary below
df['domain_length'] = df['domain'].str.len()

fig, ax = plt.subplots(figsize=(10, 5))
df.boxplot(column='domain_length', by='label', ax=ax, rot=45)
ax.set_title('Domain Name Length by Class')
ax.set_xlabel('Class')
ax.set_ylabel('Domain Length (characters)')
fig.suptitle('')  # Drop the figure-level title pandas adds for grouped box plots
fig.tight_layout()
plt.show()

print("\nDomain length statistics:")
length_by_class = df.groupby('label')['domain_length']
print(length_by_class.describe())

## 7. Key Findings and Recommendations

Summarize findings and provide recommendations for the modeling phase.

In [None]:
# Print a consolidated summary. Relies on names created by earlier cells:
# class_counts, valid_count, small_images, large_images, unusual_aspect and
# the 'file_size_kb' column on df.
print("="*60)
print("KEY FINDINGS SUMMARY")
print("="*60)

print("\n1. DATASET SIZE")
print(f"   Total images: {len(df)}")
print(f"   Number of classes: {df['label'].nunique()}")

print("\n2. CLASS DISTRIBUTION")
for cls, count in class_counts.items():
    pct = count / len(df) * 100
    marker = " <-- BENIGN" if cls == 'others' else ""
    print(f"   {cls}: {count} ({pct:.1f}%){marker}")

print("\n3. IMAGE PROPERTIES")
print(f"   Median dimensions: {df['width'].median():.0f} x {df['height'].median():.0f}")
print(f"   Median file size: {df['file_size_kb'].median():.0f} KB")
print(f"   Most common format: {df['extension'].mode().values[0]}")

# The three outlier subsets can overlap (e.g. a tiny image with an extreme
# aspect ratio), so count the union of their row labels instead of summing
# the subset sizes, which would count such an image more than once.
# NOTE(review): assumes df's index labels identify rows uniquely - confirm.
outlier_index = small_images.index.union(large_images.index).union(unusual_aspect.index)

print("\n4. DATA QUALITY")
print(f"   Valid images: {valid_count}/{len(df)} ({valid_count/len(df)*100:.1f}%)")
print(f"   Outliers identified: {len(outlier_index)}")

print("\n5. RECOMMENDATIONS")
print("   - Use image size 224x224 for EfficientNet or 384x384 for ViT")
print("   - Apply class weights to handle imbalance")
print("   - Use focal loss to focus on hard examples")
print("   - Apply data augmentation (rotation, brightness, blur)")
print("   - Set high confidence threshold to minimize false positives on 'others'")
print("   - Consider oversampling minority classes or undersampling majority")
print("   - Consider oversampling minority classes or undersampling majority")

In [None]:
# Persist df, including the columns added by earlier cells (aspect_ratio,
# file_size_kb, domain_length), as a CSV for the next pipeline stage.
output_path = project_root.joinpath('data', 'processed', 'dataset_info.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)  # ensure data/processed exists
df.to_csv(output_path, index=False)
print(f"Dataset info saved to: {output_path}")