In [2]:
# core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import sys
import io
warnings.filterwarnings('ignore')

# Fix Windows console encoding: force UTF-8 (with replacement of unencodable
# characters) so the non-ASCII symbols printed by this notebook cannot raise
# UnicodeEncodeError on a legacy console code page.
#
# reconfigure() changes the existing stream in place, so re-running this cell is
# a no-op. Re-wrapping sys.stdout.buffer in a fresh TextIOWrapper on every run
# (the fallback below) leaves the previously installed wrapper unreferenced, and
# its finalizer can close the shared buffer -- so it is only used for streams
# that expose a buffer but cannot be reconfigured. Streams offering neither
# attribute are left untouched.
if sys.platform == 'win32':
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding='utf-8', errors='replace')
    elif hasattr(sys.stdout, "buffer"):
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')


# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_curve, auc
)

# Ensemble & Boosting Models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Plot appearance: matplotlib style sheet first, then the seaborn palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Notebook-wide constants (UPPER_SNAKE_CASE). Their consumers are not visible
# in this cell -- presumably later cells; verify before changing a value.
RANDOM_STATE = 42
TEST_SIZE = 0.2
FIGSIZE = (12, 8)

# Start-up banner: title, wall-clock time of this run, and a load confirmation,
# framed by 70-character rules. Emitted with a single print call.
print(
    "=" * 70,
    "NETWORK TRAFFIC CLASSIFICATION - AI CRAWLER TAR PITS RESEARCH",
    "=" * 70,
    f"Execution Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "Libraries loaded successfully!",
    "=" * 70,
    sep="\n",
)

NETWORK TRAFFIC CLASSIFICATION - AI CRAWLER TAR PITS RESEARCH
Execution Time: 2026-02-04 15:04:06
Libraries loaded successfully!


In [9]:
# =========================
# BASIC DATASET INFORMATION
# =========================
# Prints the dimensions, the attack/normal class balance and the column names
# of `df`. `df` is not created in this cell -- it must already exist in the
# kernel (NOTE(review): the loading cell is not visible here; confirm it runs
# before this one on a fresh Restart & Run All).

print("\n" + "=" * 70)
print("DATASET INFORMATION")
print("=" * 70)

# Drop accidental index column if present. Rebinding (instead of inplace=True)
# keeps the cell safe to re-run and does not mutate a frame that other names
# may still reference; errors='ignore' makes the call a no-op when the column
# is absent.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

print("\n📊 Dataset Dimensions:")
print(f"   • Total Samples: {df.shape[0]:,}")
print(f"   • Total Features: {df.shape[1]}")

# Detect label column safely: prefer 'Label', fall back to 'label'
label_col = 'Label' if 'Label' in df.columns else 'label'

# Defensive check: fail with a clear message rather than a bare KeyError below
if label_col not in df.columns:
    raise ValueError("Label column not found in dataset")

# Class distribution (rows whose label equals exactly 1 or exactly 0)
attack_count = (df[label_col] == 1).sum()
normal_count = (df[label_col] == 0).sum()
total_count = len(df)

# Guard the percentage maths so an empty frame reports 0.0% instead of nan
attack_pct = attack_count / total_count * 100 if total_count else 0.0
normal_pct = normal_count / total_count * 100 if total_count else 0.0

print("\n📈 Class Distribution:")
print(f"   • Attack Traffic (Label=1): {attack_count:,} "
      f"({attack_pct:.1f}%)")
print(f"   • Normal Traffic (Label=0): {normal_count:,} "
      f"({normal_pct:.1f}%)")

# Column names
print(f"\n📋 Column Names ({len(df.columns)} columns):")
print(df.columns.tolist())



DATASET INFORMATION

📊 Dataset Dimensions:
   • Total Samples: 92,212
   • Total Features: 58

📈 Class Distribution:
   • Attack Traffic (Label=1): 38,898 (42.2%)
   • Normal Traffic (Label=0): 53,314 (57.8%)

📋 Column Names (58 columns):
['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Bwd PSH Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt', 'ACK Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg', 

In [8]:
# Basic dataset information
# Prints the dimensions, the attack/normal class balance and the column index
# of `df`, which must already exist in the kernel (it is not created here).
# NOTE(review): this cell repeats the summary printed by the
# "BASIC DATASET INFORMATION" cell and carries an earlier execution count;
# consider keeping only one of the two.
print("\n" + "=" * 70)
print("DATASET INFORMATION")
print("=" * 70)

print("\n📊 Dataset Dimensions:")
print(f"   • Total Samples: {len(df):,}")
print(f"   • Total Features: {df.shape[1]}")

# Detect the label column: prefer 'Label', fall back to 'label'
label_col = 'Label' if 'Label' in df.columns else 'label'

# Fail with a clear message instead of a bare KeyError on the lookups below
if label_col not in df.columns:
    raise ValueError("Label column not found in dataset")

# Count each class once with a boolean sum rather than materialising a
# filtered copy of the frame for every number printed.
total_count = len(df)
attack_count = (df[label_col] == 1).sum()
normal_count = (df[label_col] == 0).sum()

# Guard the percentage maths so an empty frame reports 0.0% instead of failing
attack_pct = attack_count / total_count * 100 if total_count else 0.0
normal_pct = normal_count / total_count * 100 if total_count else 0.0

print("\n📈 Class Distribution:")
print(f"   • Attack Traffic (Label=1): {attack_count:,} "
      f"({attack_pct:.1f}%)")
print(f"   • Normal Traffic (Label=0): {normal_count:,} "
      f"({normal_pct:.1f}%)")

print(f"\n📋 Column Names ({len(df.columns)} columns):")
print(df.columns)



DATASET INFORMATION

📊 Dataset Dimensions:
   • Total Samples: 92,212
   • Total Features: 59

📈 Class Distribution:
   • Attack Traffic (Label=1): 38,898 (42.2%)
   • Normal Traffic (Label=0): 53,314 (57.8%)

📋 Column Names (59 columns):
Index(['Unnamed: 0', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Bwd PSH Flags', 'Fwd Header Len', 'Bwd Header Len',
       'Fwd Pkts/s', 'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max',
       'Pkt Len Mean', 'Pkt Len Std', 'Pkt Len Var', 'FIN Fl