In [2]:
# core Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import sys
import io
# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter; it hides all warning categories,
# including ones raised by the modelling libraries. Consider narrowing it.
warnings.filterwarnings('ignore')


def force_utf8_console(stream=None, platform=None):
    """Switch a text stream to UTF-8 (errors='replace') on Windows, in place.

    The stream is reconfigured rather than re-wrapped. Building a fresh
    ``io.TextIOWrapper`` around ``sys.stdout.buffer`` on every run leaves the
    previous wrapper to be garbage-collected, which closes the shared buffer
    and breaks later prints; it also discards the stream's buffering settings.
    ``reconfigure`` has neither problem and is safe to call repeatedly.

    Args:
        stream: Text stream to adjust. Defaults to the current ``sys.stdout``.
        platform: Platform identifier to test. Defaults to ``sys.platform``.

    Returns:
        True if the stream was reconfigured, False if nothing was done
        (non-Windows platform, or a stream without ``reconfigure`` such as a
        notebook kernel's output stream or ``io.StringIO``).
    """
    stream = sys.stdout if stream is None else stream
    platform = sys.platform if platform is None else platform
    if platform != 'win32' or not hasattr(stream, "reconfigure"):
        return False
    stream.reconfigure(encoding='utf-8', errors='replace')
    return True


# Fix Windows console encoding so non-ASCII status symbols print cleanly
force_utf8_console()


# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_curve, auc
)

# Ensemble & Boosting Models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Notebook-wide configuration constants
RANDOM_STATE: int = 42      # presumably the shared random seed for splits/models; usage is not visible in this cell
TEST_SIZE: float = 0.2      # presumably the held-out fraction for train_test_split; confirm against later cells
FIGSIZE: tuple = (12, 8)    # (width, height) pair, presumably passed to matplotlib figures

# Plot appearance: matplotlib's seaborn-style white grid theme, then the
# seaborn "husl" colour palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Startup banner: project title, current wall-clock time, and a load
# confirmation, separated by 70-character rules.
rule = "=" * 70
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
for banner_line in (
    rule,
    "NETWORK TRAFFIC CLASSIFICATION - AI CRAWLER TAR PITS RESEARCH",
    rule,
    f"Execution Time: {started_at}",
    "Libraries loaded successfully!",
    rule,
):
    print(banner_line)

NETWORK TRAFFIC CLASSIFICATION - AI CRAWLER TAR PITS RESEARCH
Execution Time: 2026-02-04 15:04:06
Libraries loaded successfully!


In [5]:
# Data set loading: read the attack and normal traffic CSVs and stack them
# into a single frame `df`.
print("\n📁 Loading CTU-13 Dataset...")
print("-" * 50)

# Load both datasets (paths are relative to the current working directory)
attack_df = pd.read_csv('data/CTU13_Attack_Traffic.csv')
normal_df = pd.read_csv('data/CTU13_Normal_Traffic.csv')

print(f"✓ Attack Traffic samples loaded: {len(attack_df):,}")
print(f"✓ Normal Traffic samples loaded: {len(normal_df):,}")

# pd.concat aligns on column names and fills any gaps with NaN without
# complaint, so report a schema difference between the two files instead of
# letting it pass silently. key=str keeps the sort safe for mixed label types.
mismatched_cols = sorted(set(attack_df.columns) ^ set(normal_df.columns), key=str)
if mismatched_cols:
    print(f"WARNING: columns present in only one file (NaN-filled after concat): {mismatched_cols}")

# Combine datasets; ignore_index=True renumbers rows 0..n-1 across both files
df = pd.concat([attack_df, normal_df], ignore_index=True)
print(f"\n📊 Combined Dataset Shape: {df.shape}")

# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like row indexes saved into the CSVs. Confirm they are dropped
# before training, since a per-file row counter can leak the class label.

# Display first few rows
print("\n📋 Sample Data (First 5 rows):")
df.head(5)


📁 Loading CTU-13 Dataset...
--------------------------------------------------
✓ Attack Traffic samples loaded: 38,898
✓ Normal Traffic samples loaded: 53,314

📊 Combined Dataset Shape: (92212, 59)

📋 Sample Data (First 5 rows):


Unnamed: 0.1,Unnamed: 0,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Fwd Act Data Pkts,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,59086131,7,1,0,0,0,0,0.0,0.0,...,0,2987276.0,0.0,2987276,2987276,18699620.0,19471121.45,41116855,5999291,1
1,1,12452268,37,1,2408,68,68,50,65.081081,6.72631,...,37,0.0,0.0,0,0,0.0,0.0,0,0,1
2,2,118741070,5,4,170,682,45,22,34.0,10.440307,...,5,2276383.0,0.0,2276383,2276383,116128100.0,0.0,116128125,116128125,1
3,3,180643,25,11,180,25790,90,0,7.2,24.919872,...,2,0.0,0.0,0,0,0.0,0.0,0,0,1
4,4,440,4,1,0,0,0,0,0.0,0.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,1
