In [None]:
### Cell 1: Import Libraries
import pandas as pd
import numpy as np
import joblib # For loading models and preprocessors
import matplotlib.pyplot as plt
import seaborn as sns
# No need to import StandardScaler, OneHotEncoder, ColumnTransformer directly here
# as we'll load the pre-fitted preprocessor object which contains these
# from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve # Uncomment if you have true labels for quantitative evaluation



In [None]:
### Cell 2: Load Trained Model, Threshold, and Fitted Preprocessor
# --- 2.1 Load Trained Model and Threshold ---
# Loads the trained model and the decision threshold saved next to it.
# On ANY failure both `model` and `chosen_threshold` are left as None so that
# later cells can detect the problem instead of hitting a NameError.
model_load_path = 'isolation_forest_model.joblib'
threshold_load_path = 'anomaly_threshold.txt'

# Start from the "not loaded" state; only a fully successful load replaces it.
model = None
chosen_threshold = None

try:
    # NOTE: joblib.load unpickles the file - only load artifacts you produced yourself.
    loaded_model = joblib.load(model_load_path)
    with open(threshold_load_path, 'r') as f:
        loaded_threshold = float(f.read())
    # Publish both values together so a half-loaded state is never visible.
    model, chosen_threshold = loaded_model, loaded_threshold
    print(f"Successfully loaded model from {model_load_path}")
    print(f"Successfully loaded threshold: {chosen_threshold:.4f}")
except FileNotFoundError:
    print("Error: Model or threshold file not found.")
    print("Please ensure model_trainer.py ran successfully and the files exist in the current directory.")
except Exception as e:
    # Covers e.g. a threshold file whose content is not a number (ValueError)
    # or a model file that cannot be deserialized.
    print(f"An unexpected error occurred while loading model or threshold: {e}")

# --- 2.2 Load the FITTED Preprocessor ---
# The preprocessor is loaded from disk rather than re-fitted, so new data is
# transformed with the object that was saved by the preprocessing step.
preprocessor_load_path = 'fitted_preprocessor.joblib'

# Assume failure up front; a successful load overwrites this below.
preprocessor = None
try:
    # NOTE: joblib.load unpickles the file - only load artifacts you produced yourself.
    preprocessor = joblib.load(preprocessor_load_path)
except FileNotFoundError:
    print(f"Error: Fitted preprocessor file '{preprocessor_load_path}' not found.")
    print("Please ensure preprocessing.ipynb was run and saved the preprocessor.")
except Exception as e:
    print(f"An unexpected error occurred while loading preprocessor: {e}")
else:
    print(f"Successfully loaded fitted preprocessor from {preprocessor_load_path}")

# --- 2.3 Constants shared with the feature engineering in preprocessing.ipynb ---
# New raw data must go through the *same* feature engineering as the training
# data, so these values have to stay in sync with preprocessing.ipynb.

# Indicator-feature name -> port number. A feature is 1 when either the source
# or the destination port of a packet equals the port number.
common_ports = {
    'port_21_ftp': 21,
    'port_22_ssh': 22,
    'port_23_telnet': 23,
    'port_80_http': 80,
    'port_443_https': 443,
    'port_3389_rdp': 3389,
    'port_8080_proxy': 8080,
}

# IP protocol number -> readable name, used to build the protocol_name feature.
protocol_map = {
    1: 'ICMP',
    6: 'TCP',
    17: 'UDP',
}

# Raw/helper columns removed from new data right before it is handed to the
# fitted preprocessor.
# NOTE(review): 'minute_of_hour' is created in Cell 3 and is NOT dropped here -
# confirm that preprocessing.ipynb kept it as a model feature; otherwise add it.
features_to_drop_for_preprocessing = [
    'timestamp',
    'src_ip',
    'dst_ip',
    'flow_key',
    'bidirectional_flow_key',
    'sorted_ips',
    'sorted_ports',
    'protocol',
]

In [None]:
### Cell 3: Prepare New Data for Evaluation (Using dns.cap)
from scapy.all import rdpcap, IP, TCP, UDP # Import scapy here as it's used in this cell

# Path to your new, raw PCAP file for evaluation
# We are using 'dns.cap' as requested for evaluation.
# Ensure 'dns.cap' is in your project directory
new_pcap_file_path = 'dns.cap'


# Function to parse raw PCAP into extracted features (re-used from pcap_parser.py logic)
def parse_raw_pcap_for_eval(pcap_file):
    """Read a PCAP file and return one row of basic features per IP packet.

    Packets without an IP layer are skipped. Ports are taken from the TCP
    layer if present, otherwise from the UDP layer; packets with neither get
    ``None`` for both ports.

    Args:
        pcap_file: Path of the capture file, passed straight to ``rdpcap``.

    Returns:
        pandas.DataFrame with the columns ``timestamp`` (float, as reported by
        the packet's ``time`` attribute), ``src_ip``, ``dst_ip``, ``protocol``,
        ``src_port``, ``dst_port`` and ``packet_length``. The columns are
        present even when the capture contains no IP packets, so callers can
        always access them.
    """
    columns = [
        'timestamp', 'src_ip', 'dst_ip', 'protocol',
        'src_port', 'dst_port', 'packet_length',
    ]
    rows = []
    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_layer = pkt[IP]

        # TCP takes precedence over UDP; anything else has no ports.
        if TCP in pkt:
            src_port, dst_port = pkt[TCP].sport, pkt[TCP].dport
        elif UDP in pkt:
            src_port, dst_port = pkt[UDP].sport, pkt[UDP].dport
        else:
            src_port, dst_port = None, None

        rows.append({
            # Store a plain float so the column is numeric regardless of the
            # numeric type the packet object uses for its time attribute.
            'timestamp': float(pkt.time),
            'src_ip': str(ip_layer.src),
            'dst_ip': str(ip_layer.dst),
            'protocol': ip_layer.proto,
            'src_port': src_port,
            'dst_port': dst_port,
            'packet_length': len(pkt),
        })

    # Passing the column list keeps the schema stable for an empty capture.
    return pd.DataFrame(rows, columns=columns)

# --- Step 3.1: Parse the new raw PCAP into extracted features ---
# `df_new_raw_features` is only replaced once parsing AND timestamp conversion
# have both succeeded; on any error it stays an empty DataFrame so the
# following steps are skipped cleanly instead of working on half-converted data.
df_new_raw_features = pd.DataFrame()
try:
    parsed_packets = parse_raw_pcap_for_eval(new_pcap_file_path)

    if not parsed_packets.empty:
        # Timestamps are epoch seconds; anything unparseable becomes NaT and is dropped.
        epoch_seconds = pd.to_numeric(parsed_packets['timestamp'], errors='coerce')
        parsed_packets = (
            parsed_packets
            .assign(timestamp=pd.to_datetime(epoch_seconds, unit='s', errors='coerce'))
            .dropna(subset=['timestamp'])
            # Keep index labels equal to row positions: later cells match these
            # rows to the preprocessed rows, which are numbered 0..n-1.
            .reset_index(drop=True)
        )

    df_new_raw_features = parsed_packets

    print(f"\nSuccessfully parsed new PCAP: {len(df_new_raw_features)} rows.")
    print(df_new_raw_features.head())

except FileNotFoundError:
    print(f"Error: New PCAP file '{new_pcap_file_path}' not found. Cannot evaluate.")
    print("Please ensure the evaluation PCAP file exists in the current directory and the path is correct.")
except Exception as e:
    print(f"Error during new PCAP parsing or initial timestamp conversion: {e}")

# --- Step 3.2: Apply the SAME Feature Engineering as Training Data ---
# This block must perfectly replicate the feature engineering steps from preprocessing.ipynb.
# Result: `df_new_preprocessed` holds the transformed features, or is an empty
# DataFrame when there is no data, no preprocessor, or the transform failed.
df_new_engineered = df_new_raw_features.copy()

if not df_new_engineered.empty:
    # Time features derived from the packet timestamp
    df_new_engineered['hour_of_day'] = df_new_engineered['timestamp'].dt.hour
    df_new_engineered['day_of_week'] = df_new_engineered['timestamp'].dt.dayofweek
    df_new_engineered['minute_of_hour'] = df_new_engineered['timestamp'].dt.minute

    # Make ports numeric so packets without ports hold NaN instead of None.
    # Without this, a capture in which NO packet has ports keeps None values,
    # and sorting a pair of None values below raises TypeError.
    for col in ['src_port', 'dst_port']:
        df_new_engineered[col] = pd.to_numeric(df_new_engineered[col], errors='coerce')

    # One 0/1 indicator per well-known port (matches on source OR destination)
    for feature_name, port_num in common_ports.items():
        df_new_engineered[feature_name] = ((df_new_engineered['src_port'] == port_num) | (df_new_engineered['dst_port'] == port_num)).astype(int)

    # Readable protocol name; unknown protocol numbers fall back to the number itself
    df_new_engineered['protocol_name'] = df_new_engineered['protocol'].map(protocol_map).fillna(df_new_engineered['protocol']).astype(str)

    # Direction-independent flow key: endpoints and ports are sorted so that
    # A->B and B->A packets of one conversation share the same key.
    df_new_engineered['sorted_ips'] = df_new_engineered.apply(lambda row: tuple(sorted([row['src_ip'], row['dst_ip']])), axis=1)
    df_new_engineered['sorted_ports'] = df_new_engineered.apply(lambda row: tuple(sorted([row['src_port'], row['dst_port']])), axis=1)
    df_new_engineered['bidirectional_flow_key'] = df_new_engineered.apply(lambda row:
        f"{row['sorted_ips'][0]}_{row['sorted_ips'][1]}_{row['sorted_ports'][0]}_{row['sorted_ports'][1]}_{row['protocol_name']}", axis=1)

    # Per-flow aggregates, later attached to every packet of the flow
    bidirectional_flow_features_new = df_new_engineered.groupby('bidirectional_flow_key').agg(
        bidir_flow_duration=('timestamp', lambda x: (x.max() - x.min()).total_seconds()),
        bidir_total_packets=('packet_length', 'size'),
        bidir_total_bytes=('packet_length', 'sum'),
        bidir_mean_packet_length=('packet_length', 'mean'),
        bidir_std_packet_length=('packet_length', 'std'),
        num_unique_src_ips=('src_ip', 'nunique'),
        num_unique_dst_ips=('dst_ip', 'nunique')
    ).reset_index()

    # Missing values become 0; the standard deviation is NaN for single-packet flows.
    bidirectional_flow_features_new['bidir_flow_duration'] = bidirectional_flow_features_new['bidir_flow_duration'].fillna(0)
    bidirectional_flow_features_new['bidir_std_packet_length'] = bidirectional_flow_features_new['bidir_std_packet_length'].fillna(0)
    df_new_engineered = pd.merge(df_new_engineered, bidirectional_flow_features_new, on='bidirectional_flow_key', how='left')

    # Retrieve the exact numerical columns from the preprocessor's stored transformers.
    # Only attempted when a preprocessor was loaded, so a missing preprocessor is
    # reported once (below) instead of as a confusing attribute error here.
    numerical_cols_from_preprocessor = []
    if preprocessor is not None:
        try:
            numerical_cols_from_preprocessor = preprocessor.named_transformers_['num'].get_feature_names_out()
        except Exception as e:
            print(f"Error getting numerical column names from preprocessor: {e}. Ensure preprocessor is loaded and correctly configured.")
            numerical_cols_from_preprocessor = []

    for col in numerical_cols_from_preprocessor: # Use the list from the fitted preprocessor
        if col in df_new_engineered.columns:
            df_new_engineered[col] = pd.to_numeric(df_new_engineered[col], errors='coerce').fillna(0)
        else:
            print(f"Warning: Numerical column '{col}' from training data is missing in new data. Filling with 0.")
            df_new_engineered[col] = 0.0

    # Fill NaN for port columns consistently (as they become strings for OneHotEncoder)
    for col in ['src_port', 'dst_port']:
        if col in df_new_engineered.columns:
            df_new_engineered[col] = df_new_engineered[col].fillna(-1).astype(int).astype(str)
        else:
            df_new_engineered[col] = '-1'

    # Drop the temporary/original columns before transforming using the preprocessor
    df_new_data_for_transform = df_new_engineered.drop(columns=features_to_drop_for_preprocessing, errors='ignore')

    # --- Step 3.3: Transform the new data using the FITTED preprocessor ---
    if preprocessor is not None:
        try:
            X_new_preprocessed = preprocessor.transform(df_new_data_for_transform)

            # A transformer that one-hot encodes may hand back a SciPy sparse
            # matrix; densify it so the DataFrame below gets a regular 2-D array.
            if hasattr(X_new_preprocessed, 'toarray'):
                X_new_preprocessed = X_new_preprocessed.toarray()

            # Create a DataFrame for the new preprocessed data with correct column names
            all_preprocessed_cols = preprocessor.get_feature_names_out()
            df_new_preprocessed = pd.DataFrame(X_new_preprocessed, columns=all_preprocessed_cols)
            print("\nNew Preprocessed Data Head (ready for inference):")
            print(df_new_preprocessed.head())
        except Exception as e:
            print(f"Error during preprocessor transformation: {e}")
            df_new_preprocessed = pd.DataFrame() # Set to empty on error
    else:
        print("Preprocessor not loaded. Cannot transform new data.")
        df_new_preprocessed = pd.DataFrame() # Create empty DataFrame to avoid errors later
else:
    print("New raw features DataFrame is empty. Skipping feature engineering and transformation.")
    df_new_preprocessed = pd.DataFrame() # Create empty DataFrame to avoid errors later



In [None]:
### Cell 4: Perform Anomaly Detection Inference on New Data
# Score the preprocessed data with the loaded model and flag anomalies.
# Runs only when the preprocessed data, the model and the threshold are all available.
if 'df_new_preprocessed' in locals() and not df_new_preprocessed.empty and model is not None and chosen_threshold is not None:
    # This cell stores its results in df_new_preprocessed. Exclude those result
    # columns from the model input so that re-running the cell scores the same
    # feature matrix instead of failing on a feature-count mismatch.
    result_columns = ['anomaly_score', 'is_anomaly_detected']
    feature_columns = [col for col in df_new_preprocessed.columns if col not in result_columns]

    # Get anomaly scores for the new data
    df_new_preprocessed['anomaly_score'] = model.decision_function(df_new_preprocessed[feature_columns].values)

    # Apply the loaded threshold: scores at or below it are anomalies.
    # Label convention: 1 = normal, -1 = anomaly.
    is_anomalous = df_new_preprocessed['anomaly_score'] <= chosen_threshold
    df_new_preprocessed['is_anomaly_detected'] = is_anomalous.map({True: -1, False: 1})

    print("\nAnomaly detection results on new data (first 10 rows):")
    print(df_new_preprocessed[['anomaly_score', 'is_anomaly_detected']].head(10))

    num_anomalies_new = int(is_anomalous.sum())
    print(f"\nTotal anomalies detected in new data: {num_anomalies_new}")
    print(f"Proportion of anomalies detected in new data: {num_anomalies_new / len(df_new_preprocessed):.4f}")
else:
    print("Skipping anomaly detection inference due to empty data, or missing model/threshold, or df_new_preprocessed not defined.")



In [None]:
### Cell 5: Evaluation and Analysis (Qualitative and Quantitative)
# --- Qualitative Inspection of Detected Anomalies ---
# Summarises the rows flagged as anomalies (label -1) and, when the raw parsed
# packets are still available, shows their original fields next to the score.
if 'df_new_preprocessed' in locals() and not df_new_preprocessed.empty and 'is_anomaly_detected' in df_new_preprocessed.columns:
    detected_anomalies = df_new_preprocessed[df_new_preprocessed['is_anomaly_detected'] == -1]
    print("\nCharacteristics of Detected Anomalies (Preprocessed Features):")
    if not detected_anomalies.empty:
        print(detected_anomalies.describe()) # Summary statistics for preprocessed features of anomalies

        # To see original context (IPs, ports etc.), look the anomalies up in df_new_raw_features.
        # Ensure df_new_raw_features is populated from Cell 3.
        # Rows are matched by POSITION: the index values of detected_anomalies are
        # used as row positions into the raw frame.
        if 'df_new_raw_features' in locals() and not df_new_raw_features.empty:
            df_anomalies_with_context = df_new_raw_features.iloc[detected_anomalies.index].copy()
            # Assign plain arrays, not Series: a Series would be re-aligned on index
            # labels, which yields NaN/misplaced scores whenever the raw frame's
            # labels differ from its row positions (e.g. after rows were dropped).
            df_anomalies_with_context['anomaly_score'] = detected_anomalies['anomaly_score'].to_numpy()
            df_anomalies_with_context['is_anomaly_detected'] = detected_anomalies['is_anomaly_detected'].to_numpy()
            print("\nOriginal context of Detected Anomalies (first 5):")
            print(df_anomalies_with_context.head())
        else:
            print("\nOriginal raw features not available for context display.")

    else:
        print("No anomalies detected in the new data based on the current threshold.")
else:
    print("Cannot perform anomaly analysis: Data or anomaly detection results are missing.")

# --- Visualize Anomaly Scores (New Data) ---
# Histogram (with KDE) of the anomaly scores, with the decision threshold drawn
# as a vertical line when a threshold is available.
scores_available = (
    'df_new_preprocessed' in locals()
    and not df_new_preprocessed.empty
    and 'anomaly_score' in df_new_preprocessed.columns
)

if not scores_available:
    print("Cannot plot anomaly score distribution: Data or anomaly scores are missing.")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(df_new_preprocessed['anomaly_score'], bins=50, kde=True, color='purple', ax=ax)
    ax.set_title('Distribution of Anomaly Scores on New (Evaluation) Data', fontsize=16)
    ax.set_xlabel('Anomaly Score', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    if chosen_threshold is not None:
        ax.axvline(x=chosen_threshold, color='red', linestyle='--', label=f'Chosen Threshold: {chosen_threshold:.4f}')
    ax.legend(fontsize=10)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()
