# In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

####################################
# 1) Load Data (Parquet Files)
####################################
# The official dataset splits are required; both are local Parquet files.
train_file = r"C:\Users\taejo\Desktop\CSI-5388-project\train_data.parquet"
test_file  = r"C:\Users\taejo\Desktop\CSI-5388-project\test_data.parquet"

# Read the training split first, then the test split.
df_train, df_test = (pd.read_parquet(path) for path in (train_file, test_file))

# Common preprocessing function
def common_preprocessing(df):
    """Clean a raw flow table and derive the columns used by the detector.

    The input frame is left untouched; all work happens on a private copy.

    Steps, in order:
      1. Drop rows containing any missing value, then drop duplicate rows.
      2. Strip surrounding whitespace from the column names.
      3. Add ``uid`` = "<Source IP>-<Destination IP>" (both stripped).
      4. Parse ``Timestamp`` to datetime and sort the rows by it.
      5. Normalise ``Label`` to a stripped, upper-case string
         (e.g. "BENIGN", "DRDOS_DNS").

    Args:
        df: DataFrame whose columns, once stripped, include ``Source IP``,
            ``Destination IP``, ``Timestamp`` and ``Label``.

    Returns:
        A new DataFrame, sorted by ``Timestamp``, that keeps the index
        labels of the surviving input rows.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # The explicit copy detaches the result from the caller's frame so the
    # column assignments below neither mutate it nor trigger pandas'
    # chained-assignment warnings.
    cleaned = df.dropna().drop_duplicates().copy()
    cleaned.columns = cleaned.columns.str.strip()

    # Flow identifier: source and destination address joined by a dash.
    cleaned['uid'] = (
        cleaned['Source IP'].str.strip() + "-" + cleaned['Destination IP'].str.strip()
    )

    # Chronological order is what the time-window grouping expects to see.
    cleaned['Timestamp'] = pd.to_datetime(cleaned['Timestamp'])
    cleaned = cleaned.sort_values('Timestamp')

    # Upper-casing makes label comparison case-insensitive across files.
    cleaned['Label'] = cleaned['Label'].astype(str).str.strip().str.upper()
    return cleaned

# Clean both splits with the shared routine (training split first).
df_train, df_test = map(common_preprocessing, (df_train, df_test))

for caption, frame in (("Train dataset size:", df_train),
                       ("Test dataset size:", df_test)):
    print(caption, frame.shape)

####################################
# 2) Set Parameters (Refer to Table 1 in the paper)
####################################
# Width T of each analysis window, as a pandas frequency string.
time_interval = '2min'
# βlower: a window whose system entropy falls below this is examined further.
beta_lower = 0.2
# δsusp: entropy-ratio cut-off applied to uids in the suspicious cluster.
delta_susp = 0.15
# δattack: entropy-ratio cut-off applied to uids in the attacker cluster.
delta_attack = 0.10

####################################
# 3) Entropy Calculation Function (Shannon Entropy)
####################################
def shannon_entropy(freqs):
    """Return the Shannon entropy, in bits, of a frequency distribution.

    Args:
        freqs: Array-like of non-negative counts (or weights), one entry
            per category. The values are normalised to probabilities
            internally, so they do not need to sum to 1.

    Returns:
        ``sum(p * log2(1 / p))`` over the categories with ``p != 0``, as a
        float. The result is 0.0 when the counts sum to zero (which
        includes empty input) and exactly 0.0 for a single-category
        distribution.
    """
    counts = np.asarray(freqs, dtype=float)
    total = counts.sum()
    if total == 0:
        return 0.0

    probs = counts / total
    # A zero-probability category contributes nothing (p*log2(p) -> 0 as
    # p -> 0). Dropping those entries avoids log2(0) without adding an
    # epsilon inside the logarithm, which would bias the result and can
    # push it slightly below zero.
    probs = probs[probs != 0]
    # Written as p*log2(1/p) so every term, and therefore the sum, is >= 0.
    return float(np.sum(probs * np.log2(1.0 / probs)))

####################################
# 4) Data Processing Function (Two-stage Attacker Detection by Time Interval, Multiclass Classification)
####################################
def _flag_attacker_uids(user_counts):
    """Return the uids of one low-entropy window that are judged attackers.

    The uids are clustered by request count with KMeans (up to 3 clusters).
    Clusters are ranked by mean count: the lowest is the benign baseline,
    the highest is the attacker cluster and anything in between is the
    suspicious cluster. A non-benign uid is flagged when the entropy of
    "baseline counts + this uid's count", divided by the baseline entropy,
    falls below the threshold for its cluster (``delta_susp`` or
    ``delta_attack``, read from module level).

    Args:
        user_counts: Series indexed by uid holding the number of requests
            each uid made in the window.

    Returns:
        A set of uid values; empty when fewer than 3 uids are active.
    """
    # With fewer than 3 active uids clustering is not meaningful.
    if len(user_counts) < 3:
        return set()

    uid_df = pd.DataFrame({
        'uid': user_counts.index,
        'count': user_counts.values,
    })
    uid_df['count_log'] = np.log1p(uid_df['count'])
    features = StandardScaler().fit_transform(uid_df[['count', 'count_log']])

    # KMeans cannot produce more clusters than there are distinct points.
    # Both features are functions of `count`, so the number of distinct
    # counts bounds the number of clusters. Asking for more would yield
    # fewer than 3 labels and break the 3-way ranking below.
    n_clusters = min(3, uid_df['count'].nunique())
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    uid_df['cluster'] = kmeans.fit_predict(features)

    # Rank clusters by mean count: low -> benign, high -> attackers,
    # anything in between -> suspicious. BENIGN is assigned last so a
    # single-cluster result is treated as entirely benign.
    ranked = uid_df.groupby('cluster')['count'].mean().sort_values().index.tolist()
    cluster_label_map = {cluster_id: 'SUSPICIOUS' for cluster_id in ranked}
    cluster_label_map[ranked[-1]] = 'ATTACKERS'
    cluster_label_map[ranked[0]] = 'BENIGN'
    uid_df['cluster_label'] = uid_df['cluster'].map(cluster_label_map)

    # Entropy of the benign baseline (H_normal).
    normal_uids = uid_df.loc[uid_df['cluster_label'] == 'BENIGN', 'uid']
    normal_values = user_counts.loc[normal_uids].to_numpy()
    has_baseline = normal_values.sum() > 0
    H_normal = shannon_entropy(normal_values) if has_baseline else 0

    thresholds = {'SUSPICIOUS': delta_susp, 'ATTACKERS': delta_attack}
    attackers = set()
    candidates = zip(uid_df['uid'], uid_df['count'], uid_df['cluster_label'])
    for uid, count, label in candidates:
        if label == 'BENIGN':
            continue
        if has_baseline:
            # Entropy after adding this uid to the baseline, relative to
            # the baseline alone. The small constant keeps the division
            # defined when the baseline entropy is 0.
            combined = np.append(normal_values, count)
            ratio = shannon_entropy(combined) / (H_normal + 1e-10)
        else:
            ratio = 1.0
        if ratio < thresholds[label]:
            attackers.add(uid)
    return attackers


def process_data(df):
    """Run the two-stage attacker detection window by window.

    Rows are grouped into consecutive windows of length ``time_interval``
    on the ``Timestamp`` column. Every row starts out predicted "BENIGN".
    A window whose per-uid request-count entropy is below ``beta_lower``
    is analysed further: its uids are clustered and entropy-tested, and
    every row of a uid judged to be an attacker is predicted as that
    uid's most frequent ``Label`` within the window.

    NOTE: the predicted class of a flagged uid is taken from the
    ground-truth ``Label`` column. Downstream metrics therefore measure
    how well attackers are separated from benign traffic, not how well
    attack types are told apart.

    The parameters ``time_interval``, ``beta_lower``, ``delta_susp`` and
    ``delta_attack`` are read from module level.

    Args:
        df: DataFrame with a datetime ``Timestamp`` column and ``uid`` and
            ``Label`` columns.

    Returns:
        A copy of the rows that fall into a window, with an added
        ``predicted_label`` column, sorted by index. If no window contains
        any row, an empty frame with the same columns plus
        ``predicted_label`` is returned.
    """
    predictions = []

    for _, window_data in df.groupby(pd.Grouper(key='Timestamp', freq=time_interval)):
        if window_data.empty:
            continue
        window_data = window_data.copy()

        # Default: everything in the window is benign.
        window_data['predicted_label'] = "BENIGN"

        # Stage 1: entropy of the request distribution over uids.
        user_counts = window_data.groupby('uid').size()
        H_system = shannon_entropy(user_counts.values)

        # Stage 2: only low-entropy (suspicious) windows are examined.
        if H_system < beta_lower:
            attacker_uids = _flag_attacker_uids(user_counts)
            if attacker_uids:
                is_attacker = window_data['uid'].isin(list(attacker_uids))
                # Most frequent true label per flagged uid in this window.
                uid_label_mode = (
                    window_data.loc[is_attacker]
                    .groupby('uid')['Label']
                    .agg(lambda labels: labels.mode().iloc[0])
                )
                window_data['predicted_label'] = (
                    window_data['uid'].map(uid_label_mode).where(is_attacker, "BENIGN")
                )

        predictions.append(window_data)

    if not predictions:
        # pd.concat([]) raises; return an empty, correctly shaped frame.
        return df.iloc[:0].assign(predicted_label="BENIGN")
    return pd.concat(predictions).sort_index()

####################################
# 5) Process and Evaluate the Training Dataset (Multiclass)
####################################
def _evaluate_predictions(header, scored):
    """Compute and print multiclass metrics for one scored split.

    The original ``Label`` column is the ground truth and
    ``predicted_label`` is the prediction.

    Returns:
        Tuple ``(confusion matrix, accuracy, classification report)``.
    """
    y_true = scored['Label']
    y_pred = scored['predicted_label']
    matrix = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)

    print(header)
    print("Confusion Matrix (Rows: True, Columns: Predicted):")
    print(matrix)
    print("Accuracy: {:.2f}%".format(accuracy * 100))
    print(report)
    return matrix, accuracy, report


df_train_pred = process_data(df_train)
cm_train, accuracy_train, report_train = _evaluate_predictions(
    "\n--- Train Dataset Results ---", df_train_pred
)

####################################
# 6) Process and Evaluate the Test Dataset (Multiclass)
####################################
df_test_pred = process_data(df_test)
cm_test, accuracy_test, report_test = _evaluate_predictions(
    "\n--- Test Dataset Results ---", df_test_pred
)

import matplotlib.pyplot as plt
import seaborn as sns

# Plots the test-set confusion matrix (cm_test, computed earlier from
# df_test_pred['Label'] and df_test_pred['predicted_label']) as raw counts
# and as row-normalised fractions, and saves both figures as PNG files.

# Tick labels must follow the matrix's own row/column order. When
# confusion_matrix() is called without `labels=`, it uses the sorted union
# of the true and predicted labels, so the names are built the same way
# rather than taken in order of first appearance in the data.
class_names = sorted(
    set(df_test_pred['Label']) | set(df_test_pred['predicted_label'])
)

plt.figure(figsize=(15, 10))
sns.heatmap(cm_test, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix (Test Dataset)")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
# Call plt.show() here to display the figure interactively.
plt.savefig("confusion_matrix_test.png")

# Normalize each row (true label): divide each cell by the row sum.
# A row with no true samples is left at 0 instead of dividing by zero.
row_sums = cm_test.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm_test,
    row_sums,
    out=np.zeros(cm_test.shape, dtype=float),
    where=row_sums != 0,
)

plt.figure(figsize=(12, 10))
sns.heatmap(cm_norm, annot=True, fmt=".2f", cmap="Blues",
            xticklabels=class_names, yticklabels=class_names)
plt.title("Normalized Confusion Matrix (Test Dataset)")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.tight_layout()
# Call plt.show() here to display the figure interactively.
plt.savefig("confusion_matrix_test_normalized.png")
