In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
data_dir = Path("data/cic-ids2018/")

# Move all CSV files into a raw subdirectory
raw_dir = data_dir / "raw"
raw_dir.mkdir(exist_ok=True)

for file in data_dir.glob("*.csv"):
    print(f"Moving {file} to {raw_dir}...")
    file.rename(raw_dir / file.name)

# Load all CSV files from the raw directory
all_files = sorted(raw_dir.glob("*.csv"))
all_dfs = []
for filename in all_files:
    print(f"Loading {filename}...")
    all_dfs.append(pd.read_csv(filename, low_memory=False))

df: pd.DataFrame = pd.concat(all_dfs, ignore_index=True)
df

Loading data/cic-ids2018/raw/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv...
Loading data/cic-ids2018/raw/Friday-16-02-2018_TrafficForML_CICFlowMeter.csv...
Loading data/cic-ids2018/raw/Friday-23-02-2018_TrafficForML_CICFlowMeter.csv...
Loading data/cic-ids2018/raw/Thuesday-20-02-2018_TrafficForML_CICFlowMeter.csv...
Loading data/cic-ids2018/raw/Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv...
Loading data/cic-ids2018/raw/Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv...
Loading data/cic-ids2018/raw/Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv...
Loading data/cic-ids2018/raw/Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv...
Loading data/cic-ids2018/raw/Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv...


Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label,Flow ID,Src IP,Src Port,Dst IP
0,443,6,02/03/2018 08:47:38,141385,9,7,553,3773.0,202,0,...,0.0,0.0,0.0,0.0,0.0,Benign,,,,
1,49684,6,02/03/2018 08:47:38,281,2,1,38,0.0,38,0,...,0.0,0.0,0.0,0.0,0.0,Benign,,,,
2,443,6,02/03/2018 08:47:40,279824,11,15,1086,10527.0,385,0,...,0.0,0.0,0.0,0.0,0.0,Benign,,,,
3,443,6,02/03/2018 08:47:40,132,2,0,0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,Benign,,,,
4,443,6,02/03/2018 08:47:41,274016,9,13,1285,6141.0,517,0,...,0.0,0.0,0.0,0.0,0.0,Benign,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15184422,23,6,28/02/2018 11:59:12,3,1,1,0,0,0,0,...,0,0,0,0,0,Infilteration,,,,
15184423,425,6,28/02/2018 10:50:04,2,1,1,0,0,0,0,...,0,0,0,0,0,Infilteration,,,,
15184424,445,6,28/02/2018 12:52:55,732728,2,2,0,0,0,0,...,0,0,0,0,0,Benign,,,,
15184425,23,6,28/02/2018 11:10:50,22,1,1,0,0,0,0,...,0,0,0,0,0,Infilteration,,,,


In [3]:
identifier_cols = ["Flow ID", "Src IP", "Src Port", "Dst IP", "Timestamp"]
df = df.drop(columns=identifier_cols)

# Remove zero-variance columns
nunique = df.nunique()
cols_to_drop = nunique[nunique == 1].index
if len(cols_to_drop) > 0:
    df = df.drop(columns=cols_to_drop)
    print(f"Dropped zero-variance columns: {list(cols_to_drop)}")

# Convert feature columns to numerics
feature_cols = [c for c in df.columns if c not in ["Label"]]
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce')

# Remove rows with inf/-inf/NaN values
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()

# Fix label typos and remove rows with missing labels
df["Fine Label"] = df["Label"].replace({"Infilteration": "Infiltration", "SQL Injection": np.nan, "Label": np.nan})
df = df.dropna(subset=["Fine Label"])

# Aggregate labels into broader attack categories
label_mapping = {
    'Benign': 'Benign',
    'DoS attacks-Hulk': 'DoS',
    'DoS attacks-SlowHTTPTest': 'DoS',
    'DoS attacks-GoldenEye': 'DoS',
    'DoS attacks-Slowloris': 'DoS',
    'DDOS attack-HOIC': 'DDoS',
    'DDoS attacks-LOIC-HTTP': 'DDoS',
    'DDOS attack-LOIC-UDP': 'DDoS',
    'Bot': 'Bot',
    'Brute Force -Web': 'Brute Force',
    'Brute Force -XSS': 'Brute Force',
    'Infiltration': 'Infiltration',
}

df['Label'] = df['Fine Label'].map(label_mapping)
df.dropna(subset=['Label'], inplace=True)

processed_dir = data_dir / "processed"
processed_dir.mkdir(exist_ok=True)
df.to_csv(processed_dir / "cleaned.csv", index=False)

In [8]:
df.columns

Index(['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
       'Fwd Seg Siz

In [7]:
# identifier_cols = ["Src IP", "Src Port", "Dst IP", "Timestamp"]
# df = df.drop(columns=identifier_cols)
df.to_csv(processed_dir / "cleaned.csv", index=False)

In [None]:
# Remove identifier columns
df = df.drop(columns=identifier_cols)

# Remove zero-variance columns
nunique = df.nunique()
cols_to_drop = nunique[nunique == 1].index
if len(cols_to_drop) > 0:
    df = df.drop(columns=cols_to_drop)
    print(f"Dropped zero-variance columns: {list(cols_to_drop)}")

# Convert feature columns to numerics
feature_cols = [c for c in df.columns if c != "Label"]
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors='coerce')

# Remove rows with inf/-inf/NaN values
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()

# One-hot enccode categorical features
cat_columns = ["Protocol"]
df = pd.get_dummies(df, columns=cat_columns, dtype=int)

# Scale features
scaler = MinMaxScaler()
feature_cols = [c for c in df.columns if c != "Label"]
df[feature_cols] = scaler.fit_transform(df[feature_cols])
df[feature_cols] = df[feature_cols].astype(np.float32)
df

# Map labels to higher-level attacks
label_mapping = {
    'Benign': 'Benign',
    'DoS attacks-Hulk': 'DoS',
    'DoS attacks-SlowHTTPTest': 'DoS',
    'DoS attacks-GoldenEye': 'DoS',
    'DoS attacks-Slowloris': 'DoS',
    'DDOS attack-HOIC': 'DDoS',
    'DDoS attacks-LOIC-HTTP': 'DDoS',
    'DDOS attack-LOIC-UDP': 'DDoS',
    'Bot': 'Bot',
    'Brute Force -Web': 'Brute Force',
    'Brute Force -XSS': 'Brute Force',
    'Infiltration': 'Infiltration',
}

df['Label'] = df['Label'].map(label_mapping)
df.dropna(subset=['Label'], inplace=True)

In [None]:
# Split into training and testing sets
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['Label']
)

# Drop 80% of the "Benign" samples from the training set to address class imbalance
benign_mask = train_df['Label'] == 'Benign'
benign_indices = train_df[benign_mask].index
n_benign_to_drop = int(0.8 * len(benign_indices))
indices_to_drop = np.random.choice(benign_indices, size=n_benign_to_drop, replace=False)
train_df = train_df.drop(index=indices_to_drop)

# Save preprocessed datasets
preprocessed_dir = data_dir / "preprocessed"
preprocessed_dir.mkdir(exist_ok=True)
train_df.to_csv(preprocessed_dir / "train.csv", index=False)
test_df.to_csv(preprocessed_dir / "test.csv", index=False)

In [None]:
from textwrap import dedent

def generate_prompt_from_row(row):
    attack_descriptions = {
        'DoS attacks-Slowloris': "Slowloris exhausts a server's resources by holding many connections open with partial HTTP requests using minimal bandwidth.",
        'DoS attacks-Hulk': "Hulk generates large volumes of HTTP requests with random parameters to overwhelm the server.",
        'DoS attacks-GoldenEye': "GoldenEye sends malformed HTTP requests at high frequency to degrade server responsiveness.",
        'DoS attacks-SlowHTTPTest': "SlowHTTPTest sends legitimate-looking HTTP traffic very slowly to keep server connections open indefinitely.",
        'DDOS attack-HOIC': "HOIC floods web servers with HTTP requests from multiple machines using booster scripts to randomize targets.",
        'DDoS attacks-LOIC-HTTP': "LOIC-HTTP overwhelms web services with multi-threaded GET/POST requests in distributed fashion.",
        'DDOS attack-LOIC-UDP': "LOIC-UDP floods targets with high-rate UDP packets from multiple machines.",
        'Bot': "This flow is from a machine infected with Zeus or Ares botnet, which periodically exfiltrate data or perform keylogging and remote commands.",
        'Brute Force -Web': "The attacker tried different username/password combinations on a web login interface in an automated fashion.",
        'Brute Force -XSS': "This attack injected malicious JavaScript to exploit cross-site scripting vulnerabilities.",
        'Infiltration': "A compromised internal host scanned internal network resources after being exploited through a malicious document.",
        'Benign': "This flow represents normal activity in a typical corporate network, such as file transfers, browsing, or background services."
    }

    label = row["Label"]
    fine_label = row["Fine Label"]
    desc = attack_descriptions.get(fine_label, "No description available.")

    prompt = dedent(f"""You are a cybersecurity analyst. Your task is to explain why the following network flow is labeled as a {fine_label} attack. This flow is part of the CIC-IDS2018 dataset, which includes realistic attack scenarios and benign behavior.

    ### Ground Truth
    - Label (High-level): {label}
    - Label (Specific Variant): {fine_label}

    ### Background on Attack
    {desc}

    ### INSTRUCTIONS
    Only use the flow features and values below to construct your explanation. Do NOT reference IPs, timestamps, or any external knowledge. Justify the label using only flow-level behavioral characteristics.

    ### Flow Features
    - Dst Port: {row['Dst Port']}
    - Flow Duration: {row['Flow Duration']}
    - Tot Fwd Pkts: {row['Tot Fwd Pkts']}
    - TotLen Fwd Pkts: {row['TotLen Fwd Pkts']}
    - Fwd Pkt Len Max: {row['Fwd Pkt Len Max']}
    - Fwd Pkt Len Mean: {row['Fwd Pkt Len Mean']}
    - Fwd IAT Tot: {row['Fwd IAT Tot']}
    - Fwd IAT Mean: {row['Fwd IAT Mean']}
    - Fwd IAT Max: {row['Fwd IAT Max']}
    - Fwd IAT Min: {row['Fwd IAT Min']}
    - Flow IAT Min: {row['Flow IAT Min']}
    - Flow IAT Max: {row['Flow IAT Max']}
    - Flow IAT Mean: {row['Flow IAT Mean']}
    - Fwd Seg Size Min: {row['Fwd Seg Size Min']}
    - Fwd Seg Size Avg: {row['Fwd Seg Size Avg']}
    - Flow Pkts/s: {row['Flow Pkts/s']}
    - Fwd Pkts/s: {row['Fwd Pkts/s']}
    - Bwd Pkts/s: {row['Bwd Pkts/s']}
    - Fwd Header Len: {row['Fwd Header Len']}
    - Init Fwd Win Byts: {row['Init Fwd Win Byts']}
    - Init Bwd Win Byts: {row['Init Bwd Win Byts']}
    - Pkt Len Max: {row['Pkt Len Max']}
    - Subflow Fwd Byts: {row['Subflow Fwd Byts']}
    - Subflow Fwd Pkts: {row['Subflow Fwd Pkts']}

    ### Your Task
    Based on these features alone, write a brief explanation of why this flow is consistent with the {fine_label} label. Make sure your response includes which features specifically lead you to believe that it is the said attack or not.
    """)

    return prompt

generate_prompt_from_row(df[df["Label"] == "DDoS"].iloc[0])

'You are a cybersecurity analyst. Your task is to explain why the following network flow is labeled as a DDoS attacks-LOIC-HTTP attack. This flow is part of the CIC-IDS2018 dataset, which includes realistic attack scenarios and benign behavior.\n\n    ### Ground Truth\n    - Label (High-level): DDoS\n    - Label (Specific Variant): DDoS attacks-LOIC-HTTP\n\n    ### Background on Attack\n    LOIC-HTTP overwhelms web services with multi-threaded GET/POST requests in distributed fashion.\n\n    ### INSTRUCTIONS\n    Only use the flow features and values below to construct your explanation. Do NOT reference IPs, timestamps, or any external knowledge. Justify the label using only flow-level behavioral characteristics.\n\n    ### Flow Features\n    - Dst Port: 80\n    - Flow Duration: 3904.0\n    - Tot Fwd Pkts: 3.0\n    - TotLen Fwd Pkts: 20.0\n    - Fwd Pkt Len Max: 20.0\n    - Fwd Pkt Len Mean: 6.666666667\n    - Fwd IAT Tot: 3310.0\n    - Fwd IAT Mean: 1655.0\n    - Fwd IAT Max: 2937.0\n

In [13]:
df[df["Label"] == "DDoS"].iloc[0]

Dst Port                             80
Protocol                            6.0
Flow Duration                    3904.0
Tot Fwd Pkts                        3.0
Tot Bwd Pkts                        4.0
                          ...          
Idle Std                            0.0
Idle Max                            0.0
Idle Min                            0.0
Label                              DDoS
Fine Label       DDoS attacks-LOIC-HTTP
Name: 3145788, Length: 80, dtype: object

In [None]:
import pandas as pd
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from tqdm.auto import tqdm
import time
import numpy as np
from textwrap import dedent


MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
INPUT_DATA_PATH = "data/cic-ids2018/processed/cleaned.csv"
OUTPUT_DATA_PATH = "data/cic-ids2018/processed/attack_explanations.csv"
MAX_TIME_HOURS = 2.0  
BATCH_SIZE = 1


print("Loading cleaned data...")
df = pd.read_csv(INPUT_DATA_PATH, low_memory=False)

# Filter: Only Attacks (Label != Benign) TODO: I think we should also have a decent bit of benign examples in there?
print(f"Original shape: {df.shape}")
attack_df = df[df['Label'] != 'Benign'].copy()
print(f"Attack samples only: {attack_df.shape}")


print(f"Loading model: {MODEL_ID}...")
device = 0 if torch.cuda.is_available() else -1
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

pipe = pipeline(
    "text-generation",
    model=MODEL_ID,
    device=device,
    torch_dtype=dtype,
    trust_remote_code=True
)

print("\n--- Estimating Generation Time ---")
test_samples = attack_df.head(3).copy()
start_test = time.time()

for idx, row in test_samples.iterrows():
    prompt = generate_prompt_from_row(row)
    # Generate with constraints to speed up
    _ = pipe(prompt, max_new_tokens=200, do_sample=False, truncation=True)

end_test = time.time()
avg_time_per_sample = (end_test - start_test) / 3
total_samples = len(attack_df)
estimated_total_seconds = total_samples * avg_time_per_sample
estimated_total_hours = estimated_total_seconds / 3600



#Generation Loop
print(f"\nStarting generation for {len(attack_df)} samples...")
explanations = []

for idx, row in tqdm(attack_df.iterrows(), total=len(attack_df)):
    prompt = generate_prompt_from_row(row)
    
    # Generate
    outputs = pipe(
        prompt, 
        max_new_tokens=128, 
        do_sample=True, 
        temperature=0.7,
        top_p=0.9,
        truncation=True,
        return_full_text=False 
    )
    
    explanation = outputs[0]['generated_text'].strip()
    explanations.append(explanation)

#save results
attack_df['generated_explanation'] = explanations
attack_df.to_csv(OUTPUT_DATA_PATH, index=False)
print(f"\nDone! Explanations saved to {OUTPUT_DATA_PATH}")

# Print a few examples
print("\n--- Example Outputs ---")
for i in range(2):
    print(f"\n[Label: {attack_df.iloc[i]['Fine Label']}]")
    print(f"Explanation: {attack_df.iloc[i]['generated_explanation']}")

NameError: name 'train_df' is not defined