In [None]:
import os
import sys
# Make the repository's `src` folder importable from this notebook.
#
# The search starts at the PARENT of the current working directory (the
# notebook's own folder name is ignored) and keeps every path component up to
# and including the first one named "src".
# `os.sep` is used instead of a hard-coded backslash so the split also works
# on non-Windows systems.
current_dir = os.path.dirname(os.getcwd()).split(os.sep)

if "src" not in current_dir:
    # Same exception type that list.index() would raise, with a clearer message.
    raise ValueError(
        f"No 'src' directory found above the working directory {os.getcwd()!r}"
    )

# "/" is accepted as a separator on Windows and POSIX; abspath normalizes it.
target_folder = "/".join(current_dir[:current_dir.index('src')+1])
src_path = os.path.abspath(target_folder)

# Skip the append when the entry is already present so re-running this cell
# does not grow sys.path.
if src_path not in sys.path:
    sys.path.append(src_path)

In [None]:
import glob
import pandas as pd

In [None]:
# Directory containing the raw CSV files to load.
# Written as a raw string: the Windows path is full of backslashes, and
# sequences such as "\V" or "\d" are invalid escapes in a normal string
# literal (SyntaxWarning today, an error in a future Python version).
CSV_DIR = r"C:\VScode_Projects\DP\datasets\CIC-DDoS-2019\raw\csv\03-11"

# Columns to load from each CSV file. These names are passed to `usecols`,
# which matches header names exactly, so the leading spaces are kept
# (presumably they mirror the raw file headers - verify against the data).
# `read_csv` strips the whitespace from the column names after loading.
COLS = [
    "Flow ID",
    " Source IP",
    " Source Port",
    " Destination IP",
    " Destination Port",
    " Protocol",
    " Timestamp",
    " Flow Duration",
    " Label",
]

In [None]:
def read_csv(dir: str, cols: list = None):
    """Load every ``*.csv`` file in a directory into one DataFrame.

    The files are read in sorted (name) order, concatenated with a fresh
    0..n-1 index, and then cleaned:

    * surrounding whitespace is stripped from all column names,
    * rows whose "Flow ID" is missing are dropped,
    * the "Timestamp" column is parsed with ``pd.to_datetime``.

    Args:
        dir: Directory to search (non-recursively) for ``*.csv`` files.
            The name shadows the ``dir`` builtin but is kept so existing
            keyword callers keep working.
        cols: Column names to load, spelled exactly as in the file headers
            (i.e. before whitespace stripping). ``None`` or an empty list
            loads every column. The selection must include the columns that
            become "Flow ID" and "Timestamp" after stripping.

    Returns:
        A ``(df, min_timestamp)`` tuple: the cleaned DataFrame and the
        smallest value of its "Timestamp" column.

    Raises:
        ValueError: If ``dir`` contains no ``*.csv`` files.
        KeyError: If "Flow ID" or "Timestamp" is absent after loading.
    """
    print("READING CSV FILES...")

    extension = "*.csv"
    # glob returns matches in arbitrary, OS-dependent order; sorting makes the
    # row order of the merged frame reproducible between runs and machines.
    files = sorted(glob.glob(f"{dir}/{extension}"))
    if not files:
        # pd.concat would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No CSV files found in {dir!r}")

    dfs = []
    for file in files:
        print(f"READING {file}")
        # An empty/None `cols` means "load all columns".
        dfs.append(pd.read_csv(file, usecols=cols or None))

    print("MERGING CSV FILES...")
    df = pd.concat(dfs, ignore_index=True)
    df.columns = df.columns.str.strip()

    print('DROPPING ROWS WITH MISSING "Flow ID"...')
    # The index is unique here (ignore_index=True above), so dropping by
    # index label removes exactly the rows with a missing "Flow ID".
    missing_flow_id = df["Flow ID"].isna()
    df = df.drop(index=df.index[missing_flow_id])

    # Parsed after the drop so discarded rows do not influence the minimum.
    df["Timestamp"] = pd.to_datetime(df["Timestamp"])
    min_timestamp = df["Timestamp"].min()

    return df, min_timestamp

In [None]:
# Load the selected columns from every CSV in CSV_DIR.
# The second return value gets a real name: assigning to `_` in IPython
# shadows the built-in "last output" variable for the rest of the session.
df, min_timestamp = read_csv(CSV_DIR, COLS)
df

In [None]:
# Give the flow start time and flow duration the short names used by the
# labeling step further down ("stime" and "dur").
df = df.rename(
    {"Timestamp": "stime", "Flow Duration": "dur"},
    axis="columns",
)

In [None]:
# Convert the flow start times from pandas Timestamps to POSIX seconds
# (floats) so they can be compared numerically later on.
# The dtype guard makes the cell safe to re-run: once the column holds floats,
# calling `.timestamp()` on them would raise AttributeError.
# NOTE(review): for timezone-naive values pandas' Timestamp.timestamp()
# interprets the value as UTC - confirm that this matches the capture clock.
if pd.api.types.is_datetime64_any_dtype(df["stime"]):
    df["stime"] = df["stime"].apply(lambda x: x.timestamp())

In [None]:
# Preview the first five rows (5 is the default for head()).
df.head()

In [None]:
# Label the rows of the unlabeled sample by matching them against the flow
# records in `df`, then write a class-capped result to OUTPUT_CSV.
#
# Raw strings are used for the Windows paths: sequences such as "\V" or "\c"
# are invalid escapes in a normal string literal.
SAMPLE_CSV = r"C:\VScode_Projects\DP\datasets\CIC-DDoS-2019\clean\sample.csv"
OUTPUT_CSV = r"C:\VScode_Projects\DP\datasets\CIC-DDoS-2019\clean\labeled_sample.csv"

# Maximum number of rows written for each capped label.
TARGET_UDP = 100000
TARGET_LAG = 100000
TARGET_BENIGN = 100000

# Connection key of a sample row, and the flow-record columns it is matched
# against in each direction (forward: sample src == flow source; reverse:
# sample src == flow destination).
SAMPLE_KEYS = ["src_ip", "src_p", "dst_ip", "dst_p", "protocol.1"]
FLOW_KEYS_FORWARD = ["Source IP", "Source Port", "Destination IP", "Destination Port", "Protocol"]
FLOW_KEYS_REVERSE = ["Destination IP", "Destination Port", "Source IP", "Source Port", "Protocol"]

# Columns needed only for matching; removed before the rows are written.
HELPER_COLUMNS = [
    "stime", "dur", "timestamp", "Flow ID", "protocol.1",
    "Source IP", "Source Port", "Destination IP", "Destination Port", "Protocol",
]


def _label_chunk(chunk, flows):
    """Return the rows of `chunk` that fall inside a matching flow, with a `label` column."""
    forward = chunk.merge(flows, how="left", left_on=SAMPLE_KEYS, right_on=FLOW_KEYS_FORWARD)
    reverse = chunk.merge(flows, how="left", left_on=SAMPLE_KEYS, right_on=FLOW_KEYS_REVERSE)
    candidates = pd.concat([forward, reverse]).drop_duplicates()

    # Keep a candidate only if the sample's timestamp lies inside the flow's
    # [stime, stime + dur] window. Rows without any matching flow have NaN in
    # `stime`, fail the comparison, and are dropped here as well.
    # NOTE(review): `stime` was converted to epoch seconds earlier in the
    # notebook, while `dur` is the untouched "Flow Duration" column.
    # CICFlowMeter reportedly records durations in microseconds - TODO confirm
    # the units of `dur` and `timestamp`, and rescale if they differ.
    window_end = candidates["stime"] + candidates["dur"]
    inside_flow = (candidates["stime"] <= candidates["timestamp"]) & (
        candidates["timestamp"] <= window_end
    )

    return (
        candidates[inside_flow]
        .drop(columns=HELPER_COLUMNS)
        .rename(columns={"Label": "label"})
    )


def _append_capped(rows, written, target):
    """Append up to `target - written` of `rows` to OUTPUT_CSV and return the new total."""
    remaining = target - written
    if remaining <= 0:
        return written
    selected = rows.head(remaining)
    selected.to_csv(OUTPUT_CSV, mode='a', header=False, index=False)
    return written + len(selected)


# Running totals of rows written per capped label.
udp_count = 0
lag_count = 0
benign_count = 0

header_written = False
for chunk in pd.read_csv(SAMPLE_CSV, chunksize=100_000):
    labeled = _label_chunk(chunk, df)

    # Create/truncate the output file with a header-only frame on the first chunk.
    if not header_written:
        pd.DataFrame(columns=labeled.columns).to_csv(OUTPUT_CSV, mode='w', index=False)
        header_written = True

    # Split into categories.
    udp_mask = labeled['label'] == 'UDP'
    lag_mask = labeled['label'] == 'UDPLag'
    benign_mask = labeled['label'] == 'BENIGN'
    # "Other" must exclude all three capped labels. Leaving out `lag_mask`
    # would write every UDPLag row a second time, uncapped.
    other_mask = ~(udp_mask | lag_mask | benign_mask)

    # Capped labels: stop writing once the target has been reached.
    udp_count = _append_capped(labeled[udp_mask], udp_count, TARGET_UDP)
    lag_count = _append_capped(labeled[lag_mask], lag_count, TARGET_LAG)
    benign_count = _append_capped(labeled[benign_mask], benign_count, TARGET_BENIGN)

    # Every other label is written in full.
    labeled[other_mask].to_csv(OUTPUT_CSV, mode='a', header=False, index=False)
    

In [None]:
# Read the labeled output back for a sanity check.
# Raw string: the backslashes in the Windows path would otherwise form
# invalid escape sequences ("\V", "\D", "\c", "\l", ...).
df_check = pd.read_csv(r"C:\VScode_Projects\DP\datasets\CIC-DDoS-2019\clean\labeled_sample.csv")

In [None]:
# Distinct labels present in the written file.
df_check["label"].unique()

In [None]:
# Number of rows per label in the written file.
df_check["label"].value_counts()