In [None]:
# This file handles the data. 
# The datasets are not included in the submission as they are too huge. Refer to the references if you need to retrive original datasets.
import os
import pandas as pd
import random

def sample_large_csv(file_path, sample_frac=0.3):
    with open(file_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for line in f) - 1  


    sample_size = int(total_lines * sample_frac)
    sampled_indices = sorted(random.sample(range(1, total_lines + 1), sample_size))  


    df_sample = pd.read_csv(
        file_path,
        skiprows=lambda i: i != 0 and i not in sampled_indices 
    )
    return df_sample

def process_folder(folder_path, sample_frac=0.3):
    all_samples = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            print(f'Processing {file_path} ...')
            df = sample_large_csv(file_path, sample_frac)
            all_samples.append(df)

    result = pd.concat(all_samples, ignore_index=True)
    return result


folder = '2018'
combined_df = process_folder(folder, sample_frac=0.08) # 2018 0.3/4 = 0.08

combined_df.to_csv('2018.csv', index=False)

Processing 2018\02-14-2018.csv ...
Processing 2018\02-15-2018.csv ...
Processing 2018\02-23-2018.csv ...
Processing 2018\02-28-2018.csv ...


  df = sample_large_csv(file_path, sample_frac)


In [2]:
df_2017 = pd.read_csv("2017.csv")
df_2018 = pd.read_csv("2018.csv")
df_2019 = pd.read_csv("2019.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
import re, pandas as pd

# Clean function, to resovle ambiguis context and make them same descriptions

ABBR = {
    r"\bpkt\b": "packet",
    r"\bpkt[s]?\b": "packets",
    r"\bpkts\b": "packets",
    r"\blen\b": "length",
    r"\bbyt[s]?\b": "bytes",
    r"\bseg\b": "segment",
    r"\bwin\b": "window",
    r"\bavg\b": "avg",
    r"\bvar\b": "variance",
    r"\bstd\b": "std",
    r"\bmin\b": "min",
    r"\bmax\b": "max",
    r"\bblk\b": "bulk",
    r"\bcnt\b": "count",
    r"\bact\b": "act",
    r"\btot\b": "total",
    r"\bs/s\b": "s",
}

def clean(col):
    col = col.strip() 
    col = re.sub(r"[./]", " ", col)
    col = re.sub(r"\s+", " ", col) 
    col = col.lower()
    for pat, rep in ABBR.items():
        col = re.sub(pat, rep, col)
    col = col.split()
    return col[0] + "".join(w.capitalize() for w in col[1:])


MANUAL = {
    "finFlagCnt":        "finFlagCount",
    "synFlagCnt":        "synFlagCount",
    "pshFlagCnt":        "pshFlagCount",
    "rstFlagCnt":        "rstFlagCount",
    "ackFlagCnt":        "ackFlagCount",
    "urgFlagCnt":        "urgFlagCount",
    "pktLenMean":        "packetLengthMean",
    "pktLenStd":         "packetLengthStd",
    "pktLenVar":         "packetLengthVariance",
    "pktLenMin":         "packetLengthMin",
    "pktLenMax":         "packetLengthMax",
    "pktSizeAvg":        "avgPacketSize",
    "totlenFwdPkts":     "totalLengthFwdPackets",
    "totlenBwdPkts":     "totalLengthBwdPackets",
    "bwdPacketsLengthTotal": "totalLengthBwdPackets",
    "fwdPacketsLengthTotal": "totalLengthFwdPackets",
    "flowBytsS":         "flowBytesPerSecond",
    "flowPktsS":         "flowPacketsPerSecond",
    "initFwdWinByts":    "initWinBytesForward",
    "initBwdWinByts":    "initWinBytesBackward",
    "initBwdWinBytes":   "initWinBytesBackward",
    "fwdActDataPkts":    "fwdActDataPackets",
    "fwdSegSizeAvg":     "avgFwdSegmentSize",
    "bwdSegSizeAvg":     "avgBwdSegmentSize",
    "fwdSegSizeMin":     "minSegSizeForward",
    "dstPort":           "destinationPort",
    "class":             "label",
    "timestamp":         "timeStamp",
    "unnamed0":          None,
}

def harmonize(cols):
    cols = [clean(c) for c in cols]
    return [MANUAL.get(c, c) for c in cols if MANUAL.get(c, c) is not None]


def load_and_align(path):
    df = pd.read_csv(path)
    new_cols = harmonize(df.columns)
    df.columns = new_cols
    return df

df17 = load_and_align("2017.csv")
df18 = load_and_align("2018.csv")
df19 = load_and_align("2019.csv")


print(len(df17.columns), len(df18.columns), len(df19.columns))
print(set(df17.columns) ^ set(df18.columns) ^ set(df19.columns)) 


  df18 = load_and_align("2018.csv")


79 80 80
{'pshFlagCount', 'subflowFwdBytes', 'fwdHeaderLength1', 'minPacketLength', 'activeMin', 'avgPacketSize', 'idleMean', 'bwdHeaderLength', 'packetLengthVariance', 'urgFlagCount', 'totlenBwdPackets', 'idleStd', 'eceFlagCount', 'totalFwdPackets', 'fwdUrgFlags', 'activeStd', 'flowIatMean', 'ackFlagCount', 'packetLengthMean', 'min_seg_size_forward', 'fwdPacketsS', 'subflowBwdBytes', 'totalLengthFwdPackets', 'bwdPacketsBAvg', 'totalLengthBwdPackets', 'fwdIatTotal', 'fwdPacketLengthMax', 'fwdHeaderLength', 'idleMax', 'fwdPacketsBAvg', 'totalLengthOfBwdPackets', 'fwdIatMax', 'init_win_bytes_forward', 'bwdPacketLengthStd', 'bwdPacketLengthMax', 'flowIatMax', 'fwdBulkRateAvg', 'bwdBulkRateAvg', 'flowBytesS', 'maxPacketLength', 'fwdPacketLengthMin', 'bwdPshFlags', 'fwdIatStd', 'bwdBytesBAvg', 'fwdIatMean', 'totalLengthOfFwdPackets', 'bwdIatTotal', 'fwdPacketLengthMean', 'timeStamp', 'unnamed:0', 'flowDuration', 'subflowBwdPackets', 'fwdSegmentSizeAvg', 'fwdIatMin', 'flowIatMin', 'packetLen

In [None]:
print("CIC-2017 Columns:")
print(set(df_2017.columns))

print("CIC-2018 Columns:")
print(set(df_2018.columns))

print("CIC-2019 Columns:")
print(set(df_2019.columns))

print("common:")
print(df17.columns & df18.columns & df19.columns)

CIC-2017 Columns:
{' act_data_pkt_fwd', ' Packet Length Mean', ' Fwd Avg Bulk Rate', ' Fwd Avg Packets/Bulk', ' Fwd Packet Length Max', ' Average Packet Size', ' Total Length of Bwd Packets', ' Bwd IAT Min', ' Bwd Packet Length Mean', ' min_seg_size_forward', ' Total Fwd Packets', ' Subflow Fwd Bytes', ' Packet Length Variance', ' Bwd Packet Length Min', 'Idle Mean', ' Destination Port', ' CWE Flag Count', ' Avg Fwd Segment Size', ' Bwd URG Flags', 'FIN Flag Count', 'Total Length of Fwd Packets', ' Fwd URG Flags', ' Fwd IAT Max', 'Fwd PSH Flags', ' PSH Flag Count', ' Subflow Bwd Bytes', 'Fwd Avg Bytes/Bulk', ' ECE Flag Count', ' Idle Std', ' Fwd IAT Min', 'Active Mean', ' Fwd Header Length.1', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Flow IAT Max', ' Min Packet Length', ' Max Packet Length', ' Fwd Header Length', ' Bwd Packet Length Std', ' Flow IAT Min', ' Total Backward Packets', ' Flow IAT Mean', ' Bwd Header Length', ' Bwd Avg Packets/Bulk', ' Avg Bwd Segment Size', '

  print(df17.columns & df18.columns & df19.columns)


In [None]:
common_cols = sorted(list(set(df17.columns) & set(df18.columns) & set(df19.columns)))


df17 = df17[common_cols]
df18 = df18[common_cols]
df19 = df19[common_cols]

In [13]:
df17.to_csv("c2017.csv", index=False)
df18.to_csv("c2018.csv", index=False)
df19.to_csv("c2019.csv", index=False)

In [24]:
df17 = pd.read_csv("c2017.csv")
df18 = pd.read_csv("c2018.csv")
df19 = pd.read_csv("c2019.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)
