In [8]:
!ls raw_data

'Active Wiretap'  'Mirai Botnet'  'SSL Renegotiation'
'ARP MitM'	  'OS Scan'	  'SYN DoS'
 Fuzzing	  'SSDP Flood'	  'Video Injection'


In [1]:
import pandas as pd
import os

In [10]:
# Build a class-balanced random sample of one dataset folder and save it as CSV
def sample_data(folder_name, n_samples=7000, random_state=42):
    """Write a balanced benign/attack sample of one dataset folder to CSV.

    The folder ``./raw_data/<folder_name>`` is searched for a file ending in
    ``_dataset.csv`` (read without a header row) and a file ending in
    ``_labels.csv`` (read with row 0 as header and column 0 as index; the
    label column must be called ``x``).  Data and labels are inner-joined on
    their row index, exact duplicate rows are dropped, and ``n_samples`` rows
    are drawn *with replacement* from each class (label 0 is reported as
    benign, label 1 as attack).  The shuffled result is written, without the
    index, to ``<name>_sampled.csv`` in the current working directory, where
    ``<name>`` is ``folder_name`` lower-cased with spaces turned into
    underscores.

    Parameters
    ----------
    folder_name : str
        Name of the dataset folder inside ``./raw_data``.
    n_samples : int, default 7000
        Number of rows drawn from each class.
    random_state : int, default 42
        Seed used for both per-class draws and for the final shuffle.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If the folder, the dataset file or the labels file is missing.
    ValueError
        If ``n_samples`` is positive and one of the two classes has no rows.
    """
    print(folder_name, "==>")
    # Set the path to the folder
    folder_path = os.path.join('./raw_data', folder_name)

    # Search for data and label files in the folder
    data_file_path = ''
    label_file_path = ''
    for file_name in os.listdir(folder_path):
        if file_name.endswith('_dataset.csv'):
            data_file_path = os.path.join(folder_path, file_name)
        elif file_name.endswith('_labels.csv'):
            label_file_path = os.path.join(folder_path, file_name)

    # Fail with a precise message instead of letting read_csv('') complain
    # about an empty path.
    if not data_file_path:
        raise FileNotFoundError(f"No '*_dataset.csv' file found in {folder_path!r}")
    if not label_file_path:
        raise FileNotFoundError(f"No '*_labels.csv' file found in {folder_path!r}")

    # Load the labels: header is 0-th row, index is 0-th col
    labels = pd.read_csv(label_file_path, header=0, index_col=0)
    label_column_name = 'x'

    # Count the records with 0 and 1 in the label column.  Summing a boolean
    # mask yields 0 for an absent class, where value_counts()[k] would raise
    # a bare KeyError.
    n_0 = int((labels[label_column_name] == 0).sum())
    n_1 = int((labels[label_column_name] == 1).sum())

    print("   benign :", n_0)
    print("   attack :", n_1)

    # Fail fast, before the (potentially very large) data file is loaded.
    if n_samples > 0 and (n_0 == 0 or n_1 == 0):
        missing = "benign (label 0)" if n_0 == 0 else "attack (label 1)"
        raise ValueError(
            f"Cannot sample {n_samples} rows per class: "
            f"no {missing} rows in {label_file_path!r}"
        )

    # Load data
    print("   Loading data...")
    data = pd.read_csv(data_file_path, header=None)

    # Merge data with labels (inner join on the row index).
    # NOTE(review): `data` gets a 0-based RangeIndex while the labels use the
    # values of their first column as index; if that column is 1-based the
    # labels are shifted by one row relative to the data - confirm against
    # the raw files.
    print("   Merging data...")
    n_data_rows, n_label_rows = len(data), len(labels)
    df = pd.merge(data, labels, left_index=True, right_index=True)
    # Release the two inputs; only the merged frame is needed from here on.
    del data, labels
    if len(df) != n_data_rows or len(df) != n_label_rows:
        # An inner join silently drops rows whose index has no partner.
        print("   WARNING: index mismatch - data rows:", n_data_rows,
              "label rows:", n_label_rows, "merged rows:", len(df))

    # Drop any duplicate rows in the data and labels
    print("   Dropping duplicates...")
    df = df.drop_duplicates()

    # Split the records by label: 0 (benign) and 1 (attack)
    print("   Filtering...")
    df_0 = df[df[label_column_name] == 0]
    df_1 = df[df[label_column_name] == 1]

    # A class can still end up empty if none of its rows survived the merge.
    if n_samples > 0 and (df_0.empty or df_1.empty):
        missing = "benign (label 0)" if df_0.empty else "attack (label 1)"
        raise ValueError(
            f"Cannot sample {n_samples} rows per class: "
            f"no {missing} rows left after merging {folder_path!r}"
        )

    # Randomly sample n_samples records from each group.  Sampling is done
    # with replacement so that a class with fewer than n_samples rows still
    # yields n_samples rows; as a consequence the output may contain
    # repeated rows even though duplicates were dropped above.
    print("   Sampling...")
    df_0_sampled = df_0.sample(n=n_samples, replace=True, random_state=random_state)
    df_1_sampled = df_1.sample(n=n_samples, replace=True, random_state=random_state)

    # Concatenate the two groups back together
    print("   Concatinating...")
    df_sampled = pd.concat([df_0_sampled, df_1_sampled]).reset_index(drop=True)

    # Shuffle the rows of the resulting dataframe
    print("   Shuffling data...")
    df_sampled = df_sampled.sample(frac=1, random_state=random_state)

    # Save result as csv
    folder_name = folder_name.replace(' ', '_').lower()
    output_filename = f'{folder_name}_sampled.csv'
    print("   Saving to csv...")
    df_sampled.to_csv(output_filename, index=False)
    print("   Done.")
    print()

    return

In [None]:
# Mirai Botnet is intentionally not listed here: its label file has no index
# column, so it is processed by sample_mirai() instead.
# Only the uncommented entries are sampled; uncomment a name to (re)generate
# the sampled CSV for that dataset.
folders = [
    # 'Active Wiretap',
    # 'ARP MitM',
    # 'Fuzzing',
    # 'OS Scan',
    'SSDP Flood',
    # 'SSL Renegotiation',
    # 'SYN DoS',
    # 'Video Injection',
]
for folder in folders:
    sample_data(folder)

SSDP Flood ==>
   benign : 2637662
   attack : 1439604
   Loading data...
   Merging data...


In [15]:
# The Mirai Botnet label file has neither an index column nor a header row,
# so it needs its own loader.
def sample_mirai(folder_name, n_samples=7000, random_state=42):
    """Write a balanced benign/attack sample of the Mirai dataset to CSV.

    The folder ``./raw_data/<folder_name>`` is searched for a file ending in
    ``_dataset.csv`` and a file ending in ``_labels.csv``.  The labels file is
    read without a header row and without an index column, and its single
    column is named ``x``.  The data file is read without a header row and
    with its first column as the row index.  Data and labels are
    inner-joined on their row index, exact duplicate rows are dropped, and
    ``n_samples`` rows are drawn *with replacement* from each class (label 0
    is reported as benign, label 1 as attack).  The shuffled result is
    written, without the index, to ``<name>_sampled.csv`` in the current
    working directory, where ``<name>`` is ``folder_name`` lower-cased with
    spaces turned into underscores.

    Parameters
    ----------
    folder_name : str
        Name of the dataset folder inside ``./raw_data``.
    n_samples : int, default 7000
        Number of rows drawn from each class.
    random_state : int, default 42
        Seed used for both per-class draws and for the final shuffle.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If the folder, the dataset file or the labels file is missing.
    ValueError
        If ``n_samples`` is positive and one of the two classes has no rows.
    """
    print(folder_name, "==>")
    # Set the path to the folder
    folder_path = os.path.join('./raw_data', folder_name)

    # Search for data and label files in the folder
    data_file_path = ''
    label_file_path = ''
    for file_name in os.listdir(folder_path):
        if file_name.endswith('_dataset.csv'):
            data_file_path = os.path.join(folder_path, file_name)
        elif file_name.endswith('_labels.csv'):
            label_file_path = os.path.join(folder_path, file_name)

    # Fail with a precise message instead of letting read_csv('') complain
    # about an empty path.
    if not data_file_path:
        raise FileNotFoundError(f"No '*_dataset.csv' file found in {folder_path!r}")
    if not label_file_path:
        raise FileNotFoundError(f"No '*_labels.csv' file found in {folder_path!r}")

    # Load the labels: no header row and no index column, so the frame gets a
    # default 0-based index and its only column is named explicitly.
    labels = pd.read_csv(label_file_path, header=None)
    label_column_name = 'x'
    labels.columns = [label_column_name]

    # Count the records with 0 and 1 in the label column.  Summing a boolean
    # mask yields 0 for an absent class, where value_counts()[k] would raise
    # a bare KeyError.
    n_0 = int((labels[label_column_name] == 0).sum())
    n_1 = int((labels[label_column_name] == 1).sum())

    print("   benign :", n_0)
    print("   attack :", n_1)

    # Fail fast, before the (potentially very large) data file is loaded.
    if n_samples > 0 and (n_0 == 0 or n_1 == 0):
        missing = "benign (label 0)" if n_0 == 0 else "attack (label 1)"
        raise ValueError(
            f"Cannot sample {n_samples} rows per class: "
            f"no {missing} rows in {label_file_path!r}"
        )

    # Load data
    # NOTE(review): index_col=0 consumes the first column of the data file as
    # the row index - confirm that this column really is a 0-based row
    # number and not a feature, since the merge below relies on it matching
    # the labels' default index.
    print("   Loading data...")
    data = pd.read_csv(data_file_path, header=None, index_col=0)

    # Merge data with labels (inner join on the row index)
    print("   Merging data...")
    n_data_rows, n_label_rows = len(data), len(labels)
    df = pd.merge(data, labels, left_index=True, right_index=True)
    # Release the two inputs; only the merged frame is needed from here on.
    del data, labels
    if len(df) != n_data_rows or len(df) != n_label_rows:
        # An inner join silently drops rows whose index has no partner.
        print("   WARNING: index mismatch - data rows:", n_data_rows,
              "label rows:", n_label_rows, "merged rows:", len(df))

    # Drop any duplicate rows in the data and labels
    print("   Dropping duplicates...")
    df = df.drop_duplicates()

    # Split the records by label: 0 (benign) and 1 (attack)
    print("   Filtering...")
    df_0 = df[df[label_column_name] == 0]
    df_1 = df[df[label_column_name] == 1]

    # A class can still end up empty if none of its rows survived the merge.
    if n_samples > 0 and (df_0.empty or df_1.empty):
        missing = "benign (label 0)" if df_0.empty else "attack (label 1)"
        raise ValueError(
            f"Cannot sample {n_samples} rows per class: "
            f"no {missing} rows left after merging {folder_path!r}"
        )

    # Randomly sample n_samples records from each group.  Sampling is done
    # with replacement so that a class with fewer than n_samples rows still
    # yields n_samples rows; as a consequence the output may contain
    # repeated rows even though duplicates were dropped above.
    print("   Sampling...")
    df_0_sampled = df_0.sample(n=n_samples, replace=True, random_state=random_state)
    df_1_sampled = df_1.sample(n=n_samples, replace=True, random_state=random_state)

    # Concatenate the two groups back together
    print("   Concatinating...")
    df_sampled = pd.concat([df_0_sampled, df_1_sampled]).reset_index(drop=True)

    # Shuffle the rows of the resulting dataframe
    print("   Shuffling data...")
    df_sampled = df_sampled.sample(frac=1, random_state=random_state)

    # Save result as csv
    folder_name = folder_name.replace(' ', '_').lower()
    output_filename = f'{folder_name}_sampled.csv'
    print("   Saving to csv...")
    df_sampled.to_csv(output_filename, index=False)
    print("   Done.")
    print()

    return

In [16]:
# Mirai Botnet goes through its dedicated loader rather than sample_data().
sample_mirai(folder_name='Mirai Botnet')

Mirai Botnet ==>
   benign : 121621
   attack : 642516
   Loading data...
   Merging data...
   Dropping duplicates...
   Filtering...
   Sampling...
   Concatinating...
   Shuffling data...
   Saving to csv...
   Done.

