In [26]:
import pandas as pd
import pickle
import os
from glob import glob
from concurrent.futures import ThreadPoolExecutor

In [28]:
def process_csv_file(input_csv_file, output_dir, batch_size=30):
  """Split one CSV file into fixed-size row batches and pickle each batch.

  The label is the input file's base name without its final extension, and
  the batches are written to ``<output_dir>/<label>/`` (created if missing).
  The CSV is read with pandas, indexed by its ``timestamp`` column, and cut
  into consecutive slices of at most ``batch_size`` rows. Each slice is
  pickled as ``batch_<i>_<start>-<stop>.pkl`` with ``start = i * batch_size``
  and ``stop = (i + 1) * batch_size``. ``stop`` is the nominal bound, so the
  name of the last file may overstate the number of rows it actually holds.

  Args:
    input_csv_file: Path of the CSV file to split; it must contain a
      ``timestamp`` column.
    output_dir: Directory under which the per-label subdirectory is created.
    batch_size: Maximum number of rows per pickle file; must be >= 1.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
    KeyError: If the CSV has no ``timestamp`` column.
  """
  # A negative size would make range() empty and silently write nothing.
  if batch_size < 1:
    raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

  # basename/splitext work with any OS path separator and only strip the final
  # extension, so 'a.b.csv' and 'a.c.csv' no longer collapse to the same label.
  labelName = os.path.splitext(os.path.basename(input_csv_file))[0]
  subdirectory = os.path.join(output_dir, labelName)
  # exist_ok avoids the check-then-create race when several threads run this.
  os.makedirs(subdirectory, exist_ok=True)
  print(f'[INFO] Processing {labelName} ...')

  # Read the CSV file into a pandas DataFrame indexed by timestamp
  df = pd.read_csv(input_csv_file).set_index('timestamp')

  # Walk the frame in steps of 'batch_size' and write one pickle per slice
  for i, start in enumerate(range(0, df.shape[0], batch_size)):
    stop = start + batch_size
    batch_df = df.iloc[start:stop]
    output_file = os.path.join(subdirectory, f'batch_{i}_{start}-{stop}.pkl')
    with open(output_file, 'wb') as f:
      pickle.dump(batch_df, f)

  print(f'[INFO] Done processing {labelName} ...')

In [29]:
def process_csv_files(csv_files, output_dir, batch_size, num_workers=4):
  """Run ``process_csv_file`` over many CSV files using a thread pool.

  Args:
    csv_files: Iterable of CSV file paths. It is materialised into a list
      once, so generators and other one-shot iterables are accepted.
    output_dir: Root output directory; created if it does not exist.
    batch_size: Rows per batch, forwarded to ``process_csv_file``.
    num_workers: Maximum number of worker threads in the pool.

  Raises:
    Exception: Whatever a worker raised. Results are collected in submission
      order, so the exception of the earliest-submitted failing file is the
      one re-raised.
  """
  # Materialise first: len() below would fail on a generator.
  csv_files = list(csv_files)
  print(f'[INFO] Processing {len(csv_files)} CSV files ...')

  # Ensure output directory exists (exist_ok avoids a check-then-create race)
  os.makedirs(output_dir, exist_ok=True)

  # Use ThreadPoolExecutor for concurrent processing with specified number of workers
  with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = [
      executor.submit(process_csv_file, csv_file, output_dir, batch_size)
      for csv_file in csv_files
    ]

    # Wait for all tasks to complete; result() re-raises any worker exception
    for future in futures:
      future.result()

  print('[SUCCESS] Done processing.')

In [30]:
# Both paths are relative to the notebook's working directory.
output_dir = 'bin_dataset'
# glob() returns matches in arbitrary, filesystem-dependent order; sort them
# so the submission order is the same on every run and machine.
csv_files = sorted(glob('csv_dataset/*.csv'))

process_csv_files(csv_files, output_dir, batch_size=30, num_workers=4)

[INFO] Processing 21 CSV files ...
[INFO] Done processing smtp22 ...
[INFO] Done processing hydra_ftp2 ...
[INFO] Done processing blackEnergy ...
[INFO] Done processing vsftpd ...
[INFO] Done processing smtp ...
[INFO] Done processing vsftpd2 ...
[INFO] Done processing netbios_ssn2 ...
[INFO] Done processing distcc_exec_backdoor2 ...
[INFO] Done processing 0day ...
[INFO] Done processing hydra_ftp ...
[INFO] Done processing hydra_ssh2 ...
[INFO] Done processing netbios_ssn ...
[INFO] Done processing zeus ...
[INFO] Done processing distcc_exec_backdoor ...
[INFO] Done processing unreallrcd2 ...
[INFO] Done processing ruby_drb2 ...
[INFO] Done processing ruby_drb ...
[INFO] Done processing hydra_ssh ...
[INFO] Done processing replayAttacks ...
[INFO] Done processing unreallrcd ...
[INFO] Done processing mirai ...
[SUCCESS] Done processing.
