In [1]:
import os
from glob import glob
from concurrent.futures import ThreadPoolExecutor
from scv import binvis

In [2]:
# Global variables
config = {
  'block': None, # Mark a block of data with a specified color. Format: hexstartaddr:hexendaddr[:hexcolor].
  'color': 'hilbert', # Color map to use. Options: class, hilbert, entropy, gradient.
  'map': 'hilbert', # Pixel layout map. Can be any supported curve.
  'namesuffix': '', # Suffix for generated file names. Ignored if destination is specified.
  'progress': False, # Show progress bar.
  'size': 256, # Image size in pixels.
  'type': "square", # Image aspect ratio - square (1x1) or unrolled (1x4). Options: square, unrolled.
  'quiet': False, # Suppress all output.
}

In [3]:
def process_pkl(pkl_input_file, img_output_path, img_output_file):
  print(f'[INFO] Processing {pkl_input_file}')
  
  if not os.path.exists(img_output_path):
    os.makedirs(img_output_path)
  
  img_output_file = os.path.join(img_output_path, img_output_file)
  
  # Read binary file
  with open(pkl_input_file, 'rb') as f:
    data = f.read()
  
  # if not os.path.exists(img_output_file):
  #   os.makedirs(img_output_file)

  # Block specification
  block = None
  if config['block']:
    parts = config['block'].split(':')
    if len(parts) not in [2, 3]:
      raise ValueError("Invalid block specification.")
    s, e = int(parts[0], 16), int(parts[1], 16)
    if len(parts) == 3:
      c = binvis.draw.parseColor(parts[2])
    else:
      c = [255, 0, 0]
    block = (s, e, c)
    
  # Color map
  if config['color'] not in ['class', 'hilbert', 'entropy', 'gradient']:
    raise ValueError("Invalid color map.")
  color = config['color']

  csource_map = {
    'class': binvis.ColorClass(data, block),
    'hilbert': binvis.ColorHilbert(data, block),
    'gradient': binvis.ColorGradient(data, block),
    'entropy': binvis.ColorEntropy(data, block),
  }
  csource = csource_map[color]

  # Progress bar
  if config['progress']:
    print("Generating image...")
    print(f"Destination: {img_output_file}")

  if config['quiet'] or config['progress']:
    prog = binvis.progress.Dummy()
  else:
    prog = binvis.progress.Progress(None)

  # Type specification
  if config['type'] == 'unrolled':
    binvis.drawmap_unrolled(config['map'], config['size'], csource, img_output_file, prog)
  elif config['type'] == 'square':
    binvis.drawmap_square(config['map'], config['size'], csource, img_output_file, prog)
  prog.clear()
  
  print(f'[SUCCESS] Image saved to {img_output_file} \n')

In [4]:
def process_folder(input_folder, output_folder):
  print(f'[INFO] Processing folder: {input_folder}')
  # Get all .pkl files in the folder
  if not os.path.exists(output_folder):
    os.makedirs(output_folder)
  label_name = input_folder.split('/')[-2]
  
  if len(glob(os.path.join(output_folder, label_name, '*.png'))) >= 200:
    print(f'[INFO] {label_name} images already exists in {output_folder} \n')
    return
  else:
    pkl_files = glob(os.path.join(input_folder, '*.pkl'))
    for fkl_file in pkl_files:
      img_name = fkl_file.split('/')[-1].split('.')[0]
      output_img = os.path.join(output_folder, label_name, f'{img_name}.png')
      # Skip if image already exists
      if os.path.exists(output_img) or len(glob(os.path.join(output_folder, label_name, '*.png'))) >= 200:
        continue
      else:
        process_pkl(fkl_file, os.path.join(output_folder, label_name), f'{img_name}.png')
  
    print(f'[SUCCESS] {label_name} images saved to {output_folder} \n')

In [5]:
def multi_thread_process(folders_paths, output_folder, num_workers=4):
  print(f'[INFO] Processing {len(folders_paths)} folders with {num_workers} workers')
  # Use ThreadPoolExecutor for concurrent processing with specified number of workers
  with ThreadPoolExecutor(max_workers=num_workers) as executor:
    futures = []
    for folder_path in folders_paths:
      future = executor.submit(process_folder, folder_path, output_folder)
      futures.append(future)
    
    # Wait for all tasks to complete
    for future in futures:
      future.result()
      
  print(f'[SUCCESS] All folders processed and images saved to {output_folder}')

In [15]:
bin_dataset_folder = "./bin_dataset"

# List all subfolders in the bin_dataset folder
subfolders = glob(os.path.join(bin_dataset_folder, '*/'))

# Process folders concurrently with 4 workers
multi_thread_process(subfolders, './img_dataset', num_workers=2)

|>                                        | 0:00:13 

[SUCCESS] Image saved to ./img_dataset/vsftpd2/batch_1133_33990-34020.png 

[INFO] Processing ./bin_dataset/vsftpd2/batch_735_22050-22080.pkl


|>                                        | 0:00:13 

[SUCCESS] Image saved to ./img_dataset/vsftpd2/batch_735_22050-22080.png 

[INFO] Processing ./bin_dataset/vsftpd2/batch_350_10500-10530.pkl


|>                                        | 0:00:13 

[SUCCESS] Image saved to ./img_dataset/vsftpd2/batch_350_10500-10530.png 

[INFO] Processing ./bin_dataset/vsftpd2/batch_228_6840-6870.pkl


|>                                        | 0:00:12 

[SUCCESS] Image saved to ./img_dataset/vsftpd2/batch_228_6840-6870.png 

[INFO] Processing ./bin_dataset/vsftpd2/batch_150_4500-4530.pkl


                                                    

[SUCCESS] Image saved to ./img_dataset/vsftpd2/batch_150_4500-4530.png 

[SUCCESS] vsftpd2 images saved to ./img_dataset 

[SUCCESS] All folders processed and images saved to ./img_dataset
