threatcode · pull · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/conf/default/distributed.conf.default b/conf/default/distributed.conf.default
@@ -32,16 +32,8 @@ fstab_socket = /tmp/cape-fstab
 
 # Google Cloud Platform
 [GCP]
+# Note: Further GCP configuration (project, zones, autodiscovery, etc.)
+# MUST be done in conf/gcp.conf. This section only controls the enabled state.
 enabled = no
-# Comma separated list of zones
-zones = ""
-project_id = ""
-# rest usage instead of GCP python client
-# https://cloud.google.com/docs/authentication/rest
-# gcloud auth print-access-token
-token = ""
-# Seconds between try to discoverd new instances
-autodiscovery = 600
-# Instances should start with following name pattern
-instance_name = cape-server
+
 
diff --git a/conf/default/gcp.conf.default b/conf/default/gcp.conf.default
@@ -1,66 +1,48 @@
 [gcp]
-# Specify the Google Cloud Zone (for example, europe-north2-a). This is case-sensitive
-zone = <zone_name>
-
+# Global Environment Settings
 # Specify the project identifier
-project = <project_id>
-
-# pubsub
-subscription_id =
-
-# Running in a GCP environment. If true, the Compute Engine credentials will be used
-running_in_gcp = true
-
-# Specify the path to the service account key file. If not specified, the default service account will be used
-service_account_path =
-
-# Specify a comma-separated list of available machines to be used.
-# Each machine will be represented by the instance-name (for example, cape-server-windows).
-# For each specified instance-name you have to define a dedicated section containing the details
-# on the respective machine. (E.g. cape-server-windows,cape-server-linux)
-# For better performance, it is recommended to leave this empty and set autoscale = yes.
-machines =
-
-[cape-server-linux]
-# Specify the label name.
-# Label would be the instance-name of the current machine as specified in your GCP account.
-label = cape-server-linux
-
-# Specify the operating system platform used by current machine
-# [windows/darwin/linux].
-platform = linux
-
-# Set the machine architecture
-# x64 or x86
-arch = x64
-
-# Specify the IP address of the current virtual machine. Make sure that the
-# IP address is valid and that the host machine is able to reach it. If not,
-# the analysis will fail.
-# ip =
-
-# (Optional) Specify the name of the network interface that should be used
-# when dumping network traffic from this machine with tcpdump. If specified,
-# overrides the default interface specified above.
-# Example (eth0 is the interface name):
-# interface =
-
-# (Optional) Specify the IP of the Result Server, as your virtual machine sees it.
-# The Result Server will always bind to the address and port specified in cuckoo.conf,
-# however you could set up your virtual network to use NAT/PAT, so you can specify here
-# the IP address for the Result Server as your machine sees it. If you don't specify an
-# address here, the machine will use the default value from cuckoo.conf.
-# NOTE: if you set this option you have to set result server IP to 0.0.0.0 in cuckoo.conf.
-# Example:
-# resultserver_ip =
-
-# (Optional) Specify the port for the Result Server, as your virtual machine sees it.
-# The Result Server will always bind to the address and port specified in cuckoo.conf,
-# however you could set up your virtual network to use NAT/PAT, so you can specify here
-# the port for the Result Server as your machine sees it. If you don't specify a port
-# here, the machine will use the default value from cuckoo.conf.
-# resultserver_port =
-
-# (Optional) Set your own tags. These are comma separated and help to identify
-# specific VMs. You can run samples on VMs with tag you require.
-# tags =
+project = 
+# Specify the Google Cloud Zone (for example, europe-north2-a). This is case-sensitive
+zone = 
+# Authentication method: vm (instance credentials), json (key file), or token (manual token)
+auth_by = vm
+# Path to the service account key file (required if auth_by = json)
+# service_account_path = data/gcp-credentials.json
+# Bearer token for REST API usage (optional)
+# token = 
+
+[samples_pubsub]
+# GCP Pub/Sub Sample Processing Service
+enabled = no
+# Pub/Sub subscription name
+subscription_id = 
+# GCS bucket for sample downloads
+samples_bucket = sandbox-samples-unique
+# Concurrent message limit
+max_messages = 5
+# Lease duration in seconds
+lease_duration = 1800
+
+[distributed]
+# Worker Node Autodiscovery
+enabled = no
+# Seconds between attempts to discover new instances
+autodiscovery_interval = 600
+# Instance names must start with the following prefix
+instance_name_pattern = cape-server
+# Comma-separated list of zones to scan (defaults to the global zone if empty)
+zones = 
+
+[reporting]
+# Analysis Results Upload to GCS
+enabled = no
+# The name of your GCS bucket where reports will be uploaded
+results_bucket = 
+# Upload mode: zip (single archive per task) or file (individual files)
+mode = zip
+# Delete local report after successful upload to GCS
+delete_after_upload = no
+# Comma-separated list of DIRECTORY names to exclude (e.g., logs, shots)
+exclude_dirs = logs, shots
+# Comma-separated list of exact FILENAMES to exclude
+exclude_files = 
diff --git a/conf/default/reporting.conf.default b/conf/default/reporting.conf.default
@@ -226,27 +226,7 @@ enabled = no
 
 # Google Cloud Storage
 [gcs]
+# Note: Further GCS configuration (bucket name, credentials, mode, etc.) MUST be done
+# in conf/gcp.conf ([gcp] and [reporting] sections). This section only controls the enabled state.
 enabled = no
-# The name of your Google Cloud Storage bucket where files will be uploaded.
-bucket_name = your-gcs-bucket-name
 
-# Comma-separated list of DIRECTORY names to exclude from the upload.
-# Good examples are 'shots' (contains all screenshots) or 'memory' (for full memory dumps).
-exclude_dirs = logs, shots
-
-# Comma-separated list of exact FILENAMES to exclude from the upload.
-# Good examples are large report formats you don't need in GCS.
-exclude_files =
-
-# Mode: zip - will submit all files and folders as unique zip archive. Useful to not spam pubsub notification on file creation.
-# Mode: file - will submit one by one.
-mode = zip
-
-# Can be vm or json
-auth_by = vm
-# only if auth_by = json. The absolute path to your Google Cloud service account JSON key file.
-# This file is required for authentication.
-credentials_path = data/gcp-credentials.json
-
-# Delete local report after successful upload to GCS
-delete_after_upload = no
diff --git a/lib/cuckoo/common/demux.py b/lib/cuckoo/common/demux.py
@@ -168,9 +168,35 @@ def is_valid_package(package: str) -> bool:
     return any(ptype in package for ptype in VALID_PACKAGES)
 
 
+# Lowercase filename suffixes (bytes) of junk files to skip; matched with endswith()
+JUNK_EXTENSIONS = {
+    b".yar",
+    b".yara",
+    b".md",
+    b".txt",
+    b".yml",
+    b".yaml",
+    b".gitignore",
+    b".gitattributes",
+    b".gitmodules",
+}
+
+JUNK_NAMES = {b"license", b"copying", b"makefile", b"authors", b"readme"}
+
+
 # ToDo fix return type
 def _sf_children(child: sfFile):  # -> bytes:
     path_to_extract = ""
+    filename_lower = child.filename.lower()
+
+    # Skip junk files. NOTE(review): JUNK_NAMES is a substring match, so e.g. b"license_gen.exe" is skipped too.
+    if any(filename_lower.endswith(ext) for ext in JUNK_EXTENSIONS):
+        return (b"", child.platform, child.magic, child.filesize)
+    if any(name in filename_lower for name in JUNK_NAMES):
+        return (b"", child.platform, child.magic, child.filesize)
+    if b".github/" in filename_lower or b".git/" in filename_lower:
+        return (b"", child.platform, child.magic, child.filesize)
+
     _, ext = os.path.splitext(child.filename)
     ext = ext.lower()
     if (
@@ -191,7 +217,7 @@ def _sf_children(child: sfFile):  # -> bytes:
                 _ = path_write_file(path_to_extract, child.contents)
         except Exception as e:
             log.exception(e)
-    return (path_to_extract.encode(), child.platform, child.magic, child.filesize)
+    return (path_to_extract.encode(), child.platform, child.magic or "", child.filesize)
 
 
 # ToDo fix typing need to add str as error msg
@@ -211,7 +237,7 @@ def demux_sflock(filename: bytes, options: str, check_shellcode: bool = True):
 
         if unpacked.package in whitelist_extensions:
             file = File(filename)
-            magic_type = file.get_type()
+            magic_type = file.get_type() or ""
             platform = file.get_platform()
             file_size = file.get_size()
             return [[filename, platform, magic_type, file_size]], ""
@@ -246,6 +272,15 @@ def demux_sample(filename: bytes, package: str, options: str, use_sflock: bool =
     If file is a ZIP, extract its included files and return their file paths
     If file is an email, extracts its attachments and return their file paths (later we'll also extract URLs)
     """
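+    # Skip junk files. NOTE(review): JUNK_NAMES is a substring match on the whole filename value, not just the basename - confirm intent.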
+    filename_bytes = filename if isinstance(filename, bytes) else filename.encode()
+    filename_lower_bytes = filename_bytes.lower()
+    if any(filename_lower_bytes.endswith(ext) for ext in JUNK_EXTENSIONS) or any(
+        name in filename_lower_bytes for name in JUNK_NAMES
+    ):
+        filename_str = filename.decode(errors="ignore") if isinstance(filename, bytes) else filename
+        return [], [{"junk_filter": f"File {filename_str} skipped by junk filter"}]
+
     # sflock requires filename to be bytes object for Py3
     # TODO: Remove after checking all uses of demux_sample use bytes ~TheMythologist
     if isinstance(filename, str) and use_sflock:
@@ -281,7 +316,7 @@ def demux_sample(filename: bytes, package: str, options: str, use_sflock: bool =
         filename = tmp_path
 
     # don't try to extract from office docs
-    magic = File(filename).get_type()
+    magic = File(filename).get_type() or ""
     # if file is an Office doc and password is supplied, try to decrypt the doc
     if "Microsoft" in magic:
         pass