In [4]:
import os
import json
import pandas as pd

In [5]:
# Function to parse a single JSON file
def parse_json_file(file_path):
    """Flatten one JSON scan report into one record per detected behavior.

    The report is expected to be a JSON object whose "Files" key maps an
    identifier to a per-file object. Each per-file object may carry a
    "Behaviors" list; every behavior becomes one output record that repeats
    the file-level fields next to the behavior-level fields.

    Files without any behaviors contribute no records.

    Args:
        file_path: Path of the JSON report to read.

    Returns:
        list[dict]: One dict per behavior with the keys FilePath, SHA256,
        Size, RiskScore, Syscalls, Pledges, Meta, BehaviorDescription,
        MatchStrings, BehaviorRiskScore, RiskLevel, RuleURL, ID, RuleName
        and ReferenceURL. Missing fields fall back to "", 0, [] or {}.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: reports contain non-ASCII text, and the
    # platform default encoding is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    parsed_data = []

    # `or {}` / `or []` also cover keys that are present but null, which
    # `.get(key, default)` alone does not. The mapping key itself is unused:
    # the path is taken from the entry's own "Path" field.
    for file_info in (data.get("Files") or {}).values():
        if not file_info:
            # Null or empty entry: nothing to report for this file.
            continue

        # File-level details repeated on every behavior record of this file
        common = {
            "FilePath": file_info.get("Path", ""),
            "SHA256": file_info.get("SHA256", ""),
            "Size": file_info.get("Size", 0),
            "RiskScore": file_info.get("RiskScore", 0),
            "Syscalls": file_info.get("Syscalls", []),
            "Pledges": file_info.get("Pledge", []),
            "Meta": file_info.get("Meta", {}),
        }

        for behavior in file_info.get("Behaviors") or []:
            parsed_data.append({
                **common,
                "BehaviorDescription": behavior.get("Description", ""),
                "MatchStrings": behavior.get("MatchStrings", []),
                "BehaviorRiskScore": behavior.get("RiskScore", 0),
                "RiskLevel": behavior.get("RiskLevel", ""),
                "RuleURL": behavior.get("RuleURL", ""),
                "ID": behavior.get("ID", ""),
                "RuleName": behavior.get("RuleName", ""),
                "ReferenceURL": behavior.get("ReferenceURL", ""),
            })

    return parsed_data

In [8]:
# Base folder: two directory levels above the current working directory
base_folder = os.path.dirname(os.path.dirname(os.getcwd()))

# Folder that holds the JSON result files for this dataset
folder_path = os.path.join(base_folder, 'datasets/dataset6_over_time/go/0_controller-gen/malcontent-results')

# Accumulates the behavior records of every result file
all_parsed_data = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row
# order of the resulting DataFrame reproducible across machines and runs.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith(".json"):
        file_path = os.path.join(folder_path, file_name)
        parsed_data = parse_json_file(file_path)
        all_parsed_data.extend(parsed_data)

# Build the DataFrame once from the full list of records
df = pd.DataFrame(all_parsed_data)

In [10]:
# Bucket the behavior rows by the file they were reported for
grouped_df = df.groupby('FilePath')

# Print a short summary per file, skipping paths that contain ".spdx.json"
for file_path, group in grouped_df:
    if ".spdx.json" in file_path:
        continue

    # How often each risk level occurs among this file's behaviors
    level_counts = group['RiskLevel'].value_counts().to_dict()

    # Mean of the per-behavior risk scores
    mean_score = group['BehaviorRiskScore'].mean()

    # One row per behavior, so the group length is the behavior count
    report_lines = [
        f"\nStats for {file_path}:",
        f"  - Number of behaviors: {len(group)}",
        f"  - Risk Levels: {level_counts}",
        f"  - Average Risk Score: {mean_score:.2f}",
    ]
    print("\n".join(report_lines))


Stats for /work/apk-files/controller-gen-0.13.0-r0.apk ∴ /usr/bin/controller-gen:
  - Number of behaviors: 66
  - Risk Levels: {'LOW': 47, 'MEDIUM': 19}
  - Average Risk Score: 1.29

Stats for /work/apk-files/controller-gen-0.15.0-r0.apk ∴ /usr/bin/controller-gen:
  - Number of behaviors: 65
  - Risk Levels: {'LOW': 45, 'MEDIUM': 20}
  - Average Risk Score: 1.31

Stats for /work/apk-files/controller-gen-0.16.0-r0.apk ∴ /usr/bin/controller-gen:
  - Number of behaviors: 65
  - Risk Levels: {'LOW': 46, 'MEDIUM': 19}
  - Average Risk Score: 1.29

Stats for /work/apk-files/controller-gen-0.16.1-r0.apk ∴ /usr/bin/controller-gen:
  - Number of behaviors: 65
  - Risk Levels: {'LOW': 46, 'MEDIUM': 19}
  - Average Risk Score: 1.29
