In [1]:
import os
import json
import pandas as pd
from packaging import version

In [2]:
# Function to parse a single JSON file
def parse_json_file(file_path, project):
    """Flatten one JSON scan report into one record per detected behavior.

    The report is expected to hold a top-level ``"Files"`` mapping whose
    values describe a scanned file (``Path``, ``SHA256``, ``Size``,
    ``RiskScore``, ``Syscalls``, ``Pledge``, ``Meta``) and carry a
    ``"Behaviors"`` list.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON report to read.
    project : str
        Label copied unchanged into the ``project`` field of every record.

    Returns
    -------
    list of dict
        One dict per behavior entry. File-level fields are repeated on every
        record of that file. Files without behaviors contribute no records.

    Raises
    ------
    IndexError
        If a file that has behaviors carries a ``Path`` with fewer than four
        '/'-separated segments, or an apk name with fewer than two
        '-'-separated parts.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    parsed_data = []

    # `or {}` / `or []` also cover keys that are present but set to null.
    for file_info in (data.get("Files") or {}).values():
        behaviors = file_info.get("Behaviors") or []
        if not behaviors:
            continue

        path = file_info.get("Path", "")
        # The 4th '/'-separated segment holds the apk file name, optionally
        # followed by a space and further text; keep only the file name.
        apk = path.split('/')[3].split(' ')[0]
        # Second-to-last '-'-separated part, e.g. "name-1.2.3-r0.apk" -> "1.2.3".
        apk_version = apk.split('-')[-2]

        # Fields shared by every behavior found in this file.
        file_fields = {
            "FilePath": path,
            "project": project,
            "apk": apk,
            "version": apk_version,
            "SHA256": file_info.get("SHA256", ""),
            "Size": file_info.get("Size", 0),
            "RiskScore": file_info.get("RiskScore", 0),
            "Syscalls": file_info.get("Syscalls", []),
            "Pledges": file_info.get("Pledge", []),
            "Meta": file_info.get("Meta", {}),
        }

        for behavior in behaviors:
            parsed_data.append({
                **file_fields,
                "BehaviorDescription": behavior.get("Description", ""),
                "MatchStrings": behavior.get("MatchStrings", []),
                "BehaviorRiskScore": behavior.get("RiskScore", 0),
                "RiskLevel": behavior.get("RiskLevel", ""),
                "RuleURL": behavior.get("RuleURL", ""),
                "ID": behavior.get("ID", ""),
                "RuleName": behavior.get("RuleName", ""),
                "ReferenceURL": behavior.get("ReferenceURL", ""),
            })

    return parsed_data

In [3]:
# Lookup table of the package builds treated as malicious. Every listed apk
# gets malicious_version=True so a join on "apk" can flag matching rows.
malicious_versions = pd.DataFrame(
    {
        "apk": [
            "controller-gen-0.15.0-r0.apk",
            "gobump-0.7.6-r0.apk",
            "logstash-exporter-1.6.3-r0.apk",
            "prometheus-beat-exporter-0.3.0-r0.apk",
            "cosign-2.2.4-r0.apk",
            "step-0.27.0-r0.apk",
            "swagger-0.30.4-r0.apk",
            "grafana-agent-operator-0.41.0-r0.apk",
            "terragrunt-0.66.7-r0.apk",
            "litestream-0.3.11-r0.apk",
        ]
    }
).assign(malicious_version=True)

In [4]:
# Base folder: two directory levels above the current working directory.
base_folder = os.path.dirname(os.path.dirname(os.getcwd()))

projects = ["0_controller-gen", "1_gobump", "2_logstash-exporter", "3_prometheus-beat-exporter", "4_cosign", "5_step", "6_go-swagger", "7_grafana-agent-operator", "8_terragrunt", "9_litestream"]

# Collect one record per detected behavior across all projects
all_parsed_data = []
for project in projects:
    # Each project keeps its JSON scan reports in its 'malcontent-scan' folder
    folder_path = os.path.join(base_folder, f'datasets/dataset6_over_time/go/{project}/malcontent-scan')
    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the resulting DataFrame is the same on every run.
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            parsed_data = parse_json_file(file_path, project)
            all_parsed_data.extend(parsed_data)

# Convert the parsed data into a pandas DataFrame
df = pd.DataFrame(all_parsed_data)

# Flag the malicious versions: after this left join, 'malicious_version' is
# True for listed apks and missing (NaN) for every other row.
df = pd.merge(df, malicious_versions,
              how='left', on="apk")

In [5]:
# Order rows by project name and, within a project, by version. The version
# strings are parsed first so the comparison is version-aware, not lexical.
df_sorted = df.sort_values(
    by=['project', 'version'],
    key=lambda column: column.map(version.parse) if column.name != 'project' else column,
)

In [6]:
def get_new_alerts(prior_apk, current_apk, match_cols, data=None):
    """Report alerts present in ``current_apk`` but absent from ``prior_apk``.

    Two alerts count as the same when they agree on every column in
    ``match_cols``.

    Parameters
    ----------
    prior_apk : str
        Value of the ``apk`` column identifying the earlier package.
    current_apk : str
        Value of the ``apk`` column identifying the later package.
    match_cols : list of str
        Columns that together identify an alert. The list is not modified.
    data : pandas.DataFrame, optional
        Frame holding the alerts; needs an ``apk`` column, every column in
        ``match_cols`` and a ``MatchStrings`` column. Defaults to the
        notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per new alert with the ``match_cols`` columns, ``MatchStrings``
        and the bookkeeping columns ``prior_apk``, ``prior_alert_count``,
        ``current_apk``, ``current_alert_count`` and ``new_alerts``.
        When there is no new alert, a single row is returned whose alert
        columns are ``None`` and whose ``new_alerts`` is 0.
    """
    if data is None:
        # Fall back to the notebook-level frame for existing callers.
        data = df

    prior = data[data['apk'] == prior_apk].reset_index(drop=True)
    current = data[data['apk'] == current_apk].reset_index(drop=True)

    # Anti-join: keep rows of `current` that have no partner in `prior`.
    # De-duplicating the prior keys does not change which rows are
    # "left_only" but avoids a many-to-many blow-up of the matched rows.
    prior_keys = prior[match_cols].drop_duplicates()
    new_alerts = (
        current.merge(prior_keys, how='left', indicator=True, on=match_cols)
        .query('_merge == "left_only"')
        .drop(columns='_merge')
    )

    report_cols = match_cols + ["MatchStrings"]
    # Same bookkeeping values on every returned row, in output column order.
    summary = {
        'prior_apk': prior_apk,
        'prior_alert_count': len(prior),
        'current_apk': current_apk,
        'current_alert_count': len(current),
        'new_alerts': len(new_alerts),
    }

    if len(new_alerts) > 0:
        temp_alerts = new_alerts[report_cols].copy()
        for column, value in summary.items():
            temp_alerts[column] = value
    else:
        # Placeholder row so the apk pair still shows up with zero new alerts.
        temp_alerts = pd.DataFrame(
            [[None] * len(report_cols) + list(summary.values())],
            columns=report_cols + list(summary),
        )

    return temp_alerts


In [7]:
# Group by FilePath to calculate statistics for each file
# (not used below in this cell; kept in case later cells rely on it)
grouped_df = df.groupby('FilePath')

# Number of alerts per risk level for every (apk, project, version)
alert_counts = (
    df.groupby(['apk', 'project', 'version'])['RiskLevel']
    .value_counts()
    .unstack(fill_value=0)
    .reset_index(drop=False)
)

# Sort by project, then by parsed version, so that consecutive rows of one
# project are consecutive releases.
alert_counts_sort = alert_counts.sort_values(
    by=['project', 'version'],
    key=lambda col: col if col.name == 'project' else col.map(version.parse),
).reset_index(drop=True)

# Drop '.spdx.json' entries. regex=False matches the text literally (with the
# default regex matching, each '.' would match any character). Re-indexing
# gives an independent frame with gap-free positions, so adding columns below
# neither writes into a slice nor trips over removed rows.
alert_counts_filter = alert_counts_sort[
    ~alert_counts_sort['apk'].str.contains('.spdx.json', regex=False)
].reset_index(drop=True)

# Rows of one project are contiguous after the sort, so "previous row of the
# same project" is exactly the previous row within the project group.
by_project = alert_counts_filter.groupby('project', sort=False)
previous_apk = by_project['apk'].shift()
low_delta = by_project['LOW'].diff()
medium_delta = by_project['MEDIUM'].diff()

# set the prior apk for later use (None for the first version of a project)
alert_counts_filter['prior_apk'] = [apk if pd.notna(apk) else None for apk in previous_apk]

# Deltas against the prior version of the same project (NaN for the first one)
alert_counts_filter['LOW_DELTA'] = low_delta
alert_counts_filter['MEDIUM_DELTA'] = medium_delta

# matching columns for alerts
columns = ['RiskScore', 'BehaviorDescription', 'BehaviorRiskScore', 'RiskLevel', 'RuleURL', 'ID', 'RuleName']

# Collect the per-pair results and concatenate once instead of growing a
# DataFrame inside the loop.
alert_frames = []
for index, row in alert_counts_filter.iterrows():
    if pd.notna(row['prior_apk']):
        temp_results = get_new_alerts(prior_apk=row['prior_apk'],
                                      current_apk=row['apk'],
                                      match_cols=columns)
        alert_frames.append(temp_results)

# pd.concat rejects an empty list; keep the empty-frame result in that case.
new_alerts = pd.concat(alert_frames) if alert_frames else pd.DataFrame()

### New alerts in malicious versions

In [8]:
# Keep only the version transitions whose newer package ("current_apk") is
# one of the listed malicious builds.
new_alerts_malware = new_alerts.merge(
    malicious_versions,
    how="inner",
    left_on="current_apk",
    right_on="apk",
)

new_alerts_malware

Unnamed: 0,RiskScore,BehaviorDescription,BehaviorRiskScore,RiskLevel,RuleURL,ID,RuleName,MatchStrings,prior_apk,prior_alert_count,current_apk,current_alert_count,new_alerts,apk,malicious_version
0,2.0,references 'C2 related tools' tool,2.0,MEDIUM,https://github.com/chainguard-dev/malcontent/b...,3P/threat_hunting/c2/related/tools,C2_related_tools_offensive_tool_keyword,"[runShellCode, runShellcode]",controller-gen-0.13.0-r0.apk,68,controller-gen-0.15.0-r0.apk,67,1,controller-gen-0.15.0-r0.apk,True
1,2.0,access raw generic block devices,2.0,MEDIUM,https://github.com/chainguard-dev/malcontent/b...,kernel/dev/block/device,dev_sd,[/dev/sdastarwarsp],gobump-0.7.5-r0.apk,30,gobump-0.7.6-r0.apk,32,2,gobump-0.7.6-r0.apk,True
2,2.0,references 'dd' tool,2.0,MEDIUM,https://github.com/chainguard-dev/malcontent/b...,3P/threat_hunting/dd,dd_greyware_tool_keyword,[dd if=/dev/zero],gobump-0.7.5-r0.apk,30,gobump-0.7.6-r0.apk,32,2,gobump-0.7.6-r0.apk,True
3,,,,,,,,,logstash-exporter-1.6.2-r0.apk,61,logstash-exporter-1.6.3-r0.apk,61,0,logstash-exporter-1.6.3-r0.apk,True
4,,,,,,,,,prometheus-beat-exporter-0.2.0-r0.apk,63,prometheus-beat-exporter-0.3.0-r0.apk,63,0,prometheus-beat-exporter-0.3.0-r0.apk,True
5,,,,,,,,,cosign-2.2.3-r0.apk,126,cosign-2.2.4-r0.apk,126,0,cosign-2.2.4-r0.apk,True
6,,,,,,,,,step-0.26.2-r0.apk,119,step-0.27.0-r0.apk,119,0,step-0.27.0-r0.apk,True
7,2.0,ps exec,2.0,MEDIUM,https://github.com/chainguard-dev/malcontent/b...,process/list,ps_exec,"[#!, ps ax]",swagger-0.30.3-r0.apk,84,swagger-0.30.4-r0.apk,84,1,swagger-0.30.4-r0.apk,True
8,,,,,,,,,grafana-agent-operator-0.40.5-r0.apk,96,grafana-agent-operator-0.41.0-r0.apk,94,0,grafana-agent-operator-0.41.0-r0.apk,True
9,2.0,kill and remove,2.0,MEDIUM,https://github.com/chainguard-dev/malcontent/b...,combo/backdoor/kill_rm,kill_and_remove,"[pkill, rm -rf]",terragrunt-0.66.6-r0.apk,132,terragrunt-0.66.7-r0.apk,133,3,terragrunt-0.66.7-r0.apk,True
