In [18]:
import os
import glob
import json
import pandas as pd
from packaging import version

In [19]:
# Step 1: Open and load the JSON file
with open('./dataset6_malware_functions.json', 'r') as file:
    mal_data = json.load(file)

In [20]:
def parse_json_to_dataframe(json_data, file_name):
    # List to store parsed data
    parsed_data = []
    
    apk_name = file_name.split("/")[-1].strip(".json")

    # Iterate through each capability
    for capability in json_data["capabilityInfo"]:
        package_name = capability.get("packageName", "")
        capability_type = capability.get("capabilityType", "")
        capability_name = capability.get("capability", "")
        dep_path = capability.get("depPath", "")
        package_dir = capability.get("packageDir", "")
        
        # Iterate through path info
        for path in capability.get("path", []):
            name = path.get("name", "")
            package = path.get("package", "")
            site_info = path.get("site", {})
            filename = site_info.get("filename", "")
            line = site_info.get("line", "")
            column = site_info.get("column", "")
            
            # Add each record as a row in the list
            parsed_data.append({
                "apk": apk_name,
                "package_name": package_name,
                "capability_name": capability_name,
                "capability_type": capability_type,
                "dep_path": dep_path,
                "package_dir": package_dir,
                "name": name,
                "package": package,
                "filename": filename,
                "line": line,
                "column": column
            })
    
    # Create a DataFrame from the parsed data
    df = pd.DataFrame(parsed_data)
    
    return df

In [21]:
directory_path = "./results/"

# List to store DataFrames from each JSON file
all_dataframes = []

# Get a list of all JSON files in the directory
json_files = glob.glob(os.path.join(directory_path, "*.json"))

for json_file in json_files:
    # if '6_swagger' in json_file:
    # Open and load the JSON file
    try:
        with open(json_file, 'r') as file:
            json_data = json.load(file)
        print(json_file)
        # Parse the JSON data and get the DataFrame
        df = parse_json_to_dataframe(json_data, json_file)
        
        # Append the DataFrame to the list
        all_dataframes.append(df)
    except:
        print(f"Error: {json_file}")

# Concatenate all DataFrames into one DataFrame
combined_df = pd.concat(all_dataframes, ignore_index=True)

combined_df["version"] = combined_df.apply(
    lambda x: '.'.join(x['apk'].split('_')[-3:]).strip('v').strip('-malware'),
    axis=1
)

combined_df["project"] = combined_df.apply(
    lambda x: '_'.join(x['apk'].split('_')[:2]),
    axis=1
)

combined_df_sorted = combined_df.sort_values(by=['project', 'version'], key=lambda col: col if col.name == 'project' else col.map(version.parse))

print("done")

./results/5_step_v0_27_2.json
./results/5_step_v0_27_0-malware.json
./results/2_logstash_v1_7_0.json
./results/7_grafana-agent-operator_v0_41_0-malware.json
./results/5_step_v0_26_1.json
./results/8_terragrunt_v0_66_7-malware.json
./results/2_logstash_v1_6_3-malware.json
./results/3_prometheus-beat-exporter_0_3_1.json
./results/6_swagger_v0_30_3.json
./results/4_cosign_v2_2_3.json
./results/4_cosign_v2_2_2.json
./results/0_controller-gen_v0_13_0.json
./results/2_logstash_v1_6_1.json
./results/7_grafana-agent-operator_v0_40_4.json
./results/0_controller-gen_v0_16_0.json
./results/4_cosign_v2_3_0.json
./results/8_terragrunt_v0_66_6.json
./results/1_gobump_v0_7_5.json
./results/8_terragrunt_v0_66_8.json
./results/5_step_v0_27_1.json
./results/6_swagger_v0_31_0.json
./results/0_controller-gen_v0_16_1.json
./results/3_prometheus-beat-exporter_0_3_0-malware.json
Error: ./results/9_litestream_v0_3_13.json
./results/8_terragrunt_v0_66_5.json
./results/7_grafana-agent-operator_v0_40_5.json
./re

In [47]:
ranks = [['0_controller-gen_v0_13_0', '0_controller-gen'],
 ['0_controller-gen_v0_14_0', '0_controller-gen'],
 ['0_controller-gen_v0_15_0-malware', '0_controller-gen'],
 ['0_controller-gen_v0_16_0', '0_controller-gen'],
 ['0_controller-gen_v0_16_1', '0_controller-gen'],
 ['1_gobump_v0_7_4', '1_gobump'],
 ['1_gobump_v0_7_5', '1_gobump'],
 ['1_gobump_v0_7_6-malware', '1_gobump'],
 ['1_gobump_v0_7_7', '1_gobump'],
 ['1_gobump_v0_8_0', '1_gobump'],
 ['2_logstash_v1_6_1', '2_logstash'],
 ['2_logstash_v1_6_2', '2_logstash'],
 ['2_logstash_v1_6_3-malware', '2_logstash'],
 ['2_logstash_v1_6_4', '2_logstash'],
 ['2_logstash_v1_7_0', '2_logstash'],
 ['3_prometheus-beat-exporter_0_1_2', '3_prometheus-beat-exporter'],
 ['3_prometheus-beat-exporter_0_2_0', '3_prometheus-beat-exporter'],
 ['3_prometheus-beat-exporter_0_3_0-malware', '3_prometheus-beat-exporter'],
 ['3_prometheus-beat-exporter_0_3_1', '3_prometheus-beat-exporter'],
 ['3_prometheus-beat-exporter_0_4_0', '3_prometheus-beat-exporter'],
 ['4_cosign_v2_2_2', '4_cosign'],
 ['4_cosign_v2_2_3', '4_cosign'],
 ['4_cosign_v2_2_4-malware', '4_cosign'],
 ['4_cosign_v2_3_0', '4_cosign'],
 ['4_cosign_v2_4_0', '4_cosign'],
 ['5_step_v0_26_1', '5_step'],
 ['5_step_v0_26_2', '5_step'],
 ['5_step_v0_27_0-malware', '5_step'],
 ['5_step_v0_27_1', '5_step'],
 ['5_step_v0_27_2', '5_step'],
 ['6_swagger_v0_30_2', '6_swagger'],
 ['6_swagger_v0_30_3', '6_swagger'],
 ['6_swagger_v0_30_4-malware', '6_swagger'],
 ['6_swagger_v0_30_5', '6_swagger'],
 ['6_swagger_v0_31_0', '6_swagger'],
 ['7_grafana-agent-operator_v0_40_4', '7_grafana-agent-operator'],
 ['7_grafana-agent-operator_v0_40_5', '7_grafana-agent-operator'],
 ['7_grafana-agent-operator_v0_41_0-malware', '7_grafana-agent-operator'],
 ['7_grafana-agent-operator_v0_41_1', '7_grafana-agent-operator'],
 ['7_grafana-agent-operator_v0_42_0', '7_grafana-agent-operator'],
 ['8_terragrunt_v0_66_5', '8_terragrunt'],
 ['8_terragrunt_v0_66_6', '8_terragrunt'],
 ['8_terragrunt_v0_66_7-malware', '8_terragrunt'],
 ['8_terragrunt_v0_66_8', '8_terragrunt'],
 ['8_terragrunt_v0_66_9', '8_terragrunt'],
 ['9_litestream_v0_3_9', '9_litestream'],
 ['9_litestream_v0_3_10', '9_litestream'],
 ['9_litestream_v0_3_11-malware', '9_litestream'],
 ['9_litestream_v0_3_12', '9_litestream'],
 ['9_litestream_v0_3_13', '9_litestream']]

ranks = pd.DataFrame(ranks, columns=['apk', 'project'])

ranks['version_rank'] = ranks.groupby('project').cumcount() + 1

ranks

Unnamed: 0,apk,project,version_rank
0,0_controller-gen_v0_13_0,0_controller-gen,1
1,0_controller-gen_v0_14_0,0_controller-gen,2
2,0_controller-gen_v0_15_0-malware,0_controller-gen,3
3,0_controller-gen_v0_16_0,0_controller-gen,4
4,0_controller-gen_v0_16_1,0_controller-gen,5
5,1_gobump_v0_7_4,1_gobump,1
6,1_gobump_v0_7_5,1_gobump,2
7,1_gobump_v0_7_6-malware,1_gobump,3
8,1_gobump_v0_7_7,1_gobump,4
9,1_gobump_v0_8_0,1_gobump,5


### Check if malware function is in the capslock data

In [22]:
def malware_lookup(malware_info, dep_path, apk):

    # we can match on the index for the malware data
    malware_apk = malware_info[int(apk.split('_')[0])]

    mal_detected = False

    for mal in malware_apk['funcs']:
        if mal in dep_path:
            mal_detected = True
    
    return mal_detected

In [23]:
# mal_ver = combined_df[combined_df['apk']=="6_swagger_v0_30_4-malware"]

combined_df_sorted["malware_detect"] = combined_df_sorted.apply(
    lambda x: malware_lookup(mal_data, x['dep_path'], x['apk']),
    axis=1
)

print(f"Malware detected: {len(combined_df_sorted[combined_df_sorted['malware_detect']==True])}")
print(f"Malware versions: {combined_df_sorted[combined_df_sorted['malware_detect']==True]['apk'].nunique()}")

Malware detected: 78
Malware versions: 6


### Set prior apk versions

In [48]:
prior_versions = combined_df_sorted[['project', 'version', 'apk']].drop_duplicates().reset_index(drop=True)

# set the prior apk for later use
prior_versions['prior_apk'] = prior_versions.apply(
    lambda row: prior_versions.loc[row.name - 1, 'apk']
    if row.name - 1 >= 0 and row['project'] == prior_versions.loc[row.name - 1, 'project']
    else None,
    axis=1
)

# merge back with combined_df_sorted
combined_df_sorted = combined_df_sorted.merge(prior_versions,
                                              on=['project', 'version', 'apk'],
                                              how='left').reset_index(drop=True)

prior_versions

Unnamed: 0,project,version,apk,prior_apk
0,0_controller-gen,0.13.0,0_controller-gen_v0_13_0,
1,0_controller-gen,0.15.0,0_controller-gen_v0_15_0-malware,0_controller-gen_v0_13_0
2,0_controller-gen,0.16.0,0_controller-gen_v0_16_0,0_controller-gen_v0_15_0-malware
3,0_controller-gen,0.16.1,0_controller-gen_v0_16_1,0_controller-gen_v0_16_0
4,1_gobump,0.7.4,1_gobump_v0_7_4,
5,1_gobump,0.7.5,1_gobump_v0_7_5,1_gobump_v0_7_4
6,1_gobump,0.7.6,1_gobump_v0_7_6-malware,1_gobump_v0_7_5
7,1_gobump,0.7.7,1_gobump_v0_7_7,1_gobump_v0_7_6-malware
8,1_gobump,0.8.0,1_gobump_v0_8_0,1_gobump_v0_7_7
9,2_logstash,1.6.1,2_logstash_v1_6_1,


In [25]:
combined_df_sorted.head()

Unnamed: 0,apk,package_name,capability_name,capability_type,dep_path,package_dir,name,package,filename,line,column,version,project,malware_detect,prior_apk
0,0_controller-gen_v0_13_0,main,CAPABILITY_FILES,CAPABILITY_TYPE_DIRECT,sigs.k8s.io/controller-tools/cmd/controller-ge...,sigs.k8s.io/controller-tools/cmd/controller-gen,sigs.k8s.io/controller-tools/cmd/controller-ge...,sigs.k8s.io/controller-tools/cmd/controller-gen,,,,0.13.0,0_controller-gen,False,
1,0_controller-gen_v0_13_0,main,CAPABILITY_FILES,CAPABILITY_TYPE_DIRECT,sigs.k8s.io/controller-tools/cmd/controller-ge...,sigs.k8s.io/controller-tools/cmd/controller-gen,(*encoding/json.Encoder).Encode,encoding/json,main.go,229.0,44.0,0.13.0,0_controller-gen,False,
2,0_controller-gen_v0_13_0,main,CAPABILITY_FILES,CAPABILITY_TYPE_DIRECT,sigs.k8s.io/controller-tools/cmd/controller-ge...,sigs.k8s.io/controller-tools/cmd/controller-gen,(*os.File).Write,os,stream.go,231.0,25.0,0.13.0,0_controller-gen,False,
3,0_controller-gen_v0_13_0,main,CAPABILITY_FILES,CAPABILITY_TYPE_TRANSITIVE,sigs.k8s.io/controller-tools/cmd/controller-ge...,sigs.k8s.io/controller-tools/cmd/controller-gen,sigs.k8s.io/controller-tools/cmd/controller-ge...,sigs.k8s.io/controller-tools/cmd/controller-gen,,,,0.13.0,0_controller-gen,False,
4,0_controller-gen_v0_13_0,main,CAPABILITY_FILES,CAPABILITY_TYPE_TRANSITIVE,sigs.k8s.io/controller-tools/cmd/controller-ge...,sigs.k8s.io/controller-tools/cmd/controller-gen,sigs.k8s.io/controller-tools/pkg/genall/help/p...,sigs.k8s.io/controller-tools/pkg/genall/help/p...,,,,0.13.0,0_controller-gen,False,


## Find new alerts between versions

In [26]:
def get_new_alerts(prior_apk, current_apk, match_cols):
    prior = combined_df_sorted[combined_df_sorted['apk']==prior_apk].reset_index(drop=True)
    current = combined_df_sorted[combined_df_sorted['apk']==current_apk].reset_index(drop=True)

    # find alerts that exist in the current apk, but not the prior apk
    new_alerts = current.merge(prior[match_cols], 
                               how='left', 
                               indicator=True, on=match_cols).query('_merge == "left_only"').drop(columns='_merge')
    
    match_cols = match_cols + ["malware_detect"]
    
    if len(new_alerts) > 0:
        temp_alerts = new_alerts[match_cols].copy()
        temp_alerts['prior_apk'] = prior_apk
        temp_alerts['prior_alert_count'] = len(prior)
        temp_alerts['current_apk'] = current_apk
        temp_alerts['current_alert_count'] = len(current)
        temp_alerts['new_alerts'] = len(new_alerts)
        temp_alerts['total_tp_count'] = len(current[current['malware_detect']==True])
        temp_alerts['tp_count'] = len(new_alerts[new_alerts['malware_detect']==True])
        temp_alerts['fp_count'] = len(new_alerts[new_alerts['malware_detect']==False])
    else:
        temp_alerts = pd.DataFrame([[None]*len(match_cols) + [prior_apk,
                                                              len(prior),
                                                              current_apk,
                                                              len(current),
                                                              len(new_alerts),
                                                              len(new_alerts[new_alerts['malware_detect']==True]),
                                                              len(new_alerts[new_alerts['malware_detect']==False])]], 
                                                              columns=match_cols + ['prior_apk',
                                                                                    'prior_alert_count',
                                                                                    'current_apk',
                                                                                    'current_alert_count',
                                                                                    'new_alerts',
                                                                                    'total_tp_count'
                                                                                    'tp_count',
                                                                                    'fp_count'])
        
    return temp_alerts


In [27]:
combined_df_sorted.columns

Index(['apk', 'package_name', 'capability_name', 'capability_type', 'dep_path',
       'package_dir', 'name', 'package', 'filename', 'line', 'column',
       'version', 'project', 'malware_detect', 'prior_apk'],
      dtype='object')

In [28]:
# matching columns for alerts
columns = ['dep_path', 'capability_name']

new_alerts = pd.DataFrame()

for index, row in prior_versions.iterrows():
    if row['prior_apk'] != None:
        temp_results = get_new_alerts(prior_apk = row['prior_apk'], 
                    current_apk = row['apk'], 
                    match_cols = columns)
    
        new_alerts = pd.concat([new_alerts, temp_results])

### Set category movement types

In [29]:
new_alerts['type'] = new_alerts.apply(
    lambda x: 'b2m' if 'malware' in x['current_apk'] and 'malware' not in x['prior_apk'] else None,
    axis=1
)

new_alerts['type'] = new_alerts.apply(
    lambda x: 'b2b' if 'malware' not in x['current_apk'] and 'malware' not in x['prior_apk'] else x['type'],
    axis=1
)

new_alerts['type'] = new_alerts.apply(
    lambda x: 'm2b' if 'malware' not in x['current_apk'] and 'malware' in x['prior_apk'] else x['type'],
    axis=1
)

In [30]:
new_alerts.head()

Unnamed: 0,dep_path,capability_name,malware_detect,prior_apk,prior_alert_count,current_apk,current_alert_count,new_alerts,total_tp_count,tp_count,fp_count,total_tp_counttp_count,type
610,sigs.k8s.io/controller-tools/cmd/controller-ge...,CAPABILITY_SYSTEM_CALLS,True,0_controller-gen_v0_13_0,251,0_controller-gen_v0_15_0-malware,254,57,8.0,8.0,49,,b2m
611,sigs.k8s.io/controller-tools/cmd/controller-ge...,CAPABILITY_SYSTEM_CALLS,True,0_controller-gen_v0_13_0,251,0_controller-gen_v0_15_0-malware,254,57,8.0,8.0,49,,b2m
612,sigs.k8s.io/controller-tools/cmd/controller-ge...,CAPABILITY_SYSTEM_CALLS,True,0_controller-gen_v0_13_0,251,0_controller-gen_v0_15_0-malware,254,57,8.0,8.0,49,,b2m
613,sigs.k8s.io/controller-tools/cmd/controller-ge...,CAPABILITY_SYSTEM_CALLS,True,0_controller-gen_v0_13_0,251,0_controller-gen_v0_15_0-malware,254,57,8.0,8.0,49,,b2m
614,sigs.k8s.io/controller-tools/cmd/controller-ge...,CAPABILITY_SYSTEM_CALLS,True,0_controller-gen_v0_13_0,251,0_controller-gen_v0_15_0-malware,254,57,8.0,8.0,49,,b2m


In [61]:
new_alerts_ranks = new_alerts.merge(ranks,
                            left_on=['current_apk'],
                            right_on=['apk'],
                            how='outer')

new_alerts_ranks[['current_apk', 'prior_apk', 'new_alerts', 'tp_count', 'fp_count', 'type', 'version_rank', 'project']].drop_duplicates()

keep_columns = ['new_alerts', 'tp_count', 'fp_count', 'project']
v2 = new_alerts_ranks[new_alerts_ranks['version_rank']==2][keep_columns].drop_duplicates()
v2.columns = ['v2_new_alerts', 'v2_tp_count', 'v2_fp_count', 'project']
v3 = new_alerts_ranks[new_alerts_ranks['version_rank']==3][keep_columns].drop_duplicates()
v3.columns = ['v3_new_alerts', 'v3_tp_count', 'v3_fp_count', 'project']
v4 = new_alerts_ranks[new_alerts_ranks['version_rank']==4][keep_columns].drop_duplicates()
v4.columns = ['v4_new_alerts', 'v4_tp_count', 'v4_fp_count', 'project']
v5 = new_alerts_ranks[new_alerts_ranks['version_rank']==5][keep_columns].drop_duplicates()
v5.columns = ['v5_new_alerts', 'v5_tp_count', 'v5_fp_count', 'project']

final_alerts = v2.merge(v3,
                        on=['project'],
                        how='outer')

final_alerts = final_alerts.merge(v4,
                        on=['project'],
                        how='outer')

final_alerts = final_alerts.merge(v5,
                        on=['project'],
                        how='outer')

final_alerts = final_alerts[['project', 
              'v2_new_alerts', 
              'v2_tp_count',
              'v3_new_alerts', 
              'v3_tp_count',
              'v4_new_alerts', 
              'v4_tp_count',
              'v5_new_alerts', 
              'v5_tp_count',]].fillna()



Unnamed: 0,project,v2_new_alerts,v2_tp_count,v3_new_alerts,v3_tp_count,v4_new_alerts,v4_tp_count,v5_new_alerts,v5_tp_count
0,0_controller-gen,,,57.0,8.0,26.0,0.0,0.0,
1,1_gobump,0.0,,9.0,9.0,0.0,,9.0,0.0
2,2_logstash,0.0,,0.0,,0.0,,3.0,0.0
3,3_prometheus-beat-exporter,,,73.0,2.0,0.0,,0.0,
4,4_cosign,0.0,,9.0,0.0,39.0,0.0,0.0,
5,5_step,74.0,0.0,125.0,0.0,0.0,,0.0,
6,6_swagger,0.0,,9.0,9.0,7.0,0.0,0.0,
7,7_grafana-agent-operator,12.0,0.0,116.0,25.0,10.0,0.0,0.0,
8,8_terragrunt,11.0,0.0,245.0,25.0,19.0,0.0,0.0,
9,9_litestream,,,,,,,,


In [35]:
malware_alerts = new_alerts[new_alerts['current_apk'].str.contains('malware')][['current_apk', 'current_alert_count', 'total_tp_count', 'new_alerts', 'tp_count', 'fp_count', 'type']].drop_duplicates()

malware_alerts

Unnamed: 0,current_apk,current_alert_count,total_tp_count,new_alerts,tp_count,fp_count,type
610,0_controller-gen_v0_15_0-malware,254,8.0,57,8.0,49,b2m
172,1_gobump_v0_7_6-malware,43,9.0,9,9.0,0,b2m
0,2_logstash_v1_6_3-malware,96,,0,,0,b2m
166,3_prometheus-beat-exporter_0_3_0-malware,186,2.0,73,2.0,71,b2m
937,4_cosign_v2_2_4-malware,193,0.0,9,0.0,9,b2m
213,5_step_v0_27_0-malware,677,0.0,125,0.0,125,b2m
848,6_swagger_v0_30_4-malware,97,9.0,9,9.0,0,b2m
111,7_grafana-agent-operator_v0_41_0-malware,375,25.0,116,25.0,91,b2m
170,8_terragrunt_v0_66_7-malware,2366,25.0,245,25.0,220,b2m


In [37]:
new_alerts[['current_apk', 'prior_apk', 'current_alert_count', 'total_tp_count', 'new_alerts', 'tp_count', 'fp_count', 'type']].drop_duplicates()

Unnamed: 0,current_apk,prior_apk,current_alert_count,total_tp_count,new_alerts,tp_count,fp_count,type
610,0_controller-gen_v0_15_0-malware,0_controller-gen_v0_13_0,254,8.0,57,8.0,49,b2m
1695,0_controller-gen_v0_16_0,0_controller-gen_v0_15_0-malware,272,0.0,26,0.0,26,m2b
0,0_controller-gen_v0_16_1,0_controller-gen_v0_16_0,272,,0,,0,b2b
0,1_gobump_v0_7_5,1_gobump_v0_7_4,34,,0,,0,b2b
172,1_gobump_v0_7_6-malware,1_gobump_v0_7_5,43,9.0,9,9.0,0,b2m
0,1_gobump_v0_7_7,1_gobump_v0_7_6-malware,34,,0,,0,m2b
0,1_gobump_v0_8_0,1_gobump_v0_7_7,36,0.0,9,0.0,9,b2b
0,2_logstash_v1_6_2,2_logstash_v1_6_1,96,,0,,0,b2b
0,2_logstash_v1_6_3-malware,2_logstash_v1_6_2,96,,0,,0,b2m
0,2_logstash_v1_6_4,2_logstash_v1_6_3-malware,96,,0,,0,m2b
