In [29]:
import os
import glob
import json
import pandas as pd

In [30]:
# Step 1: Open and load the JSON file
with open('./dataset6_malware_functions.json', 'r') as file:
    mal_data = json.load(file)

In [31]:
def parse_json_to_dataframe(json_data, file_name):
    # List to store parsed data
    parsed_data = []
    
    apk_name = file_name.split("/")[-1].strip(".json")

    # Iterate through each capability
    for capability in json_data["capabilityInfo"]:
        package_name = capability.get("packageName", "")
        capability_type = capability.get("capabilityType", "")
        capability_name = capability.get("capability", "")
        dep_path = capability.get("depPath", "")
        package_dir = capability.get("packageDir", "")
        
        # Iterate through path info
        for path in capability.get("path", []):
            name = path.get("name", "")
            package = path.get("package", "")
            site_info = path.get("site", {})
            filename = site_info.get("filename", "")
            line = site_info.get("line", "")
            column = site_info.get("column", "")
            
            # Add each record as a row in the list
            parsed_data.append({
                "apk": apk_name,
                "package_name": package_name,
                "capability_name": capability_name,
                "capability_type": capability_type,
                "dep_path": dep_path,
                "package_dir": package_dir,
                "name": name,
                "package": package,
                "filename": filename,
                "line": line,
                "column": column
            })
    
    # Create a DataFrame from the parsed data
    df = pd.DataFrame(parsed_data)
    
    return df

In [32]:
directory_path = "./results/"

# List to store DataFrames from each JSON file
all_dataframes = []

# Get a list of all JSON files in the directory
json_files = glob.glob(os.path.join(directory_path, "*.json"))

for json_file in json_files:
    # if '6_swagger' in json_file:
    # Open and load the JSON file
    try:
        with open(json_file, 'r') as file:
            json_data = json.load(file)
        print(json_file)
        # Parse the JSON data and get the DataFrame
        df = parse_json_to_dataframe(json_data, json_file)
        
        # Append the DataFrame to the list
        all_dataframes.append(df)
    except:
        print(f"Error: {json_file}")

# Concatenate all DataFrames into one DataFrame
combined_df = pd.concat(all_dataframes, ignore_index=True)

print("done")

./results/5_step_v0_27_2.json
./results/5_step_v0_27_0-malware.json
./results/2_logstash_v1_7_0.json
./results/7_grafana-agent-operator_v0_41_0-malware.json
./results/5_step_v0_26_1.json
./results/8_terragrunt_v0_66_7-malware.json
./results/2_logstash_v1_6_3-malware.json
./results/3_prometheus-beat-exporter_0_3_1.json
./results/6_swagger_v0_30_3.json
./results/4_cosign_v2_2_3.json
./results/4_cosign_v2_2_2.json
./results/0_controller-gen_v0_13_0.json
./results/2_logstash_v1_6_1.json
./results/7_grafana-agent-operator_v0_40_4.json
./results/0_controller-gen_v0_16_0.json
./results/4_cosign_v2_3_0.json
./results/8_terragrunt_v0_66_6.json
./results/1_gobump_v0_7_5.json
./results/8_terragrunt_v0_66_8.json
./results/5_step_v0_27_1.json
./results/6_swagger_v0_31_0.json
./results/0_controller-gen_v0_16_1.json
./results/3_prometheus-beat-exporter_0_3_0-malware.json
Error: ./results/9_litestream_v0_3_13.json
./results/8_terragrunt_v0_66_5.json
./results/7_grafana-agent-operator_v0_40_5.json
./re

### Check if malware function is in the capslock data

In [33]:
def malware_lookup(malware_info, dep_path, apk):

    # we can match on the index for the malware data
    malware_apk = malware_info[int(apk.split('_')[0])]

    mal_detected = False

    for mal in malware_apk['funcs']:
        if mal in dep_path:
            mal_detected = True
    
    return mal_detected

In [36]:
# mal_ver = combined_df[combined_df['apk']=="6_swagger_v0_30_4-malware"]

combined_df["malware_detect"] = combined_df.apply(
    lambda x: malware_lookup(mal_data, x['dep_path'], x['apk']),
    axis=1
)

print(f"Malware detected: {len(combined_df[combined_df['malware_detect']==True])}")
print(f"Malware versions: {combined_df[combined_df['malware_detect']==True]['apk'].nunique()}")
print("wait")

Malware detected: 78
Malware versions: 6
wait


In [9]:
combined_df.capability_name.nunique()

a = combined_df[combined_df['apk']=="6_swagger_v0_30_3"]
b = combined_df[combined_df['apk']=="6_swagger_v0_30_4-malware"]

print(len(a))
print(len(b))

unique_columns = ['dep_path']

a_clean = a[unique_columns].drop_duplicates()
b_clean = b[unique_columns].drop_duplicates()

# Find rows in DataFrame b that do not exist in DataFrame a based on the unique columns
diff_rows = b_clean.merge(a_clean, 
                    on=unique_columns, 
                    how='left', 
                    indicator=True).loc[lambda x: x['_merge'] == 'left_only']

# Drop the '_merge' column since it is not needed in the final output
diff_rows = diff_rows.drop(columns=['_merge'])

print(len(diff_rows))


print("wait")

95
97
3
Project: 1_app
Functions: test, test2
Project: 2_app
Functions: test3, test4


KeyboardInterrupt: 