In [1]:
import os
import tarfile
import ast
import json
import pandas as pd
import math
from collections import Counter
import numpy as np
from scipy.stats import entropy
import logging
import sys
import xml.etree.ElementTree as ET
import re
from lxml import etree

# Handler that writes log records to stdout so they appear in the cell output.
logstd = logging.StreamHandler(stream=sys.stdout)

# Configure the root logger: INFO and above, timestamped, stdout handler only.
logging.basicConfig(
    level=logging.INFO,
    handlers=[logstd],
    datefmt="%Y-%m-%d %H:%M:%S %Z",
    format="%(asctime)s %(levelname)s %(name)s:%(lineno)d - %(message)s",
)

# Root logger, available to the rest of the notebook.
log = logging.getLogger()

In [2]:
def find_setup_json_files(directory):
    """Return the path of every file named ``setup.json`` under ``directory``.

    The tree is traversed with ``os.walk``; each hit is returned as the walk
    root joined with the file name, in traversal order.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(directory)
        for name in names
        if name == 'setup.json'
    ]

def is_valid_json_file(file_path):
    """Report whether ``file_path`` can be opened and parsed as UTF-8 JSON.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file is read and ``json.load`` succeeds; False when the
        file cannot be opened or read, is not valid UTF-8, or is not valid
        JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            json.load(file)
    except (OSError, ValueError):
        # OSError: missing/unreadable path, so one bad entry cannot abort a
        # directory-wide count. ValueError covers json.JSONDecodeError and
        # UnicodeDecodeError, which are both subclasses of it.
        return False
    return True

def count_valid_json_files(directory):
    """Count the ``setup.json`` files under ``directory`` that parse as JSON.

    Every file named ``setup.json`` found by ``os.walk`` is checked with
    ``is_valid_json_file``; only those that pass are counted.
    """
    return sum(
        1
        for current_dir, _, names in os.walk(directory)
        for name in names
        if name == "setup.json"
        and is_valid_json_file(os.path.join(current_dir, name))
    )

def count_package_files(directory):
    """Count compressed tarballs under ``directory``.

    A file is counted when its name ends with ``.tar.gz``, ``.tar.bz2`` or
    ``.tar.xz``; the whole tree is traversed with ``os.walk``.
    """
    archive_suffixes = (".tar.gz", ".tar.bz2", ".tar.xz")
    total = 0
    for _current_dir, _subdirs, names in os.walk(directory):
        total += len([name for name in names if name.endswith(archive_suffixes)])
    return total

# NOTE: this redefines the function of the same name defined earlier in this
# cell; being the later definition, it is the one in effect after the cell runs.
def is_valid_json_file(file_path):
    """Report whether ``file_path`` can be opened and parsed as UTF-8 JSON.

    Args:
        file_path: Path of the file to check.

    Returns:
        True when the file is read and ``json.load`` succeeds; False when the
        file cannot be opened or read, is not valid UTF-8, or is not valid
        JSON.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            json.load(file)
    except (OSError, ValueError):
        # OSError: missing/unreadable path, so one bad entry cannot abort a
        # directory-wide count. ValueError covers json.JSONDecodeError and
        # UnicodeDecodeError, which are both subclasses of it.
        return False
    return True

# NOTE: this redefines the function of the same name defined earlier in this
# cell; being the later definition, it is the one in effect after the cell runs.
def count_valid_json_files(directory):
    """Return how many ``setup.json`` files under ``directory`` are valid JSON.

    Validity is decided by ``is_valid_json_file`` for each ``setup.json``
    encountered while walking the tree.
    """
    valid_total = 0
    for current_dir, _, names in os.walk(directory):
        for name in names:
            if name != "setup.json":
                continue
            if is_valid_json_file(os.path.join(current_dir, name)):
                valid_total += 1
    return valid_total

# NOTE: this redefines the function of the same name defined earlier in this
# cell; being the later definition, it is the one in effect after the cell runs.
def find_setup_json_files(directory):
    """Collect the paths of all ``setup.json`` files below ``directory``.

    Paths are built by joining each ``os.walk`` root with the file name and
    are returned in traversal order.
    """
    matches = []
    for current_dir, _subdirs, names in os.walk(directory):
        matches.extend(
            os.path.join(current_dir, name) for name in names if name == 'setup.json'
        )
    return matches

def find_python_files(directory):
    """Recursively list ``.py`` files under ``directory``, skipping dotfiles.

    Entries are visited in ``os.scandir`` order. A regular file is kept when
    its name ends with ``.py`` and does not start with ``.``; sub-directories
    are descended into by a recursive call.
    """
    matches = []
    for item in os.scandir(directory):
        if item.is_dir():
            matches += find_python_files(item.path)
            continue
        name = item.name
        if item.is_file() and name.endswith('.py') and not name.startswith('.'):
            matches.append(item.path)
    return matches

def read_json_files(directory, verbose=True):
    """Load every ``setup.json`` under ``directory`` into one DataFrame.

    Args:
        directory: Root of the tree to search.
        verbose: When True (the default) print each file's path and parsed
            content as it is read.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully parsed
        ``setup.json`` whose top-level value is a JSON object. Files that
        cannot be read or parsed, or whose top-level value is not an object,
        are reported on stdout and skipped.
    """
    records = []

    for root, _dirs, files in os.walk(directory):
        if 'setup.json' not in files:
            continue
        file_path = os.path.join(root, 'setup.json')

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers JSONDecodeError and UnicodeDecodeError; a
            # single bad file must not abort loading of the whole dataset.
            print(f"Skipping {file_path}: {e}")
            continue

        if not isinstance(json_data, dict):
            # Only JSON objects map onto DataFrame rows with named columns.
            print(f"Skipping {file_path}: top-level JSON value is not an object")
            continue

        if verbose:
            print(f"Read {file_path}: {json_data}")  # Debug statement
        records.append(json_data)

    return pd.DataFrame(records)


# Root directory of the dataset processed by the cells below.
# Swap the active assignment with the commented-out one to switch datasets.
# dataset_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/"
dataset_dir = "/mnt/volume_nyc1_01/benignPyPI/"

Count zipped packages to determine useable amount

In [None]:
# Roots of the two collections whose archives are counted.
dataset_1_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples"
dataset_2_dir = "/mnt/volume_nyc1_01/pypi_malregistry"

# Tally the compressed tarballs under each root, in order.
count_1, count_2 = (
    count_package_files(dataset_root)
    for dataset_root in (dataset_1_dir, dataset_2_dir)
)

print(f"Number of packages in dataset 1: {count_1}")
print(f"Number of packages in dataset 2: {count_2}")

Count JSON files to determine the usable amount

In [None]:
# Roots of the two collections whose setup.json files are counted.
dataset_1_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples"
dataset_2_dir = "/mnt/volume_nyc1_01/pypi_malregistry"

# Count the setup.json files that parse as JSON under each root, in order.
count_1, count_2 = (
    count_valid_json_files(dataset_root)
    for dataset_root in (dataset_1_dir, dataset_2_dir)
)

print(f"Number of valid setup.json files in dataset 1: {count_1}")
print(f"Number of valid setup.json files in dataset 2: {count_2}")

Calculate Shannon Entropy and append to JSON

In [None]:
def shannon_entropy(directory):
    """Compute the mean per-file character entropy for each package.

    For every ``setup.json`` found under ``directory``, the ``.py`` files
    below its parent directory are read as text (UTF-8, undecodable bytes
    ignored), the base-2 Shannon entropy of each file's character
    distribution is computed with ``entropy``, and the values are averaged.
    The average is stored under ``"average_entropy"`` in that ``setup.json``.

    Returns:
        dict mapping the base name of the directory containing each
        ``setup.json`` to its average entropy. Packages for which no Python
        file could be measured are omitted.
    """
    averages_by_package = {}

    for setup_file_path in find_setup_json_files(directory):
        package_path = os.path.dirname(setup_file_path)

        entropy_sum = 0
        files_measured = 0
        for file_path in find_python_files(package_path):
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
                    # Relative frequency of each distinct character in the file.
                    char_counts = np.array(list(Counter(text).values()))
                    entropy_sum += entropy(char_counts / len(text), base=2)
                    files_measured += 1
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

        if files_measured == 0:
            # Nothing measured: leave this setup.json untouched.
            continue

        average_entropy = entropy_sum / files_measured
        averages_by_package[os.path.basename(package_path)] = average_entropy

        try:
            with open(setup_file_path, 'r+', encoding='utf-8', errors='ignore') as setup_file:
                try:
                    setup_data = json.load(setup_file)
                    setup_data["average_entropy"] = average_entropy
                    # Rewrite in place: rewind, dump, then cut off any
                    # leftover bytes from the previous, longer content.
                    setup_file.seek(0)
                    json.dump(setup_data, setup_file, indent=4)
                    setup_file.truncate()
                    print(f"Updated {setup_file_path} with average entropy: {average_entropy}")
                except json.JSONDecodeError as json_err:
                    print(f"JSON decode error in {setup_file_path}: {json_err}")
        except Exception as e:
            print(f"Error updating {setup_file_path}: {e}")

    return averages_by_package

# List the setup.json files that will be annotated.
setup_json_files = find_setup_json_files(dataset_dir)

if setup_json_files:
    print("Found setup.json files:")
    for file in setup_json_files:
        print(file)
else:
    print("No setup.json files found in the specified directory.")

package_entropies = shannon_entropy(dataset_dir)
# The loop variable is deliberately not named `entropy`: that name is the
# function imported from scipy.stats, which shannon_entropy() calls. Rebinding
# it to a float here would break any later re-run of shannon_entropy().
for package, average_entropy in package_entropies.items():
    print(f"Shannon entropy of {package}: {average_entropy}")

Construct AST and store in XML

In [None]:
def construct_ast(file_path):
    """Parse the Python source in ``file_path`` into an ``ast`` tree.

    The file is read as UTF-8 with undecodable bytes ignored.

    Args:
        file_path: Path of the Python source file.

    Returns:
        The parsed ``ast.Module``, or ``None`` when the source cannot be
        parsed (the reason is printed).
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    try:
        return ast.parse(content)
    except SyntaxError as e:
        print(f"SyntaxError in {file_path}: {e}")
        return None
    except (ValueError, RecursionError) as e:
        # Depending on the interpreter version, source containing NUL bytes is
        # rejected with ValueError rather than SyntaxError, and very deeply
        # nested code can exhaust the recursion limit. Treat both as
        # "unparseable" so one file cannot abort a whole batch.
        print(f"{type(e).__name__} in {file_path}: {e}")
        return None

def _convert(node, parent):
    if isinstance(node, ast.AST):
        node_name = node.__class__.__name__
        element = ET.SubElement(parent, node_name)
        for field, value in ast.iter_fields(node):
            field_elem = ET.SubElement(element, field)
            _convert(value, field_elem)
    elif isinstance(node, list):
        for item in node:
            item_elem = ET.SubElement(parent, 'item')
            _convert(item, item_elem)
    else:
        parent.text = str(node)

def ast_to_xml(node):
    """Build an XML element tree for ``node`` and return its root element.

    The root element is named after the node's class; ``_convert`` then adds
    the node itself (and everything below it) as a child of that root.
    """
    document_root = ET.Element(node.__class__.__name__)
    _convert(node, document_root)
    return document_root

# NOTE: this redefines the scandir-based find_python_files defined earlier in
# the notebook. Unlike that version, this one does not skip dot-prefixed files.
def find_python_files(directory):
    """Return the path of every file ending in ``.py`` under ``directory``.

    The tree is traversed with ``os.walk``; paths are the walk root joined
    with the file name.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _, names in os.walk(directory)
        for name in names
        if name.endswith('.py')
    ]

def save_xml(xml, file_path):
    """Serialise ``xml`` next to ``file_path``, swapping the extension for ``.xml``.

    The element is rendered to a unicode string and written as UTF-8;
    characters that cannot be encoded are replaced rather than raising.
    """
    serialized = ET.tostring(xml, encoding='unicode', method='xml')
    stem, _extension = os.path.splitext(file_path)
    with open(stem + '.xml', 'w', encoding='utf-8', errors='replace') as out:
        out.write(serialized)

# Convert every Python file in the dataset to an XML dump of its AST.
python_files = find_python_files(dataset_dir)

for file in python_files:
    tree = construct_ast(file)
    if tree is None:
        # Unparseable source: construct_ast already reported it.
        continue
    xml = ast_to_xml(tree)
    save_xml(xml, file)
    print(f"XML representation saved for {file}")

Count number of .py files

In [None]:
def count_python_files(directory):
    """Recursively count the files ending in ``.py`` under ``directory``.

    Directories are descended into by a recursive call; every regular file
    whose name ends with ``.py`` (dot-prefixed ones included) adds one.
    """
    total = 0
    for item in os.scandir(directory):
        if item.is_dir():
            total += count_python_files(item.path)
        elif item.is_file() and item.name.endswith('.py'):
            total += 1
    return total

def count_python_files_in_packages(directory):
    """Count ``.py`` files per immediate sub-directory and record the counts.

    Each direct sub-directory of ``directory`` is treated as one package. Its
    Python file count is stored under ``"python_file_count"`` in every
    ``setup.json`` found beneath it. Progress and errors are printed.

    Returns:
        dict mapping each sub-directory name to its Python file count.
    """
    counts_by_package = {}

    print(f"Scanning dataset directory: {directory}")
    for entry in os.scandir(directory):
        if not entry.is_dir():
            print(f"Skipping non-directory entry: {entry.name}")
            continue

        package_path = entry.path
        print(f"Scanning package directory: {package_path}")
        python_files_count = count_python_files(package_path)
        counts_by_package[entry.name] = python_files_count

        setup_json_files = find_setup_json_files(package_path)
        if not setup_json_files:
            print(f"No setup.json found in {package_path}")
            continue

        for setup_json_path in setup_json_files:
            try:
                with open(setup_json_path, 'r') as file:
                    data = json.load(file)

                data['python_file_count'] = python_files_count

                with open(setup_json_path, 'w') as file:
                    json.dump(data, file, indent=4)

                print(f"Updated {setup_json_path} with python_file_count: {python_files_count}")
            except Exception as e:
                print(f"Error updating {setup_json_path}: {e}")

    return counts_by_package

# Count the Python files of every package directory under dataset_dir; the
# call also writes each count into the package's setup.json file(s).
package_python_file_counts = count_python_files_in_packages(dataset_dir)

# Print a per-package summary of the counts.
for package, count in package_python_file_counts.items():
    print(f"Package '{package}' has {count} Python files.")

Calculate the size of each package in bytes

In [None]:
def calculate_package_sizes(directory):
    """Measure the on-disk size of each package, ignoring its tarballs.

    For every ``setup.json`` under ``directory``, the sizes of all files below
    its parent directory are summed, except files ending in ``.tar.gz``,
    ``.tar.bz2`` or ``.tar.xz``. The total (bytes) is stored under
    ``"package_size"`` in that ``setup.json``.

    Returns:
        dict mapping the base name of the directory containing each
        ``setup.json`` to the summed size in bytes.
    """
    archive_suffixes = ('.tar.gz', '.tar.bz2', '.tar.xz')
    sizes_by_package = {}

    for setup_json_path in find_setup_json_files(directory):
        package_dir = os.path.dirname(setup_json_path)
        package_name = os.path.basename(package_dir)

        # Add up every file that is not one of the compressed archives.
        package_size = 0
        for root, _dirs, files in os.walk(package_dir):
            for filename in files:
                if filename.endswith(archive_suffixes):
                    continue
                package_size += os.path.getsize(os.path.join(root, filename))
        sizes_by_package[package_name] = package_size

        try:
            # Load the current metadata.
            with open(setup_json_path, 'r') as file:
                data = json.load(file)
            print(f"Original setup.json data for {package_name}: {data}")

            # Record the measured size.
            data['package_size'] = package_size
            print(f"Updated setup.json data for {package_name}: {data}")

            # Persist the change.
            with open(setup_json_path, 'w') as f:
                json.dump(data, f, indent=4)
            print(f"Successfully updated {setup_json_path} with package size: {package_size}")
        except Exception as e:
            print(f"Error updating {setup_json_path}: {e}")

    return sizes_by_package

# Measure every package under dataset_dir (the call also writes each size
# into the corresponding setup.json), then print the sizes in bytes.
package_sizes = calculate_package_sizes(dataset_dir)
for package_name, size_in_bytes in package_sizes.items():
    print(f"Package '{package_name}' size: {size_in_bytes} bytes")

Remove unwanted keys from setup.json

In [None]:
def remove__key(file_path, key):
    """Delete ``key`` from the top level of the JSON file at ``file_path``.

    The file is rewritten (indent 4) only when the key is present. The
    outcome is printed, and any exception is reported rather than raised.
    """
    try:
        with open(file_path, 'r') as file:
            data = json.load(file)

        if key not in data:
            print(f" {key} not found in {file_path}")
            return

        del data[key]
        with open(file_path, 'w') as file:
            json.dump(data, file, indent=4)

        print(f"Removed {key} from {file_path}")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Find all setup.json files in the directory
setup_json_files = find_setup_json_files(dataset_dir)

# Remove the 'containsIP' key from each setup.json file.
# Change the second argument to drop a different key.
for file_path in setup_json_files:
    remove__key(file_path, 'containsIP')

Prepend the directory name to setup.json

In [None]:
def prepend_directory_name_to_setup_json(directory):
    """Put a ``"directory_name"`` entry first in every ``setup.json``.

    For each ``setup.json`` below ``directory`` the value is the first path
    component of its location relative to ``directory``. The file is
    rewritten with ``"directory_name"`` as the first key, followed by the
    existing keys in their original order.

    Re-running is safe: an existing ``"directory_name"`` entry is replaced by
    the freshly computed value. Files that cannot be read, parsed or written
    are reported on stdout and skipped.

    Args:
        directory: Root of the tree to process.
    """
    for root, dirs, files in os.walk(directory):
        if 'setup.json' not in files:
            continue
        setup_json_path = os.path.join(root, 'setup.json')

        # First component of the path relative to the dataset root.
        relative_path = os.path.relpath(root, directory)
        directory_name = relative_path.split(os.sep)[0]

        try:
            with open(setup_json_path, 'r') as file:
                data = json.load(file)

            # In {"directory_name": x, **data} a key already present in `data`
            # would win over x, silently keeping a stale value; drop it first.
            data.pop("directory_name", None)
            data = {"directory_name": directory_name, **data}

            with open(setup_json_path, 'w') as file:
                json.dump(data, file, indent=4)
        except Exception as e:
            print(f"Error updating {setup_json_path}: {e}")
            continue

        print(f"Prepended directory name '{directory_name}' to {setup_json_path}")

# Tag every setup.json under dataset_dir with the name of the top-level
# directory it sits beneath.
prepend_directory_name_to_setup_json(dataset_dir)


Parse .xml files for features

In [7]:
import os
import json
import re
from lxml import etree

# Text patterns applied to the text content of XML elements.
_IP_ADDRESS_PATTERN = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
# NOTE(review): this also matches dotted identifiers such as "os.path";
# tighten it (e.g. with a TLD list) if the feature should mean real domains.
_DOMAIN_PATTERN = re.compile(r'\b(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b')
_BYTESTRING_PATTERN = re.compile(r"b'[^']*'")
# A base64 candidate is a delimited run of at least five 4-character groups
# (20+ characters) with optional padding. A pattern made only of optional
# parts would match the empty string and flag every non-empty text. This is
# still a heuristic: long alphanumeric or slash-separated tokens can match.
_BASE64_PATTERN = re.compile(
    r'(?<![A-Za-z0-9+/=])'
    r'(?:[A-Za-z0-9+/]{4}){5,}'
    r'(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?'
    r'(?![A-Za-z0-9+/=])'
)

# (feature key, pattern) pairs checked against element text.
_TEXT_FEATURE_PATTERNS = (
    ('contains_ip', _IP_ADDRESS_PATTERN),
    ('contains_domain', _DOMAIN_PATTERN),
    ('contains_bytestrings', _BYTESTRING_PATTERN),
    ('contains_base64', _BASE64_PATTERN),
)

# Feature keys in the order they are written to setup.json.
_FEATURE_KEYS = (
    'contains_ip',
    'contains_domain',
    'contains_bytestrings',
    'contains_base64',
    'contains_eval',
    'contains_import_subprocess',
    'contains_import_os',
    'contains_import_network_modules',
    'contains_os_environ_access',
)

# Top-level packages counted as network modules.
_NETWORK_MODULES = frozenset({'socket', 'requests', 'http', 'urllib'})


def _imported_module_names(import_element):
    """Return the module names an ``Import``/``ImportFrom`` element refers to."""
    if import_element.tag == 'ImportFrom':
        # `from X import y` imports from module X; the alias names are only
        # the imported attributes. Relative imports (level > 0) are skipped.
        if import_element.findtext('level') not in (None, '0'):
            return []
        module = import_element.findtext('module')
        return [module] if module and module != 'None' else []
    return [
        name_elem.text
        for name_elem in import_element.iterfind('names/item/alias/name')
        if name_elem.text
    ]


def _scan_xml_tree(xml_root, features):
    """Set to 1, in ``features``, every indicator found anywhere in the tree."""
    for element in xml_root.iter():
        text = element.text
        if text:
            for key, pattern in _TEXT_FEATURE_PATTERNS:
                if not features[key] and pattern.search(text):
                    features[key] = 1

        tag = element.tag
        if tag == 'Call':
            # A call to a bare name is serialised as Call/func/Name/id.
            if element.findtext('func/Name/id') == 'eval':
                features['contains_eval'] = 1
        elif tag in ('Import', 'ImportFrom'):
            for module_name in _imported_module_names(element):
                # Compare the top-level package so "os.path" or
                # "urllib.request" count, while e.g. "oslo" does not.
                top_level = module_name.split('.')[0]
                if top_level == 'os':
                    features['contains_import_os'] = 1
                elif top_level == 'subprocess':
                    features['contains_import_subprocess'] = 1
                elif top_level in _NETWORK_MODULES:
                    features['contains_import_network_modules'] = 1
        elif tag == 'Attribute':
            # `os.environ` is Attribute(attr='environ', value=Name(id='os')),
            # serialised as Attribute/attr and Attribute/value/Name/id.
            if (not features['contains_os_environ_access']
                    and element.findtext('attr') == 'environ'
                    and element.findtext('value/Name/id') == 'os'):
                print(f"Detected os.environ access in: {etree.tostring(element, pretty_print=True)}")
                features['contains_os_environ_access'] = 1


def extract_features_from_xml(directory):
    """Derive binary indicators from each package's AST XML files.

    Every immediate sub-directory of ``directory`` is treated as one package.
    All ``.xml`` files beneath it are parsed with lxml in recovery mode and
    scanned for:

    * text indicators: IP address, domain-like string, bytes literal, and
      base64-like string;
    * structural indicators: a call to ``eval``; imports of ``os``,
      ``subprocess`` or a network module (``socket``, ``requests``, ``http``,
      ``urllib``); and access to ``os.environ``.

    Each indicator is 1 if seen in any XML file of the package, else 0. The
    indicators are written into the first ``setup.json`` found beneath the
    package directory. Progress, errors and parse counts are printed.

    Args:
        directory: Dataset root containing one sub-directory per package.
    """
    setup_json_files = find_setup_json_files(directory)
    successful_parsing_count = 0
    unsuccessful_parsing_count = 0

    for package in os.listdir(directory):
        package_path = os.path.join(directory, package)
        if not os.path.isdir(package_path):
            continue

        xml_files = [
            os.path.join(root, file)
            for root, dirs, files in os.walk(package_path)
            for file in files
            if file.endswith(".xml")
        ]

        if xml_files:
            print(f"XML files found in {package_path}: {xml_files}")
        else:
            print(f"No XML files found in {package_path}")

        features = dict.fromkeys(_FEATURE_KEYS, 0)

        for xml_file in xml_files:
            try:
                parser = etree.XMLParser(recover=True)  # Ignore errors and continue parsing
                tree = etree.parse(xml_file, parser=parser)
                xml_root = tree.getroot()
            except (etree.XMLSyntaxError, OSError) as e:
                print(f"Error parsing {xml_file}: {e}")
                unsuccessful_parsing_count += 1
                continue

            if xml_root is None:
                print(f"Error: Root element is None for file {xml_file}")
                unsuccessful_parsing_count += 1
                continue

            successful_parsing_count += 1
            # The whole tree is always scanned: stopping early once some
            # indicators are set would hide the others later in the file.
            _scan_xml_tree(xml_root, features)

        # Find the corresponding setup.json file for this package. The
        # trailing separator stops "pkg" from matching ".../pkg-extra/...".
        package_prefix = os.path.join(package_path, '')
        setup_json_file = next(
            (path for path in setup_json_files if path.startswith(package_prefix)),
            None,
        )

        if setup_json_file:
            try:
                with open(setup_json_file, 'r') as f:
                    setup_data = json.load(f)

                setup_data.update(features)

                with open(setup_json_file, 'w') as f:
                    json.dump(setup_data, f, indent=4)
                print(f"Updated {setup_json_file} with extracted features")

            except Exception as e:
                print(f"Error updating {setup_json_file}: {e}")

    print(f"Successful XML file parsing count: {successful_parsing_count}")
    print(f"Unsuccessful XML file parsing count: {unsuccessful_parsing_count}")

# Run feature extraction over the dataset; the extracted indicators are
# written into the packages' setup.json files.
extract_features_from_xml(dataset_dir)

XML files found in /mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/discord-wbehook: ['/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/discord-wbehook/0.1/discord-wbehook-0.1/discord-wbehook-0.1/setup.xml', '/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/discord-wbehook/0.1/discord-wbehook-0.1/discord-wbehook-0.1/discord-wbehook/__init__.xml']
Updated /mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/discord-wbehook/0.1/discord-wbehook-0.1/discord-wbehook-0.1/setup.json with extracted features
XML files found in /mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/selfpepvirtual: ['/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/selfpepvirtual/1.67/selfpepvirtual-1.67/selfpepvirtual-1.67/setup.xml']
Updated /mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/selfpepvirtual/1.67/selfpepvirtual-1.67/selfpepvirtual-1.67/setup.json with extracted features
XML files found in /mnt/volume_nyc1_01/Bac

Construct dependency graphs

In [None]:
# TODO - Add code to construct dependency graphs

Convert to dataframe with option to save as CSV

In [8]:
# Optional: uncomment to point this cell at a different dataset root.
# dataset_dir = '/mnt/volume_nyc1_01/benignPyPI'

# Load every setup.json under dataset_dir into one DataFrame (one row per file).
df = read_json_files(dataset_dir)

# Optional: drop columns that are not wanted in the final table.
# columns_to_drop = ['package_size_bytes', 'package_size']
# df = df.drop(columns=columns_to_drop)

# Optional: save the DataFrame to CSV.
# df.to_csv('benignPyPI.csv', index=False)

# Optional: reload the DataFrame from a previously saved CSV.
# df = pd.read_csv('benignPyPI.csv')

# Show the first rows as the cell output.
df.head()

Read /mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/discord-wbehook/0.1/discord-wbehook-0.1/discord-wbehook-0.1/setup.json: {'directory_name': 'discord-wbehook', 'name': 'discord-wbehook', 'packages': ['discord-wbehook'], 'version': '0.1', 'average_entropy': 1.8801900148857729, 'python_file_count': 2, 'package_size': 1235605, 'contains_ip': 0, 'contains_domain': 1, 'contains_bytestrings': 0, 'contains_base64': 1, 'contains_eval': 0, 'contains_import_subprocess': 0, 'contains_import_os': 1, 'contains_import_network_modules': 0, 'contains_os_environ_access': 0}
Read /mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/selfpepvirtual/1.67/selfpepvirtual-1.67/selfpepvirtual-1.67/setup.json: {'directory_name': 'selfpepvirtual', 'average_entropy': 5.217188273480492, 'python_file_count': 1, 'package_size': 23510, 'contains_ip': 0, 'contains_domain': 1, 'contains_bytestrings': 0, 'contains_base64': 1, 'contains_eval': 0, 'contains_import_subprocess': 1, 'contains_impo

Unnamed: 0,directory_name,name,packages,version,average_entropy,python_file_count,package_size,contains_ip,contains_domain,contains_bytestrings,...,download_url,py_module,setup_requires,options,post_install,authors,test_suite,platform,bugtrack_url,package
0,discord-wbehook,discord-wbehook,[discord-wbehook],0.1,1.88019,2,1235605,0.0,1.0,0.0,...,,,,,,,,,,
1,selfpepvirtual,,,,5.217188,1,23510,0.0,1.0,0.0,...,,,,,,,,,,
2,pycryptro,pycryptro,"Call(func=Name(id='find_packages', ctx=Load())...",1.6,2.358721,4,69748,0.0,1.0,0.0,...,,,,,,,,,,
3,discorrd-webhook,discorrd-webhook,[discorrd-webhook],0.1,1.880265,2,1235613,0.0,1.0,0.0,...,,,,,,,,,,
4,tinyad1,,,,1.651381,3,11520,0.0,1.0,0.0,...,,,,,,,,,,
