In [13]:
import os
import tarfile
import ast
import json
import pandas as pd
import math
from collections import Counter
import numpy as np
from scipy.stats import entropy
import logging
import sys
import xml.etree.ElementTree as ET

# Handler that writes log records to stdout.
logstd = logging.StreamHandler(sys.stdout)

# Configure the root logger: timestamped records at INFO level and above,
# emitted through the stdout handler created above.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s:%(lineno)d - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S %Z",
    level=logging.INFO,
    handlers=[logstd]
)

# getLogger() without a name returns the root logger.
log = logging.getLogger()

In [17]:
def find_setup_json_files(directory):
    """Collect the path of every ``setup.json`` found beneath ``directory``.

    The tree is traversed with ``os.walk``; a file qualifies only when its
    name is exactly ``setup.json``.

    Args:
        directory: Root directory to search.

    Returns:
        list[str]: One path per match, formed by joining the containing
        directory with the file name, in ``os.walk`` order.
    """
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(directory)
        for name in names
        if name == 'setup.json'
    ]

def is_valid_json_file(file_path):
    """Report whether ``file_path`` can be read as UTF-8 and parsed as JSON.

    Args:
        file_path: Path of the file to check.

    Returns:
        bool: ``True`` when the file parses, ``False`` when opening, decoding
        or parsing raises ``ValueError`` (``json.JSONDecodeError`` is a
        subclass of it). Other exceptions are not caught.
    """
    parsed_ok = True
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            json.loads(handle.read())
    except ValueError:
        parsed_ok = False
    return parsed_ok

def count_valid_json_files(directory):
    """Count the ``setup.json`` files under ``directory`` that parse as JSON.

    Args:
        directory: Root directory, traversed with ``os.walk``.

    Returns:
        int: Number of files named exactly ``setup.json`` for which
        ``is_valid_json_file`` returns a truthy value.
    """
    return sum(
        1
        for folder, _, names in os.walk(directory)
        for name in names
        if name == "setup.json" and is_valid_json_file(os.path.join(folder, name))
    )

def count_package_files(directory):
    """Count compressed tar archives anywhere beneath ``directory``.

    A file is counted when its name ends with ``.tar.gz``, ``.tar.bz2`` or
    ``.tar.xz``.

    Args:
        directory: Root directory, traversed with ``os.walk``.

    Returns:
        int: Number of matching files.
    """
    archive_suffixes = (".tar.gz", ".tar.bz2", ".tar.xz")
    total = 0
    for _folder, _subdirs, names in os.walk(directory):
        # str.endswith accepts a tuple and matches any of its members.
        total += sum(name.endswith(archive_suffixes) for name in names)
    return total

# NOTE: is_valid_json_file is declared more than once in this cell; the
# definition that runs last is the one in effect.
def is_valid_json_file(file_path):
    """Tell whether the file at ``file_path`` holds parseable JSON.

    The file is opened as UTF-8 text and handed to ``json.load``.

    Args:
        file_path: Path of the file to check.

    Returns:
        bool: ``False`` if a ``ValueError`` (which includes
        ``json.JSONDecodeError``) is raised, ``True`` otherwise. Other
        exceptions are not caught.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as stream:
            json.load(stream)
    except ValueError:
        return False
    else:
        return True

# NOTE: count_valid_json_files is declared more than once in this cell; the
# definition that runs last is the one in effect.
def count_valid_json_files(directory):
    """Return how many ``setup.json`` files below ``directory`` are valid JSON.

    Args:
        directory: Root directory, traversed with ``os.walk``.

    Returns:
        int: Number of directories whose ``setup.json`` passes
        ``is_valid_json_file``.
    """
    valid_total = 0
    for folder, _, names in os.walk(directory):
        # A directory holds at most one entry with a given name.
        if "setup.json" not in names:
            continue
        if is_valid_json_file(os.path.join(folder, "setup.json")):
            valid_total += 1
    return valid_total

# NOTE: find_setup_json_files is declared more than once in this notebook;
# the definition that runs last is the one in effect.
def find_setup_json_files(directory):
    """List every ``setup.json`` located under ``directory``.

    Args:
        directory: Root directory, traversed with ``os.walk``.

    Returns:
        list[str]: Path of each file named exactly ``setup.json``, in
        ``os.walk`` order.
    """
    located = []
    for folder, _subdirs, names in os.walk(directory):
        # A directory holds at most one entry with a given name.
        if 'setup.json' in names:
            located.append(os.path.join(folder, 'setup.json'))
    return located

def find_python_files(directory):
    """Recursively gather ``.py`` files under ``directory``, skipping dot-files.

    Sub-directories are descended into as they are encountered, so their
    results appear in the list at the position of the directory entry.

    NOTE: a later cell defines another ``find_python_files`` (based on
    ``os.walk``) that does not skip names starting with ``.``; whichever
    definition ran last is the one in effect.

    Args:
        directory: Directory to scan with ``os.scandir``.

    Returns:
        list[str]: Paths of files whose name ends with ``.py`` and does not
        start with ``.``.
    """
    found = []
    for item in os.scandir(directory):
        if item.is_dir():
            found += find_python_files(item.path)
            continue
        name = item.name
        if item.is_file() and name.endswith('.py') and not name.startswith('.'):
            found.append(item.path)
    return found

def read_json_files(directory, verbose=True):
    """Load every ``setup.json`` under ``directory`` into one DataFrame.

    Files are read as UTF-8. A file that cannot be opened, decoded or parsed
    is reported and skipped, so a single bad file does not abort the load.

    Args:
        directory: Root directory, traversed with ``os.walk``.
        verbose: When true (the default), print each file's path and parsed
            content as it is read. Pass ``False`` on large datasets to avoid
            flooding the cell output.

    Returns:
        pandas.DataFrame: Built from the list of parsed JSON documents, one
        per file that was read successfully. Empty when nothing was loaded.
    """
    json_data_list = []

    for root, _dirs, files in os.walk(directory):
        # A directory holds at most one entry named setup.json.
        if 'setup.json' not in files:
            continue
        file_path = os.path.join(root, 'setup.json')
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
        except (OSError, ValueError) as err:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Skipping {file_path}: {err}")
            continue
        if verbose:
            print(f"Read {file_path}: {json_data}")  # Debug statement
        json_data_list.append(json_data)

    return pd.DataFrame(json_data_list)


# Root directory passed to find_setup_json_files, shannon_entropy and
# find_python_files in the cells below.
dataset_dir = "/mnt/volume_nyc1_01/benignPyPI"

Count the compressed package archives to determine how many are usable

In [None]:
# Roots of the two datasets whose archives are counted.
dataset_1_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples"
dataset_2_dir = "/mnt/volume_nyc1_01/pypi_malregistry"  

# count_package_files counts *.tar.gz / *.tar.bz2 / *.tar.xz files recursively.
count_1 = count_package_files(dataset_1_dir)
count_2 = count_package_files(dataset_2_dir)

print(f"Number of packages in dataset 1: {count_1}")
print(f"Number of packages in dataset 2: {count_2}")

Count the valid setup.json files to determine how many are usable

In [None]:
# Same two dataset roots as the archive-count cell, repeated so this cell can
# run on its own.
dataset_1_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples"
dataset_2_dir = "/mnt/volume_nyc1_01/pypi_malregistry"  

# NOTE: count_1 / count_2 are rebound here; they now hold setup.json counts,
# not the archive counts from the previous cell.
count_1 = count_valid_json_files(dataset_1_dir)
count_2 = count_valid_json_files(dataset_2_dir)

print(f"Number of valid setup.json files in dataset 1: {count_1}")
print(f"Number of valid setup.json files in dataset 2: {count_2}")

Calculate Shannon Entropy and append to JSON

In [None]:
def _text_entropy(text):
    """Return the character-level Shannon entropy of ``text`` in bits."""
    if not text:
        # An empty file has no symbols. Define its entropy as 0.0 explicitly
        # instead of dividing by len(text) == 0 and relying on how scipy
        # treats an empty probability vector.
        return 0.0
    freqs = np.array(list(Counter(text).values()))
    return float(entropy(freqs / len(text), base=2))


def shannon_entropy(directory):
    """Compute the mean per-file Shannon entropy for each package.

    A "package" is any directory under ``directory`` that contains a
    ``setup.json``. For each one, the character-level entropy (base 2) of
    every Python file returned by ``find_python_files`` is averaged, and the
    result is stored under the ``"average_entropy"`` key of that
    ``setup.json``.

    Args:
        directory: Root directory searched with ``find_setup_json_files``.

    Returns:
        dict[str, float]: Average entropy keyed by the base name of the
        directory holding ``setup.json``. Packages with no readable Python
        files are omitted. Directories sharing a base name overwrite each
        other in this mapping.
    """
    package_entropies = {}

    for setup_file_path in find_setup_json_files(directory):
        package_path = os.path.dirname(setup_file_path)
        package_name = os.path.basename(package_path)

        file_entropies = []
        for file_path in find_python_files(package_path):
            # Only the I/O is guarded, so a programming error in the entropy
            # computation is not misreported as a read failure.
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
            except OSError as e:
                print(f"Error reading {file_path}: {e}")
                continue
            file_entropies.append(_text_entropy(text))

        if not file_entropies:
            continue

        average_entropy = sum(file_entropies) / len(file_entropies)
        package_entropies[package_name] = average_entropy

        # Best-effort update of setup.json; failures are reported, not raised.
        try:
            with open(setup_file_path, 'r+', encoding='utf-8', errors='ignore') as setup_file:
                setup_data = json.load(setup_file)
                setup_data["average_entropy"] = average_entropy
                # Serialise before rewinding so a serialisation failure cannot
                # leave a half-overwritten file behind.
                payload = json.dumps(setup_data, indent=4)
                setup_file.seek(0)
                setup_file.write(payload)
                setup_file.truncate()
        except json.JSONDecodeError as json_err:
            print(f"JSON decode error in {setup_file_path}: {json_err}")
        except Exception as e:
            print(f"Error updating {setup_file_path}: {e}")
        else:
            print(f"Updated {setup_file_path} with average entropy: {average_entropy}")

    return package_entropies

# List the setup.json files that will be processed.
setup_json_files = find_setup_json_files(dataset_dir)

if setup_json_files:
    print("Found setup.json files:")
    for file in setup_json_files:
        print(file)
else:
    print("No setup.json files found in the specified directory.")

# Compute per-package average entropy; this also rewrites each setup.json.
package_entropies = shannon_entropy(dataset_dir)
# The loop variable must not be called `entropy`: that would rebind the
# module-level scipy.stats.entropy function that shannon_entropy calls, and
# every later run of shannon_entropy in this kernel would fail.
for package, avg_entropy in package_entropies.items():
    print(f"Shannon entropy of {package}: {avg_entropy}")

Construct AST and store in XML

In [None]:
def construct_ast(file_path):
    """Parse a Python source file into an ``ast`` tree.

    The file is read as UTF-8 with undecodable bytes dropped. Files that
    cannot be parsed are reported and skipped rather than aborting a batch
    run.

    Args:
        file_path: Path of the Python source file.

    Returns:
        ast.Module | None: The parsed tree, or ``None`` when ``ast.parse``
        rejects the content.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    try:
        return ast.parse(content)
    except SyntaxError as e:
        print(f"SyntaxError in {file_path}: {e}")
    except (ValueError, RecursionError) as e:
        # ast.parse raises ValueError for source containing NUL bytes on some
        # Python versions, and RecursionError for very deeply nested code.
        print(f"{type(e).__name__} in {file_path}: {e}")
    return None

def _convert(node, parent):
    if isinstance(node, ast.AST):
        node_name = node.__class__.__name__
        element = ET.SubElement(parent, node_name)
        for field, value in ast.iter_fields(node):
            field_elem = ET.SubElement(element, field)
            _convert(value, field_elem)
    elif isinstance(node, list):
        for item in node:
            item_elem = ET.SubElement(parent, 'item')
            _convert(item, item_elem)
    else:
        parent.text = str(node)

def ast_to_xml(node):
    """Build an ElementTree element representing ``node``.

    The returned root is named after ``node``'s class. ``_convert`` then adds
    the node beneath it, so for an AST node the root contains a single child
    carrying the same tag name.

    Args:
        node: Object to render, typically the tree from ``construct_ast``.

    Returns:
        xml.etree.ElementTree.Element: Root of the generated XML tree.
    """
    tag = node.__class__.__name__
    document_root = ET.Element(tag)
    _convert(node, document_root)
    return document_root

# NOTE: this redefines find_python_files from an earlier cell. Unlike that
# version, this one does not skip file names that start with '.'.
def find_python_files(directory):
    """Return the path of every ``.py`` file beneath ``directory``.

    Args:
        directory: Root directory, traversed with ``os.walk``.

    Returns:
        list[str]: Paths of all files whose name ends with ``.py``, in
        ``os.walk`` order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.py')
    ]

def save_xml(xml, file_path):
    """Write ``xml`` to a ``.xml`` file that sits next to ``file_path``.

    The target path is ``file_path`` with its extension replaced by ``.xml``.
    An existing file at that path is overwritten.

    Args:
        xml: ``xml.etree.ElementTree.Element`` to serialise.
        file_path: Path of the source file the XML was derived from.
    """
    base_path, _old_ext = os.path.splitext(file_path)
    target_path = base_path + '.xml'
    serialized = ET.tostring(xml, encoding='unicode', method='xml')
    # errors='replace' substitutes any character that cannot be encoded as UTF-8.
    with open(target_path, 'w', encoding='utf-8', errors='replace') as out:
        out.write(serialized)

# Every .py file under the dataset root.
python_files = find_python_files(dataset_dir)

# Parse each file and write its AST as XML next to the source file
# (same path with a .xml extension). Files that fail to parse are skipped.
for file in python_files:
    tree = construct_ast(file)
    if tree is not None:
        xml = ast_to_xml(tree)
        save_xml(xml, file)
        print(f"XML representation saved for {file}")

Count number of .py files

In [18]:
def count_python_files(directory):
    """Count ``.py`` files in ``directory`` and all of its sub-directories.

    Names starting with ``.`` are counted like any other file.

    Args:
        directory: Directory to scan with ``os.scandir``.

    Returns:
        int: Number of files whose name ends with ``.py``.
    """
    def _contribution(item):
        # Directories contribute their recursive total; a file contributes 1
        # when it is a .py file and 0 otherwise.
        if item.is_dir():
            return count_python_files(item.path)
        return int(item.is_file() and item.name.endswith('.py'))

    return sum(_contribution(item) for item in os.scandir(directory))

# NOTE: find_setup_json_files is also defined in an earlier cell; the
# definition that runs last is the one in effect.
def find_setup_json_files(directory):
    """Walk ``directory`` and return the paths of all ``setup.json`` files.

    Args:
        directory: Root directory, traversed with ``os.walk``.

    Returns:
        list[str]: Path of every file named exactly ``setup.json``, in
        ``os.walk`` order.
    """
    matches = []
    for folder, _subdirs, filenames in os.walk(directory):
        matches.extend(
            os.path.join(folder, fname)
            for fname in filenames
            if fname == 'setup.json'
        )
    return matches

def count_python_files_in_packages(dataset_directory):
    """Count Python files per package and record the count in ``setup.json``.

    Each immediate sub-directory of ``dataset_directory`` is treated as one
    package. Its recursive ``.py`` count (from ``count_python_files``) is
    written under the ``"python_file_count"`` key of every ``setup.json``
    found anywhere inside that sub-directory. Progress and failures are
    printed; a failure to update one file does not stop the scan.

    Args:
        dataset_directory: Directory whose immediate sub-directories are the
            packages.

    Returns:
        dict[str, int]: Python file count keyed by sub-directory name.
    """
    package_counts = {}

    print(f"Scanning dataset directory: {dataset_directory}")
    # The context manager closes the scandir iterator deterministically.
    with os.scandir(dataset_directory) as entries:
        for entry in entries:
            if not entry.is_dir():
                print(f"Skipping non-directory entry: {entry.name}")
                continue

            package_path = entry.path
            print(f"Scanning package directory: {package_path}")
            python_files_count = count_python_files(package_path)
            package_counts[entry.name] = python_files_count

            setup_json_files = find_setup_json_files(package_path)
            if not setup_json_files:
                print(f"No setup.json found in {package_path}")
                continue

            for setup_json_path in setup_json_files:
                # Best-effort update; failures are reported, not raised.
                try:
                    # Explicit UTF-8, matching the other setup.json readers
                    # in this notebook, instead of the platform default.
                    with open(setup_json_path, 'r', encoding='utf-8') as file:
                        data = json.load(file)

                    data['python_file_count'] = python_files_count
                    # Serialise before opening for writing: mode 'w'
                    # truncates immediately, so a later failure would
                    # otherwise leave an empty or partial file.
                    payload = json.dumps(data, indent=4)

                    with open(setup_json_path, 'w', encoding='utf-8') as file:
                        file.write(payload)

                    print(f"Updated {setup_json_path} with python_file_count: {python_files_count}")
                except Exception as e:
                    print(f"Error updating {setup_json_path}: {e}")

    return package_counts

# Dataset root; each immediate sub-directory is treated as one package.
dataset_directory = '/mnt/volume_nyc1_01/benignPyPI/'

# Counts .py files per package and rewrites each package's setup.json files
# with a python_file_count entry.
package_python_file_counts = count_python_files_in_packages(dataset_directory)

for package, count in package_python_file_counts.items():
    print(f"Package '{package}' has {count} Python files.")

Scanning dataset directory: /mnt/volume_nyc1_01/benignPyPI/
Scanning package directory: /mnt/volume_nyc1_01/benignPyPI/types-python-dateutil
Updated /mnt/volume_nyc1_01/benignPyPI/types-python-dateutil/types-python-dateutil-2.9.0.20240316/types-python-dateutil-2.9.0.20240316/setup.json with python_file_count: 1
Scanning package directory: /mnt/volume_nyc1_01/benignPyPI/iso8601
No setup.json found in /mnt/volume_nyc1_01/benignPyPI/iso8601
Scanning package directory: /mnt/volume_nyc1_01/benignPyPI/types-cachetools
Updated /mnt/volume_nyc1_01/benignPyPI/types-cachetools/types-cachetools-5.3.0.7/types-cachetools-5.3.0.7/setup.json with python_file_count: 1
Scanning package directory: /mnt/volume_nyc1_01/benignPyPI/cmdstanpy
No setup.json found in /mnt/volume_nyc1_01/benignPyPI/cmdstanpy
Scanning package directory: /mnt/volume_nyc1_01/benignPyPI/types-toml
Updated /mnt/volume_nyc1_01/benignPyPI/types-toml/types-toml-0.10.8.20240310/types-toml-0.10.8.20240310/setup.json with python_file_coun

Convert to dataframe with option to save as CSV

In [21]:
# Specify the directory containing the packages
# (same location as dataset_dir defined in an earlier cell)
directory = '/mnt/volume_nyc1_01/benignPyPI'

# Call the function and get the DataFrame
# (one row per setup.json found under `directory`)
df = read_json_files(directory)

# #save to CSV
# df.to_csv('benignPyPI.csv', index=False)

# #load df from CSV
# df = pd.read_csv('benignPyPI.csv')

# Display the DataFrame
df.head()

Read /mnt/volume_nyc1_01/benignPyPI/types-python-dateutil/types-python-dateutil-2.9.0.20240316/types-python-dateutil-2.9.0.20240316/setup.json: {'name': "Name(id='name', ctx=Load())", 'version': '2.9.0.20240316', 'description': "Name(id='description', ctx=Load())", 'long_description': "Name(id='long_description', ctx=Load())", 'long_description_content_type': 'text/markdown', 'url': 'https://github.com/python/typeshed', 'project_urls': {'GitHub': 'https://github.com/python/typeshed', 'Changes': 'https://github.com/typeshed-internal/stub_uploader/blob/main/data/changelogs/python-dateutil.md', 'Issue tracker': 'https://github.com/python/typeshed/issues', 'Chat': 'https://gitter.im/python/typing'}, 'install_requires': [], 'packages': ['dateutil-stubs'], 'package_data': {'dateutil-stubs': ['__init__.pyi', '_common.pyi', 'easter.pyi', 'parser/__init__.pyi', 'parser/isoparser.pyi', 'relativedelta.pyi', 'rrule.pyi', 'tz/__init__.pyi', 'tz/_common.pyi', 'tz/tz.pyi', 'utils.pyi', 'zoneinfo/__in

Unnamed: 0,name,version,description,long_description,long_description_content_type,url,project_urls,install_requires,packages,package_data,...,cmake_source_dir,dev_require,use_calver,cmake_languages,cmake_minimum_required_version,cmake_process_manifest_hook,cmake_install_dir,cmake_install_target,readme,use_2to3
0,"Name(id='name', ctx=Load())",2.9.0.20240316,"Name(id='description', ctx=Load())","Name(id='long_description', ctx=Load())",text/markdown,https://github.com/python/typeshed,{'GitHub': 'https://github.com/python/typeshed...,[],[dateutil-stubs],"{'dateutil-stubs': ['__init__.pyi', '_common.p...",...,,,,,,,,,,
1,"Name(id='name', ctx=Load())",5.3.0.7,"Name(id='description', ctx=Load())","Name(id='long_description', ctx=Load())",text/markdown,https://github.com/python/typeshed,{'GitHub': 'https://github.com/python/typeshed...,[],[cachetools-stubs],"{'cachetools-stubs': ['__init__.pyi', 'func.py...",...,,,,,,,,,,
2,"Name(id='name', ctx=Load())",0.10.8.20240310,"Name(id='description', ctx=Load())","Name(id='long_description', ctx=Load())",text/markdown,https://github.com/python/typeshed,{'GitHub': 'https://github.com/python/typeshed...,[],[toml-stubs],"{'toml-stubs': ['__init__.pyi', 'decoder.pyi',...",...,,,,,,,,,,
3,watchtower,3.2.0,Python CloudWatch Logging,Call(func=Attribute(value=Call(func=Name(id='o...,,https://github.com/kislyuk/watchtower,,"[boto3 >= 1.9.253, < 2]","Call(func=Name(id='find_packages', ctx=Load())...",{'watchtower': ['py.typed']},...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
