In [16]:
import os
import tarfile
import ast
import json
import pandas as pd
import math
from collections import Counter
import numpy as np
from scipy.stats import entropy
import logging
import sys

logstd = logging.StreamHandler(sys.stdout)

logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s:%(lineno)d - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S %Z",
    level=logging.INFO,
    handlers=[logstd]
)

log = logging.getLogger()

In [None]:
repo_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples"
#Find all tar.gz files in the repository
tar_files = []
for root, dirs, files in os.walk(repo_dir):
    for file in files:
        if file.endswith(".tar.gz") or file.endswith(".tar.bz2") or file.endswith(".tar.xz"):
            tar_files.append(os.path.join(root, file))

#Unzip the files with different compression methods
for tar_file in tar_files:
    if tar_file.endswith(".tar.gz"):
        mode = 'r:gz'
    elif tar_file.endswith(".tar.bz2"):
        mode = 'r:bz2'
    elif tar_file.endswith(".tar.xz"):
        mode = 'r:xz'
    else:
        mode = 'r'
    
    try:
        with tarfile.open(tar_file, mode) as tar_ref:
            extract_dir = os.path.splitext(os.path.splitext(tar_file)[0])[0]
            tar_ref.extractall(extract_dir)
            print(f"Extracted {tar_file} to {extract_dir}")
    except (tarfile.ReadError, tarfile.CompressionError) as e:
        print(f"Failed to extract {tar_file}: {e}")

print("Extraction process completed.")

In [None]:
dataset_1_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples"
dataset_2_dir = "/mnt/volume_nyc1_01/pypi_malregistry"  

def count_package_files(directory):
    count = 0
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".tar.gz") or file.endswith(".tar.bz2") or file.endswith(".tar.xz"):
                count += 1
    return count

count_1 = count_package_files(dataset_1_dir)
count_2 = count_package_files(dataset_2_dir)

print(f"Number of packages in dataset 1: {count_1}")
print(f"Number of packages in dataset 2: {count_2}")

In [22]:
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

def parse_setup_py(setup_py_path):
    setup_args = {}
    try:
        with open(setup_py_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # Remove BOM if present
        if content.startswith('\ufeff'):
            content = content[1:]

        # Parse the content with AST
        tree = ast.parse(content, filename=setup_py_path)

        for node in tree.body:
            if (isinstance(node, ast.Expr) and isinstance(node.value, ast.Call) and
                    isinstance(node.value.func, ast.Name) and node.value.func.id == 'setup'):
                for kwarg in node.value.keywords:
                    try:
                        value = ast.literal_eval(kwarg.value)
                    except (ValueError, SyntaxError):
                        # Fallback to using the repr of the value if literal_eval fails
                        value = ast.dump(kwarg.value)
                    setup_args[kwarg.arg] = value

    except SyntaxError as e:
        log.error(f"SyntaxError in {setup_py_path}: {e}")
        return None

    return setup_args

def convert_setup_to_json(dataset_dir):
    setup_py_files = []
    for root, _, files in os.walk(dataset_dir):
        for file in files:
            if file == 'setup.py':
                setup_py_files.append(os.path.join(root, file))

    for setup_py in setup_py_files:
        setup_args = parse_setup_py(setup_py)
        if setup_args is None:
            continue  # Skip this setup.py file due to SyntaxError

        json_path = os.path.join(os.path.dirname(setup_py), 'setup.json')
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(setup_args, json_file, indent=2)
        log.debug(f'Converted {setup_py} to {json_path}')

# Replace 'dataset_dir' with the variable containing your packages directory path
dataset_dir = '/mnt/volume_nyc1_01/benignPyPI'
convert_setup_to_json(dataset_dir)


  long_description = re.sub(r'!\['+ext+'\]\((.*)\)', '!['+ext+']('+'https://raw.githubusercontent.com/{}/{}'.format(cfg['user'],cfg['lib_name'])+'/'+cfg['branch']+'/\\1)', long_description)
  code_block = '(:\n\n)?\.\. code-block::.*'
  if re.match('\s+', lines[n]):
  "sys.exit\(\n"
  "   load_entry_point\(([^\)]+)\)\(\)\n"
  "\)\n")
  "return self.api_call\(", "return await self.api_call(", async_source
  "Union\[Future, SlackResponse\]", "AsyncSlackResponse", async_source
  "class WebClient\(BaseClient\):",
  major_regex = """major_version\s*?=\s*?["']{1}(\d+)["']{1}"""
  minor_regex = """minor_version\s*?=\s*?["']{1}(\d+)["']{1}"""
  patch_regex = """patch_version\s*?=\s*?["']{1}(\d+)["']{1}"""
  match = re.search("__version_info__ = (\(.*\))", data)


In [18]:
dataset_1_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples"
dataset_2_dir = "/mnt/volume_nyc1_01/benignPyPI"

def is_valid_json_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            json.load(file)
        return True
    except (ValueError, json.JSONDecodeError):
        return False

def count_valid_json_files(directory):
    count = 0
    for root, _, files in os.walk(directory):
        for file in files:
            if file == "setup.json" and is_valid_json_file(os.path.join(root, file)):
                count += 1
    return count

count_1 = count_valid_json_files(dataset_1_dir)
count_2 = count_valid_json_files(dataset_2_dir)

print(f"Number of valid setup.json files in dataset 1: {count_1}")
print(f"Number of valid setup.json files in dataset 2: {count_2}")

Number of valid setup.json files in dataset 1: 2984
Number of valid setup.json files in dataset 2: 1529


In [25]:
def find_setup_json_files(directory):
    setup_json_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file == 'setup.json':
                setup_json_files.append(os.path.join(root, file))
    return setup_json_files
    
setup_json_files = find_setup_json_files(dataset_dir)
if setup_json_files:
    print("Found setup.json files:")
    for file in setup_json_files:
        print(file)
else:
    print("No setup.json files found in the specified directory.")


def find_python_files(directory):
    python_files = []
    for entry in os.scandir(directory):
        if entry.is_file() and entry.name.endswith('.py') and not entry.name.startswith('.'):
            python_files.append(entry.path)
        elif entry.is_dir():
            python_files.extend(find_python_files(entry.path))
    return python_files

def shannon_entropy(directory):
    package_entropies = {}
    setup_json_files = find_setup_json_files(directory)
    
    for setup_file_path in setup_json_files:
        package_path = os.path.dirname(setup_file_path)
        package_name = os.path.basename(package_path)
        
        package_entropy = 0
        total_files = 0
        
        python_files = find_python_files(package_path)
        for file_path in python_files:
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
                    freqs = np.array(list(Counter(text).values()))
                    probs = freqs / len(text)
                    entropy_value = entropy(probs, base=2)
                    package_entropy += entropy_value
                    total_files += 1
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
        
        if total_files > 0:
            average_entropy = package_entropy / total_files
            package_entropies[package_name] = average_entropy

            try:
                with open(setup_file_path, 'r+', encoding='utf-8', errors='ignore') as setup_file:
                    try:
                        setup_data = json.load(setup_file)
                        setup_data["average_entropy"] = average_entropy
                        setup_file.seek(0)
                        json.dump(setup_data, setup_file, indent=4)
                        setup_file.truncate()
                        print(f"Updated {setup_file_path} with average entropy: {average_entropy}")
                    except json.JSONDecodeError as json_err:
                        print(f"JSON decode error in {setup_file_path}: {json_err}")
            except Exception as e:
                print(f"Error updating {setup_file_path}: {e}")

    return package_entropies

package_entropies = shannon_entropy(dataset_dir)
for package, entropy in package_entropies.items():
    print(f"Shannon entropy of {package}: {entropy}")

Found setup.json files:
/mnt/volume_nyc1_01/benignPyPI/types-python-dateutil/types-python-dateutil-2.9.0.20240316/types-python-dateutil-2.9.0.20240316/setup.json
/mnt/volume_nyc1_01/benignPyPI/types-cachetools/types-cachetools-5.3.0.7/types-cachetools-5.3.0.7/setup.json
/mnt/volume_nyc1_01/benignPyPI/types-toml/types-toml-0.10.8.20240310/types-toml-0.10.8.20240310/setup.json
/mnt/volume_nyc1_01/benignPyPI/watchtower/watchtower-3.2.0/watchtower-3.2.0/setup.json
/mnt/volume_nyc1_01/benignPyPI/pre-commit/pre_commit-3.7.1/pre_commit-3.7.1/setup.json
/mnt/volume_nyc1_01/benignPyPI/rfc3986/rfc3986-2.0.0/rfc3986-2.0.0/setup.json
/mnt/volume_nyc1_01/benignPyPI/pynndescent/pynndescent-0.5.13/pynndescent-0.5.13/setup.json
/mnt/volume_nyc1_01/benignPyPI/pytest-forked/pytest-forked-1.6.0/pytest-forked-1.6.0/setup.json
/mnt/volume_nyc1_01/benignPyPI/googletrans/googletrans-3.0.0/googletrans-3.0.0/setup.json
/mnt/volume_nyc1_01/benignPyPI/watchdog/watchdog-4.0.1/watchdog-4.0.1/setup.json
/mnt/volume

In [26]:
def read_json_files(directory):
    # Initialize an empty list to hold the JSON data
    json_data_list = []

    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file == 'setup.json':
                # Construct the full file path
                file_path = os.path.join(root, file)
                
                # Read the JSON file
                with open(file_path, 'r') as f:
                    json_data = json.load(f)
                    json_data_list.append(json_data)

    # Convert the list of JSON data to a DataFrame
    df = pd.DataFrame(json_data_list)
    return df

# Specify the directory containing the packages
directory = '/mnt/volume_nyc1_01/benignPyPI'

# Call the function and get the DataFrame
df = read_json_files(directory)

# Display the DataFrame
df.head()

Unnamed: 0,name,version,description,long_description,long_description_content_type,url,project_urls,install_requires,packages,package_data,...,cmake_source_dir,dev_require,use_calver,cmake_languages,cmake_minimum_required_version,cmake_process_manifest_hook,cmake_install_dir,cmake_install_target,readme,use_2to3
0,"Name(id='name', ctx=Load())",2.9.0.20240316,"Name(id='description', ctx=Load())","Name(id='long_description', ctx=Load())",text/markdown,https://github.com/python/typeshed,{'GitHub': 'https://github.com/python/typeshed...,[],[dateutil-stubs],"{'dateutil-stubs': ['__init__.pyi', '_common.p...",...,,,,,,,,,,
1,"Name(id='name', ctx=Load())",5.3.0.7,"Name(id='description', ctx=Load())","Name(id='long_description', ctx=Load())",text/markdown,https://github.com/python/typeshed,{'GitHub': 'https://github.com/python/typeshed...,[],[cachetools-stubs],"{'cachetools-stubs': ['__init__.pyi', 'func.py...",...,,,,,,,,,,
2,"Name(id='name', ctx=Load())",0.10.8.20240310,"Name(id='description', ctx=Load())","Name(id='long_description', ctx=Load())",text/markdown,https://github.com/python/typeshed,{'GitHub': 'https://github.com/python/typeshed...,[],[toml-stubs],"{'toml-stubs': ['__init__.pyi', 'decoder.pyi',...",...,,,,,,,,,,
3,watchtower,3.2.0,Python CloudWatch Logging,Call(func=Attribute(value=Call(func=Name(id='o...,,https://github.com/kislyuk/watchtower,,"[boto3 >= 1.9.253, < 2]","Call(func=Name(id='find_packages', ctx=Load())...",{'watchtower': ['py.typed']},...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
