In [None]:
import os
import tarfile
import ast
import json
import pandas as pd

In [None]:
repo_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples"

# Archive suffix -> tarfile open mode for each supported compression method.
TAR_MODES = {".tar.gz": "r:gz", ".tar.bz2": "r:bz2", ".tar.xz": "r:xz"}


def safe_tar_members(tar_ref, dest_dir):
    """Yield only the members of ``tar_ref`` that are safe to unpack into ``dest_dir``.

    The archives handled here are untrusted, so a member is dropped when it is
    anything other than a regular file or a directory (links, devices, FIFOs),
    or when its name would resolve to a location outside ``dest_dir``
    (absolute names, ``..`` components). Each dropped member is reported on
    stdout.

    Args:
        tar_ref: An open ``tarfile.TarFile``.
        dest_dir: Directory the archive is going to be extracted into; it does
            not have to exist yet.

    Yields:
        ``tarfile.TarInfo`` objects that are safe to pass to ``extractall``.
    """
    dest_root = os.path.realpath(dest_dir)
    for member in tar_ref.getmembers():
        if not (member.isfile() or member.isdir()):
            print(f"Skipping non-regular member {member.name!r}")
            continue
        # realpath collapses '..' and follows pre-existing symlinks, so the
        # comparison is made against where the data would really be written.
        target = os.path.realpath(os.path.join(dest_root, member.name))
        if os.path.commonpath([dest_root, target]) != dest_root:
            print(f"Skipping member outside destination {member.name!r}")
            continue
        yield member


# Find all supported tar archives in the repository
tar_files = []
for root, dirs, files in os.walk(repo_dir):
    for file in files:
        if file.endswith(tuple(TAR_MODES)):
            tar_files.append(os.path.join(root, file))

# Unpack each archive next to itself, in a directory named after the archive
# with its two-part suffix (e.g. ".tar.gz") removed.
for tar_file in tar_files:
    mode = next(
        (tar_mode for suffix, tar_mode in TAR_MODES.items() if tar_file.endswith(suffix)),
        'r',
    )
    extract_dir = os.path.splitext(os.path.splitext(tar_file)[0])[0]

    try:
        with tarfile.open(tar_file, mode) as tar_ref:
            tar_ref.extractall(
                extract_dir,
                members=safe_tar_members(tar_ref, extract_dir),
            )
        print(f"Extracted {tar_file} to {extract_dir}")
    except (tarfile.TarError, OSError, EOFError) as e:
        # A damaged or truncated archive must not stop the remaining ones
        # from being processed.
        print(f"Failed to extract {tar_file}: {e}")

print("Extraction process completed.")

In [None]:
# Root directories of the two datasets whose package archives are counted below.
# NOTE(review): absolute, machine-specific paths — adjust for your environment.
dataset_1_dir = "/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples"
dataset_2_dir = "/mnt/volume_nyc1_01/pypi_malregistry"  

def count_package_files(directory, extensions=(".tar.gz", ".tar.bz2", ".tar.xz")):
    """Count the package archives found anywhere under ``directory``.

    Args:
        directory: Root of the tree to walk. A path that does not exist
            yields a count of 0 because ``os.walk`` produces nothing for it.
        extensions: File-name suffix, or iterable of suffixes, that identify
            a package archive. Defaults to the three tarball suffixes.

    Returns:
        int: Number of files whose name ends with one of ``extensions``.
    """
    # A bare string must be wrapped, otherwise tuple() would split it into
    # single characters.
    suffixes = (extensions,) if isinstance(extensions, str) else tuple(extensions)
    return sum(
        1
        for _root, _dirs, files in os.walk(directory)
        for file_name in files
        if file_name.endswith(suffixes)
    )

# Tally the archives of dataset 1 first, then dataset 2.
count_1, count_2 = (
    count_package_files(dataset_root)
    for dataset_root in (dataset_1_dir, dataset_2_dir)
)

# Report both totals, one per line.
print(
    f"Number of packages in dataset 1: {count_1}",
    f"Number of packages in dataset 2: {count_2}",
    sep="\n",
)

In [None]:
def parse_setup_py(setup_py_path):
    """Statically extract the keyword arguments of top-level ``setup(...)`` calls.

    The file is parsed with ``ast`` only; it is never imported or executed.
    Both the bare form ``setup(...)`` and the attribute form
    ``something.setup(...)`` (e.g. ``setuptools.setup(...)``) are recognised,
    but only when the call is a statement at module top level.

    Args:
        setup_py_path: Path of the ``setup.py`` file to inspect.

    Returns:
        dict | None: Keyword name -> value. Values that are Python literals
        are evaluated with ``ast.literal_eval``; any other expression is
        stored as its ``ast.dump`` string. The dict is empty when no matching
        call exists. ``None`` is returned when the file cannot be read,
        decoded or parsed.
    """
    try:
        with open(setup_py_path, 'r', encoding='utf-8') as f:
            tree = ast.parse(f.read(), filename=setup_py_path)
    except SyntaxError as e:
        print(f"SyntaxError in {setup_py_path}: {e}")
        return None
    except (OSError, ValueError, RecursionError, MemoryError) as e:
        # ValueError covers UnicodeDecodeError (file is not UTF-8) and source
        # containing null bytes; OSError covers unreadable files such as
        # dangling symlinks; the last two cover pathologically nested source.
        print(f"Could not parse {setup_py_path}: {e}")
        return None

    setup_args = {}
    for node in tree.body:
        if not (isinstance(node, ast.Expr) and isinstance(node.value, ast.Call)):
            continue

        func = node.value.func
        is_setup_call = (
            (isinstance(func, ast.Name) and func.id == 'setup')
            or (isinstance(func, ast.Attribute) and func.attr == 'setup')
        )
        if not is_setup_call:
            continue

        for kwarg in node.value.keywords:
            try:
                value = ast.literal_eval(kwarg.value)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Not a plain literal (or a malformed one, e.g. a dict with an
                # unhashable key): keep a textual dump of the expression.
                value = ast.dump(kwarg.value)
            # NOTE: kwarg.arg is None for a `**mapping` argument, so such an
            # argument is stored under the key None.
            setup_args[kwarg.arg] = value

    return setup_args

def convert_setup_to_json(dataset_dir):
    """Write a ``setup.json`` next to every ``setup.py`` found under ``dataset_dir``.

    Each ``setup.py`` is analysed with ``parse_setup_py``; the resulting
    argument dict is written as indented JSON to ``setup.json`` in the same
    directory, replacing any existing file of that name. Files for which
    ``parse_setup_py`` returns ``None`` are skipped, as are argument dicts
    that cannot be serialised.

    Args:
        dataset_dir: Root of the directory tree to scan.

    Returns:
        None. Progress and failures are reported on stdout.
    """
    setup_py_files = [
        os.path.join(root, file)
        for root, _, files in os.walk(dataset_dir)
        for file in files
        if file == 'setup.py'
    ]

    for setup_py in setup_py_files:
        setup_args = parse_setup_py(setup_py)
        if setup_args is None:
            continue  # Skip this setup.py file because it could not be parsed

        # Serialise in memory before touching the disk so that a failure can
        # never leave a truncated setup.json behind. Literal values JSON has
        # no representation for (sets, bytes, complex, ...) are stored as
        # their repr; unsupported dict keys still raise and are skipped.
        try:
            payload = json.dumps(setup_args, indent=2, default=repr)
        except (TypeError, ValueError) as e:
            print(f'Could not serialize arguments from {setup_py}: {e}')
            continue

        json_path = os.path.join(os.path.dirname(setup_py), 'setup.json')
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json_file.write(payload)
        print(f'Converted {setup_py} to {json_path}')

# Directory tree that is scanned for setup.py files; point `dataset_dir` at
# your own packages directory.
# NOTE(review): absolute, machine-specific path.
dataset_dir = '/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/'
convert_setup_to_json(dataset_dir)


In [25]:
def read_json_files(directory):
    """Load every ``setup.json`` under ``directory`` into one DataFrame.

    Args:
        directory: Root of the directory tree to walk.

    Returns:
        pandas.DataFrame: One row per successfully parsed ``setup.json``,
        built by passing the list of decoded JSON documents to
        ``pd.DataFrame``. Empty when no file is found. Files that are not
        valid UTF-8 JSON are reported on stdout and skipped.
    """
    json_data_list = []

    for root, _dirs, files in os.walk(directory):
        if 'setup.json' not in files:
            continue

        file_path = os.path.join(root, 'setup.json')
        try:
            # The files are written as UTF-8, so read them the same way
            # instead of relying on the locale's default encoding.
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data_list.append(json.load(f))
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # One truncated or malformed file must not abort the whole load.
            print(f"Skipping unreadable {file_path}: {e}")

    return pd.DataFrame(json_data_list)

# Directory tree searched for setup.json files.
# NOTE(review): absolute, machine-specific path.
directory = '/mnt/volume_nyc1_01/Backstabbers-Knife-Collection/samples/pypi/'

# Collect every setup.json found under `directory` into a single DataFrame
df = read_json_files(directory)

# Preview the first rows of the DataFrame
df.head()


                  name                                           packages  \
0      discord-wbehook                                  [discord-wbehook]   
1                  NaN                                                NaN   
2            pycryptro  Call(func=Name(id='find_packages', ctx=Load())...   
3     discorrd-webhook                                 [discorrd-webhook]   
4                  NaN                                                NaN   
...                ...                                                ...   
2979               NaN                                                NaN   
2980               NaN                                                NaN   
2981      reqinstaller  Call(func=Name(id='find_packages', ctx=Load())...   
2982  robloxpinreaderr  Call(func=Name(id='find_packages', ctx=Load())...   
2983          gitpyhon                                         [gitpyhon]   

                             version license         author  \
0           