In [1]:
import requests
import subprocess
import sys
import os
import tarfile
import time
import json

In [2]:
def get_top_pypi_packages(
    url='https://hugovk.github.io/top-pypi-packages/top-pypi-packages-30-days.json',
    limit=1500,
    timeout=30,
):
    """Return the project names listed in a top-PyPI-packages JSON dump.

    The document at ``url`` must be a JSON object with a ``rows`` list whose
    entries each carry a ``project`` key; names are returned in the order the
    document lists them.

    Args:
        url: Location of the JSON document.
        limit: Maximum number of names to return, counted from the start of
            ``rows``. ``None`` returns every row.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook kernel indefinitely.

    Returns:
        A list of project-name strings.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
        KeyError: If the document lacks ``rows`` or a row lacks ``project``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()

    rows = response.json()['rows']
    if limit is not None:
        rows = rows[:limit]

    return [row['project'] for row in rows]

# Fetch the ranked project names once; the download cell below reuses `top_packages`.
top_packages = get_top_pypi_packages()
# NOTE(review): this prints the whole list (up to 1500 names) into the cell output.
print(top_packages)


['boto3', 'botocore', 'urllib3', 'requests', 'wheel', 'certifi', 'idna', 'typing-extensions', 'charset-normalizer', 'pip', 'setuptools', 'python-dateutil', 's3transfer', 'packaging', 'aiobotocore', 'pyyaml', 'six', 's3fs', 'fsspec', 'numpy', 'cryptography', 'grpcio-status', 'google-api-core', 'cffi', 'pycparser', 'pypular', 'pandas', 'importlib-metadata', 'pyasn1', 'rsa', 'zipp', 'attrs', 'click', 'pydantic', 'protobuf', 'jmespath', 'platformdirs', 'pytz', 'jinja2', 'colorama', 'markupsafe', 'pyjwt', 'awscli', 'tomli', 'wrapt', 'google-auth', 'googleapis-common-protos', 'filelock', 'cachetools', 'requests-oauthlib', 'oauthlib', 'pluggy', 'virtualenv', 'pyarrow', 'docutils', 'jsonschema', 'pyasn1-modules', 'pytest', 'exceptiongroup', 'aiohttp', 'pyparsing', 'sqlalchemy', 'scipy', 'isodate', 'multidict', 'psutil', 'pyopenssl', 'yarl', 'iniconfig', 'decorator', 'soupsieve', 'pygments', 'tzdata', 'async-timeout', 'beautifulsoup4', 'frozenlist', 'aiosignal', 'tqdm', 'grpcio', 'pillow', 'req

In [3]:
def _download_sdist(package, package_dir, timeout):
    """Download the first sdist listed for ``package`` into ``package_dir``.

    Returns the saved file name, or ``None`` when the PyPI metadata lists no
    file whose ``packagetype`` is ``"sdist"``. Network errors, HTTP error
    statuses and malformed metadata propagate as exceptions to the caller.
    """
    metadata = requests.get(f"https://pypi.org/pypi/{package}/json", timeout=timeout)
    metadata.raise_for_status()
    for url in metadata.json()["urls"]:
        if url["packagetype"] == "sdist":
            download_url = url["url"]
            print(f"Downloading {package} from {download_url}")
            response = requests.get(download_url, timeout=timeout)
            # Without this check an HTTP error page would be saved as the archive.
            response.raise_for_status()
            file_name = os.path.basename(download_url)
            with open(os.path.join(package_dir, file_name), 'wb') as file:
                file.write(response.content)
            print(f"Downloaded {package} into {package_dir}/{file_name}")
            return file_name
    return None


def dl_packages(packages, target_directory, max_attempts=2, initial_delay=1, timeout=60):
    """Download the source distribution of each package from PyPI.

    For every name in ``packages`` a sub-directory ``target_directory/<name>``
    is created and the first sdist listed by the PyPI JSON API is saved into
    it. A failed attempt is retried for the same package with exponential
    backoff; once the attempts are exhausted the package is skipped and the
    loop moves on, so one bad package never aborts the whole run.

    Args:
        packages: Sequence of PyPI project names.
        target_directory: Directory that receives one sub-directory per package.
        max_attempts: Total number of tries per package (first try included).
        initial_delay: Seconds to sleep before the first retry; doubled after
            every further failure.
        timeout: Per-request timeout in seconds.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    count = 0
    for package in packages:
        package_dir = os.path.join(target_directory, package)
        os.makedirs(package_dir, exist_ok=True)
        # Retry state is per package, so each package gets its own attempts.
        delay = initial_delay
        for attempt in range(1, max_attempts + 1):
            try:
                file_name = _download_sdist(package, package_dir, timeout)
            except Exception as e:
                # Deliberately broad: this is a best-effort bulk download and any
                # failure (network, HTTP status, bad JSON, disk) should only
                # cost this one package.
                print(f"Failed to download {package}: {e}")
                if attempt == max_attempts:
                    print(f"Max retries reached for {package}. Skipping.")
                else:
                    print(f"Retrying in {delay} seconds...")
                    time.sleep(delay)
                    delay *= 2  # Exponential backoff
            else:
                if file_name is None:
                    print(f"No sdist available for {package}. Skipping.")
                else:
                    count += 1
                    print(f"Progress: {count}/{len(packages)}")
                break

# Root directory for the downloads; dl_packages creates one sub-directory per package inside it.
# NOTE(review): hard-coded absolute path; the extraction cell repeats the same literal as `repo_dir`.
download_directory = '/mnt/volume_nyc1_01/benignPyPI/'

# Download the sdist of every name in `top_packages`; returns nothing, progress is printed.
dl_packages(top_packages, download_directory)

Downloading boto3 from https://files.pythonhosted.org/packages/94/42/f34ab93ea175b4e6c96e73a3b3f24d073f63418971925c8149d41f6a252a/boto3-1.34.136.tar.gz
Downloaded boto3 into /mnt/volume_nyc1_01/benignPyPI/boto3/boto3-1.34.136.tar.gz
Progress: 1/1500
Downloading botocore from https://files.pythonhosted.org/packages/3c/ec/09d963aa91a1d09a87c21c014da5092a1eccde8b44cd51bbe8a27e3576fd/botocore-1.34.136.tar.gz
Downloaded botocore into /mnt/volume_nyc1_01/benignPyPI/botocore/botocore-1.34.136.tar.gz
Progress: 2/1500
Downloading urllib3 from https://files.pythonhosted.org/packages/43/6d/fa469ae21497ddc8bc93e5877702dca7cb8f911e337aca7452b5724f1bb6/urllib3-2.2.2.tar.gz
Downloaded urllib3 into /mnt/volume_nyc1_01/benignPyPI/urllib3/urllib3-2.2.2.tar.gz
Progress: 3/1500
Downloading requests from https://files.pythonhosted.org/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz
Downloaded requests into /mnt/volume_nyc1_01/benignPyPI/requests/requests-2

: 

In [None]:
# Recognised archive suffixes mapped to the matching tarfile read mode.
TAR_MODES = {".tar.gz": "r:gz", ".tar.bz2": "r:bz2", ".tar.xz": "r:xz"}


def find_tar_files(repo_dir):
    """Return the paths of all files under ``repo_dir`` with a recognised tar suffix.

    The tree is walked recursively; a directory that does not exist yields an
    empty list.
    """
    suffixes = tuple(TAR_MODES)
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(repo_dir)
        for name in files
        if name.endswith(suffixes)
    ]


def _ensure_members_inside(tar_ref, extract_dir):
    """Reject archives whose member names resolve outside ``extract_dir``.

    Best-effort fallback for Python versions without tarfile extraction
    filters; it checks member names only, not link targets.
    """
    base = os.path.realpath(extract_dir)
    for member in tar_ref.getmembers():
        target = os.path.realpath(os.path.join(base, member.name))
        if os.path.commonpath([base, target]) != base:
            raise tarfile.TarError(
                f"member {member.name!r} would be extracted outside {extract_dir}"
            )


def extract_tar_file(tar_file):
    """Extract one archive into a sibling directory named after it.

    ``pkg-1.0.tar.gz`` is unpacked into ``pkg-1.0`` next to the archive. The
    archives come from the internet, so extraction goes through tarfile's
    ``data`` filter, which refuses members that would land outside the
    destination directory.

    Args:
        tar_file: Path to a ``.tar.gz``, ``.tar.bz2`` or ``.tar.xz`` file.

    Returns:
        True if the archive was extracted, False if it was skipped or failed.
        The outcome is also reported with ``print``.
    """
    suffix = next((s for s in TAR_MODES if tar_file.endswith(s)), None)
    if suffix is None:
        print(f"Failed to extract {tar_file}: unsupported archive suffix")
        return False

    extract_dir = tar_file[:-len(suffix)]
    try:
        with tarfile.open(tar_file, TAR_MODES[suffix]) as tar_ref:
            if hasattr(tarfile, "data_filter"):
                tar_ref.extractall(extract_dir, filter="data")
            else:
                _ensure_members_inside(tar_ref, extract_dir)
                tar_ref.extractall(extract_dir)
    except (tarfile.TarError, OSError, EOFError) as e:
        # TarError covers read, compression and filter errors; OSError and
        # EOFError cover truncated or corrupt streams found mid-extraction.
        # One bad archive must not stop the remaining ones.
        print(f"Failed to extract {tar_file}: {e}")
        return False

    print(f"Extracted {tar_file} to {extract_dir}")
    return True


repo_dir = "/mnt/volume_nyc1_01/benignPyPI/"
# Find all tar archives in the repository before extracting anything, so
# archives unpacked during this run are not picked up by the same run.
tar_files = find_tar_files(repo_dir)

# Unpack each archive next to where it was found.
for tar_file in tar_files:
    extract_tar_file(tar_file)

print("Extraction process completed.")