In [1]:
import ast
import csv
import json
import os
import sys
import time
from urllib.parse import quote

import matplotlib.pyplot as plt
import pandas as pd
import requests

In [None]:
def get_top_npm_packages(limit=5000, filename="top_5000_npm_packages.csv"):
    """
    Fetch the most-downloaded npm packages from the Hugging Face datasets-server
    API and save them to a CSV file with all available data fields.

    Rows of the ``deepklarity/top-npm-packages`` dataset are requested 100 at a
    time, sorted by ``weekly_downloads`` in descending order (a missing or null
    count sorts as 0) and written to ``filename``. Progress and errors are
    printed to stderr. Request failures and the processing errors listed in the
    ``except`` clauses are reported rather than raised; in that case no file is
    written.

    Args:
        limit (int): Maximum number of packages to fetch and save.
        filename (str): Path of the CSV file to write.

    Returns:
        None
    """
    page_size = 100
    base_url = (
        "https://datasets-server.huggingface.co/rows"
        "?dataset=deepklarity%2Ftop-npm-packages&config=default&split=train"
        f"&length={page_size}"
    )
    all_packages = []

    try:
        # One request per page: limit / page_size requests (50 with the defaults).
        print("Fetching package data...", file=sys.stderr)
        for offset in range(0, limit, page_size):
            url = f"{base_url}&offset={offset}"

            # The timeout keeps a stalled connection from hanging the cell;
            # requests' Timeout is a RequestException and is handled below.
            response = requests.get(url, timeout=30)
            response.raise_for_status()  # Raise an exception for bad status codes

            rows = response.json().get('rows', [])
            if not rows:
                # Presumably past the end of the dataset, so stop instead of
                # issuing the remaining requests.
                break

            # Each item wraps the actual package record under its 'row' key.
            all_packages.extend(item['row'] for item in rows if 'row' in item)

            print(f"Fetched {len(all_packages)} packages...", file=sys.stderr)
            # A small delay to be respectful to the API
            time.sleep(0.1)

        if not all_packages:
            print("No packages were found.", file=sys.stderr)
            return

        # `or 0` also covers an explicit null count, which would otherwise make
        # the sort compare None with int and raise TypeError.
        top_packages = sorted(
            all_packages,
            key=lambda pkg: pkg.get('weekly_downloads') or 0,
            reverse=True,
        )[:limit]

        # Union of every record's keys (first-seen order) so that a field
        # absent from the first record is not silently dropped.
        headers = list(dict.fromkeys(key for pkg in top_packages for key in pkg))

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            # restval='' leaves the cell empty when a record lacks a field.
            writer = csv.DictWriter(csvfile, fieldnames=headers, restval='')
            writer.writeheader()
            writer.writerows(top_packages)

        print(f"\nSuccessfully saved data for {len(top_packages)} packages to {filename}", file=sys.stderr)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}", file=sys.stderr)
    except (ValueError, KeyError, IndexError) as e:
        print(f"Error processing data: {e}", file=sys.stderr)

In [33]:
# Download the package list and write top_5000_npm_packages.csv, which the later cells read.
get_top_npm_packages()

Fetching package data...
Fetched 100 packages...
Fetched 200 packages...
Fetched 300 packages...
Fetched 400 packages...
Fetched 500 packages...
Fetched 600 packages...
Fetched 700 packages...
Fetched 800 packages...
Fetched 900 packages...
Fetched 1000 packages...
Fetched 1100 packages...
Fetched 1200 packages...
Fetched 1300 packages...
Fetched 1400 packages...
Fetched 1500 packages...
Fetched 1600 packages...
Fetched 1700 packages...
Fetched 1800 packages...
Fetched 1900 packages...
Fetched 2000 packages...
Fetched 2100 packages...
Fetched 2200 packages...
Fetched 2300 packages...
Fetched 2400 packages...
Fetched 2500 packages...
Fetched 2600 packages...
Fetched 2700 packages...
Fetched 2800 packages...
Fetched 2900 packages...
Fetched 3000 packages...
Fetched 3100 packages...
Fetched 3200 packages...
Fetched 3300 packages...
Fetched 3400 packages...
Fetched 3500 packages...
Fetched 3600 packages...
Fetched 3700 packages...
Fetched 3800 packages...
Fetched 3900 packages...
Fetched 4

In [37]:
def get_package_dependencies(package_name):
    """
    Look up a package on the npms.io API and list the packages it depends on.

    Every failure (a non-200 response, a timeout, or any other exception) is
    printed and reported as a pair of empty lists instead of being raised.

    Args:
        package_name (str): Name of the npm package

    Returns:
        tuple: (dependencies_list, devDependencies_list)
    """
    def names_of(mapping):
        # A falsy value (missing, null or empty mapping) yields no names.
        return list(mapping.keys()) if mapping else []

    try:
        # safe='' percent-encodes everything, including the "@" and "/" found
        # in scoped package names.
        endpoint = "https://api.npms.io/v2/package/" + quote(package_name, safe='')
        reply = requests.get(endpoint, timeout=10)

        # Guard clause: anything other than HTTP 200 is a failed lookup.
        if reply.status_code != 200:
            print(f"  Error: HTTP {reply.status_code}")
            return ([], [])

        # The dependency maps are read from collected.metadata in the payload.
        package_meta = reply.json().get('collected', {}).get('metadata', {})
        runtime_names = names_of(package_meta.get('dependencies', {}))
        dev_names = names_of(package_meta.get('devDependencies', {}))
        return (runtime_names, dev_names)

    except requests.exceptions.Timeout:
        print("  Error: Timeout")
        return ([], [])
    except Exception as err:
        print(f"  Error: {str(err)}")
        return ([], [])

In [38]:
# Look up the dependency lists of every package in the CSV and store them in
# two new columns. The file is re-saved after each package so an interrupted
# run can be resumed by re-running this cell.
CSV_PATH = 'top_5000_npm_packages.csv'
REQUEST_DELAY_SECONDS = 0.5  # pause between lookups, to be respectful to the API

df = pd.read_csv(CSV_PATH)

# First run: create the result columns. Resumed run: they are already in the CSV.
if 'dependencies_list' not in df.columns:
    df['dependencies_list'] = None
if 'devDependencies_list' not in df.columns:
    df['devDependencies_list'] = None

total = len(df)

# enumerate() supplies the 1-based progress counter and the last-row check, so
# neither depends on the index labels being 0..n-1.
for position, idx in enumerate(df.index):
    package_name = df.loc[idx, 'package_name']

    # Skip if already processed (in case of resuming after timeout).
    # NOTE(review): get_package_dependencies returns ([], []) on failure, so a
    # failed lookup is stored as "[]" and is skipped here too; it is never
    # retried on resume.
    stored = df.loc[idx, 'dependencies_list']
    if pd.notna(stored) and stored != '':
        print(f"Skipping {position + 1}/{total}: {package_name} (already processed)")
        continue

    print(f"Processing {position + 1}/{total}: {package_name}")

    # Get dependencies
    deps_list, dev_deps_list = get_package_dependencies(package_name)

    # Lists are stored as their string repr so they survive the CSV round trip.
    df.loc[idx, 'dependencies_list'] = str(deps_list)
    df.loc[idx, 'devDependencies_list'] = str(dev_deps_list)

    # Checkpoint after every package. Write to a temporary file and swap it in
    # with os.replace so an interrupt in the middle of the write cannot
    # truncate the only copy of the data.
    tmp_path = CSV_PATH + '.tmp'
    df.to_csv(tmp_path, index=False)
    os.replace(tmp_path, CSV_PATH)

    # Don't sleep after the last item
    if position < total - 1:
        time.sleep(REQUEST_DELAY_SECONDS)

print("\nProcessing complete!")
print(f"Results saved to {CSV_PATH}")

Processing 1/5000: ansi-styles
Processing 2/5000: supports-color
Processing 3/5000: semver
Processing 4/5000: chalk
Processing 5/5000: debug
Processing 6/5000: has-flag
Processing 7/5000: color-convert
Processing 8/5000: color-name
Processing 9/5000: tslib
Processing 10/5000: ms
Processing 11/5000: minimatch
Processing 12/5000: strip-ansi
Processing 13/5000: lru-cache
Processing 14/5000: type-fest
Processing 15/5000: ansi-regex
Processing 16/5000: source-map
Processing 17/5000: glob
Processing 18/5000: commander
Processing 19/5000: readable-stream
Processing 20/5000: string-width
Processing 21/5000: brace-expansion
Processing 22/5000: escape-string-regexp
Processing 23/5000: wrap-ansi
Processing 24/5000: find-up
Processing 25/5000: yallist
Processing 26/5000: p-locate
Processing 27/5000: locate-path
Processing 28/5000: p-limit
Processing 29/5000: emoji-regex
Processing 30/5000: safe-buffer
Processing 31/5000: uuid
Processing 32/5000: minipass
Processing 33/5000: @types/node
Processing 

In [None]:
def count_listed_names(cell):
    """
    Count the entries of a list that was stored in the CSV as its string repr.

    Args:
        cell: A CSV cell value such as "['a', 'b']"; may be NaN or ''.

    Returns:
        int: Number of entries in the list, or 0 for a blank/NaN cell.
    """
    if pd.isna(cell) or cell == '':
        return 0
    # ast.literal_eval only parses Python literals, so unlike eval() it cannot
    # execute code that ends up in the CSV file.
    return len(ast.literal_eval(cell))


# Reload the checkpointed data and derive per-package dependency counts.
df = pd.read_csv('top_5000_npm_packages.csv')
df['num_dependencies'] = df['dependencies_list'].apply(count_listed_names)
df['num_devDependencies'] = df['devDependencies_list'].apply(count_listed_names)

df['total_dependencies'] = df['num_dependencies'] + df['num_devDependencies']

      package_name  num_dependencies  num_devDependencies  total_dependencies  \
0      ansi-styles                 0                    4                   4   
1   supports-color                 0                    6                   6   
2           semver                 1                    3                   4   
3            chalk                 0                   10                  10   
4            debug                 1                   11                  12   
..             ...               ...                  ...                 ...   
95          globby                 5                   10                  15   
96      picocolors                 0                    9                   9   
97            mime                 0                    9                   9   
98            y18n                 0                   12                  12   
99    @babel/types                 0                    0                   0   

    num_devDependencies  
0