# Imports

In [None]:
import os
import numpy as np
import subprocess
import pandas as pd
import requests
from urllib.parse import urljoin
import sqlite3
import json
import time

In [None]:
from google.colab import drive

# Mount Google Drive (forcing a fresh mount) and switch the working directory
# to the shared project folder, so relative paths used later resolve there.
drive.mount(mountpoint='/content/gdrive/', force_remount=True)
os.chdir("/content/gdrive/Shareddrives/ECS 260")

Mounted at /content/gdrive/


# DB Init

In [None]:
# Open the SQLite database that holds the mined package names. The path is
# relative, so the file lives in the current working directory.
conn = sqlite3.connect('npm_names.db')
cursor = conn.cursor()

# Single-column table of package names. IF NOT EXISTS makes this cell safe to
# re-run against a database that already contains the table.
CREATE_NAMES_TABLE_SQL = '''
    CREATE TABLE IF NOT EXISTS npm_package_names (
        package TEXT
    )
'''
cursor.execute(CREATE_NAMES_TABLE_SQL)
conn.commit()

# Function Definitions and Initialization

In [None]:
def fetch_response_with_retry(url, limit, skip, startkey='', max_retries=3,
                              retry_delay=5, timeout=60):
    """GET one page of JSON from ``url``, retrying when the request fails.

    Args:
        url: Endpoint to request.
        limit: Value sent as the ``limit`` query parameter.
        skip: Value sent as the ``skip`` query parameter.
        startkey: Value sent as the ``startkey_docid`` query parameter.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between two attempts.
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            connection cannot block the caller indefinitely.

    Returns:
        The decoded JSON body of the first response with status code 200, or
        ``None`` when every attempt failed (or ``max_retries`` is 0).
    """
    # The query parameters do not change between attempts, so build them once.
    params = {
        'limit': limit,
        'skip': skip,
        'startkey_docid': startkey,
        # 'include_docs': 'true'  # include package metadata in the response
    }

    attempt = 0
    while attempt < max_retries:
        attempt += 1
        try:
            response = requests.get(url, params=params, timeout=timeout)

            if response.status_code == 200:
                return response.json()

            print(f"Error: {response.status_code}")
            print(params)

        except Exception as e:
            # Deliberately broad: any failure (network error, timeout, invalid
            # JSON body) is treated as a failed attempt and retried.
            print(f"Exception occurred: {e}")

        # Only announce and wait for a retry if another attempt will follow.
        if attempt < max_retries:
            print(f"Retrying in {retry_delay} seconds... (Retry {attempt}/{max_retries})")
            time.sleep(retry_delay)

    print(f"Max retries reached. Unable to fetch data from {url}.")
    return None


In [None]:
# Build the full URL of the "_all_docs" endpoint on the npm replication host;
# this is the URL handed to the paging helper.
npm_base = 'https://replicate.npmjs.com/'
all_endpoint = "_all_docs"
url = urljoin(base=npm_base, url=all_endpoint)

In [2]:
# Count the package names already stored; the mining loop continues its
# running total from this value.
(rows_fetched,) = cursor.execute('SELECT COUNT(*) FROM npm_package_names').fetchone()
print(f"Names mined so far: {rows_fetched}")

Names mined so far: 1230000


In [3]:
# Find the greatest stored package name (by SQLite's ordering of the column);
# it is used as the key from which fetching resumes. Only the top row is
# needed, so LIMIT 1 avoids producing the full sorted result set.
cursor.execute('''
    SELECT package FROM npm_package_names
    ORDER BY package DESC
    LIMIT 1
''')

last_row = cursor.fetchone()
# fetchone() returns None when the table is empty; use None as the key in that
# case instead of raising TypeError on last_row[0].
last_row_key = last_row[0] if last_row is not None else None
print(f"Last package mined --> {last_row_key}")

Last package mined --> discord-minimal


In [None]:
# Hard-coded package count used to derive the stopping point below.
total_packages_available = 2_682_257
# The mining loop runs while rows_fetched is below this value, i.e. until just
# over half of the count above has been stored.
# NOTE(review): presumably the remaining half is mined elsewhere — confirm.
total_rows = (total_packages_available / 2) + 1
# Number of rows requested per call (sent as the `limit` parameter).
rows_per_page = 250

In [None]:
# Fetch a single page starting at the last stored key, without skipping any
# row, so the response can be inspected before mining resumes.
page_json_response = fetch_response_with_retry(
    url,
    limit=rows_per_page,
    skip=0,
    startkey=last_row_key,
)

In [None]:
# Display the first row of the fetched page; its key is expected to equal
# last_row_key, which is why the mining loop requests pages with skip=1.
page_json_response["rows"][0]

{'id': 'discord-minimal',
 'key': 'discord-minimal',
 'value': {'rev': '72-6b78b945880a0d6dca55602b2854d15e'}}

# Mining

In [None]:
# Resume paging through the endpoint from the last stored key and append every
# returned key to npm_package_names until total_rows names have been stored.

# skip=1 drops the first returned row: the page starts at last_row_key itself,
# which is already in the database.
curr_offset = 1 if last_row_key else 0

# fetch_response_with_retry already retries each page; stop the whole loop
# after this many pages in a row come back empty-handed instead of spinning
# forever when the endpoint stays unreachable.
max_consecutive_failures = 5
consecutive_failures = 0

if last_row_key:
  while rows_fetched < total_rows:
    api_call_start_time = time.time()
    page_json_response = fetch_response_with_retry(url, rows_per_page, curr_offset, last_row_key)
    elapsed_api_call_time = time.time() - api_call_start_time

    if not page_json_response:
      consecutive_failures += 1
      if consecutive_failures >= max_consecutive_failures:
        print(f"Stopping: {consecutive_failures} consecutive pages could not be fetched.")
        break
      continue
    consecutive_failures = 0

    # A response without rows means there is nothing left to store; indexing
    # rows[-1] on it would raise, so stop cleanly.
    rows = page_json_response.get('rows', [])
    if not rows:
      print("No more rows returned; stopping.")
      break

    db_update_start_time = time.time()

    packages = [(row['key'],) for row in rows]
    # updating db
    cursor.executemany('''
              INSERT INTO npm_package_names (package)
              VALUES (?)
          ''', packages)
    conn.commit()

    elapsed_db_update_time = time.time() - db_update_start_time

    # Advance the resume state only after the page is committed, so a failed
    # insert cannot make a re-run of this cell skip a page. Count the rows
    # actually returned; a page may hold fewer than rows_per_page.
    rows_fetched += len(rows)
    curr_offset = 1
    last_row_key = rows[-1]['key']

    print(f"Rows fetched: {rows_fetched}\tin {elapsed_api_call_time}\tseconds")
    print(f"Dataframe and database updated in {elapsed_db_update_time} seconds")
    print()

Rows fetched: 1230250	in 20.583998680114746	seconds
Dataframe and database updated in 0.03553342819213867 seconds

Rows fetched: 1230500	in 14.896684408187866	seconds
Dataframe and database updated in 0.021087646484375 seconds

Rows fetched: 1230750	in 20.776349544525146	seconds
Dataframe and database updated in 0.01895308494567871 seconds

Rows fetched: 1231000	in 13.225515127182007	seconds
Dataframe and database updated in 0.020948410034179688 seconds

Rows fetched: 1231250	in 20.10891318321228	seconds
Dataframe and database updated in 0.01836228370666504 seconds

Rows fetched: 1231500	in 18.559289932250977	seconds
Dataframe and database updated in 0.034837961196899414 seconds

Rows fetched: 1231750	in 6.800427675247192	seconds
Dataframe and database updated in 0.018511056900024414 seconds

Rows fetched: 1232000	in 4.994453191757202	seconds
Dataframe and database updated in 0.02300262451171875 seconds

Rows fetched: 1232250	in 8.093596696853638	seconds
Dataframe and database updated 

In [None]:
# Close the SQLite connection (`conn`) once mining is finished; `cursor`
# belongs to this connection and must not be used afterwards.
conn.close()