# Imports

In [None]:
import os
import numpy as np
import subprocess
import pandas as pd
import requests
from urllib.parse import urljoin
import sqlite3
import json
import time

In [None]:
from google.colab import drive

# Mount Google Drive at /content/gdrive/; force_remount=True requests a fresh
# mount even if one is already present.
drive.mount('/content/gdrive/', force_remount=True)
# Switch the working directory to the shared-drive project folder so that
# relative paths opened later (such as the SQLite file) resolve inside it.
os.chdir("/content/gdrive/Shareddrives/ECS 260")

Mounted at /content/gdrive/


# DB Init

In [None]:
# Open (or create) the SQLite file that stores the mined package names.
# The path is relative, so it lands in the current working directory.
conn = sqlite3.connect('npm_names_rev.db')
cursor = conn.cursor()

# Ensure the single-column destination table exists; IF NOT EXISTS makes
# this cell safe to re-run against a database that already holds data.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS npm_package_names (
        package TEXT
    )
''')

# Persist the schema change before any rows are written.
conn.commit()

# Function definitions and initialization

In [None]:
def fetch_response_with_retry(url, limit, skip, startkey='', max_retries=3,
                              timeout=30, retry_delay=5):
    """Fetch one page of results from ``url`` as parsed JSON, retrying on failure.

    The request always asks for descending order; ``startkey`` is only sent
    when it is not the empty string.

    Args:
        url: Endpoint to query with an HTTP GET.
        limit: Value of the ``limit`` query parameter.
        skip: Value of the ``skip`` query parameter.
        startkey: Value of the ``startkey_docid`` query parameter; omitted
            from the request when equal to ``''``.
        max_retries: Maximum number of attempts (an integer) before giving up.
        timeout: Seconds to wait on the server per attempt. Without a
            timeout a stalled connection would block the caller forever.
        retry_delay: Seconds to sleep between two attempts.

    Returns:
        The decoded JSON body of the first HTTP 200 response, or ``None``
        when every attempt failed.
    """
    # Build the query once; the key order matches what is printed on errors.
    params = {
        'limit': limit,
        'skip': skip,
        # 'include_docs': 'true'  # include package metadata in the response
    }
    if startkey != '':
        params['startkey_docid'] = startkey
    params['descending'] = 'true'

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, params=params, timeout=timeout)

            if response.status_code == 200:
                return response.json()

            print(f"Error: {response.status_code}")
            print(params)
        except (requests.exceptions.RequestException, ValueError) as e:
            # RequestException covers connection/timeout/HTTP-layer failures;
            # ValueError covers a 200 response whose body is not valid JSON.
            print(f"Exception occurred: {e}")

        # Only announce and wait when another attempt will actually follow.
        if attempt < max_retries:
            print(f"Retrying in {retry_delay} seconds... (Retry {attempt}/{max_retries})")
            time.sleep(retry_delay)

    print(f"Max retries reached. Unable to fetch data from {url}.")
    return None


In [None]:
# Registry replication host and the endpoint that lists every document.
npm_base = 'https://replicate.npmjs.com/'
all_endpoint = "_all_docs"

# Full endpoint URL that the fetch helper is called with.
url = urljoin(base=npm_base, url=all_endpoint)

In [2]:
# Count the names already stored; the mining loop continues from this total.
# Cursor.execute returns the cursor itself, so the single result row can be
# unpacked directly.
(rows_fetched,) = cursor.execute(
    'SELECT COUNT(*) FROM npm_package_names'
).fetchone()
print(f"Names mined so far: {rows_fetched}")

Names mined so far: 1273250


In [4]:
# Find the key to resume from. Pages are requested in descending order, so
# the smallest stored name is the most recently mined one. LIMIT 1 lets
# SQLite return that single row instead of sorting the whole table.
# NOTE(review): SQLite compares TEXT bytewise by default, which may not match
# the registry's own ordering for every name — confirm.
cursor.execute('''
    SELECT package FROM npm_package_names
    ORDER BY package
    LIMIT 1
''')

last_row = cursor.fetchone()
# An empty table yields no row; use None rather than failing on last_row[0],
# since later cells already check last_row_key for truthiness.
last_row_key = last_row[0] if last_row is not None else None
print(f"Last package mined --> {last_row_key}")

Last package mined --> fritzy


In [None]:
# Hard-coded size of the registry used to derive the stop threshold.
total_packages_available = 2_682_257

# Number of rows requested per API call.
rows_per_page = 250

# The mining loop stops once this many names are stored: half of the total
# (a float because of true division).
# NOTE(review): presumably the other half is mined separately — confirm.
total_rows = total_packages_available / 2

In [None]:
# Probe request: fetch the page that starts at the last stored key, without
# skipping any rows.
page_json_response = fetch_response_with_retry(
    url=url,
    limit=rows_per_page,
    skip=0,
    startkey=last_row_key,
)

In [None]:
# Display the first row of the probe page (shown as the notebook cell output).
page_json_response["rows"][0]

{'id': 'fritzy',
 'key': 'fritzy',
 'value': {'rev': '3-14a90b516b342c03e6a1caa48c4f75b2'}}

# Mining

In [None]:
# When resuming, each page starts at `last_row_key`, which is already stored,
# so one row is skipped to avoid inserting it again.
curr_offset = 1 if last_row_key else 0

# NOTE(review): nothing is mined when `last_row_key` is empty, so the
# `else 0` case above is never used by this loop — confirm whether a fresh
# start is meant to be handled here.
if last_row_key:
    while rows_fetched < total_rows:
        api_call_start_time = time.time()
        page_json_response = fetch_response_with_retry(
            url, rows_per_page, curr_offset, last_row_key
        )
        elapsed_api_call_time = time.time() - api_call_start_time

        db_update_start_time = time.time()

        if not page_json_response:
            # The helper already retried and reported the failure; request
            # the same page again on the next iteration.
            continue

        rows = page_json_response['rows']
        if not rows:
            # Nothing left after the current key; indexing rows[-1] would
            # fail, so stop cleanly.
            print("No more rows returned; stopping.")
            break

        packages = [(row['key'],) for row in rows]
        # updating db
        cursor.executemany('''
            INSERT INTO npm_package_names (package)
            VALUES (?)
        ''', packages)
        conn.commit()

        # Advance the resume state only after the page is safely committed,
        # and count the rows actually received (a page may be short).
        rows_fetched += len(rows)
        curr_offset = 1
        last_row_key = rows[-1]['key']

        elapsed_db_update_time = time.time() - db_update_start_time
        print(f"Rows fetched: {rows_fetched}\tin {elapsed_api_call_time}\tseconds")
        print(f"Dataframe and database updated in {elapsed_db_update_time} seconds")
        print()

Rows fetched: 1273500	in 15.546140432357788	seconds
Dataframe and database updated in 0.02107834815979004 seconds

Rows fetched: 1273750	in 13.182391166687012	seconds
Dataframe and database updated in 0.020333290100097656 seconds

Rows fetched: 1274000	in 19.81485867500305	seconds
Dataframe and database updated in 0.018155336380004883 seconds

Rows fetched: 1274250	in 16.590994596481323	seconds
Dataframe and database updated in 0.01592278480529785 seconds

Rows fetched: 1274500	in 11.68268609046936	seconds
Dataframe and database updated in 0.03165793418884277 seconds

Rows fetched: 1274750	in 27.864516258239746	seconds
Dataframe and database updated in 0.02157735824584961 seconds

Rows fetched: 1275000	in 15.32732892036438	seconds
Dataframe and database updated in 0.017379045486450195 seconds

Rows fetched: 1275250	in 9.135756969451904	seconds
Dataframe and database updated in 0.0164639949798584 seconds

Rows fetched: 1275500	in 5.309165000915527	seconds
Dataframe and database updated 

In [None]:
# Close the SQLite connection opened in the DB-init cell once mining is done.
conn.close()