# Imports

In [1]:
import os
import numpy as np
import subprocess
import pandas as pd
import requests
from urllib.parse import urljoin
import sqlite3
import json
import time
from datetime import datetime, timedelta
import re
from google.colab import drive
import random

# Mount Google Drive at /content/gdrive/ (with force_remount=True) and make the shared
# project folder the working directory, so the relative paths used later in the notebook
# (final_packages.txt, download_history_<idx>.db) resolve inside that folder.
drive.mount('/content/gdrive/', force_remount=True)
os.chdir("/content/gdrive/Shareddrives/ECS 260/final")

Mounted at /content/gdrive/


# Function Definitions

In [2]:
def GET(json_obj, keys):
  """Safely walk a nested JSON-like structure and return the value found.

  Args:
    json_obj: Nested combination of dicts / lists (e.g. a decoded JSON reply).
    keys: A single key, or a list of keys applied one after another. While the
      current node is a list, the key is converted with int() and used as an index.

  Returns:
    The value at the end of the key path, or None when any step cannot be
    followed (missing key, index out of range, non-subscriptable node, or a
    key that cannot be converted to a list index).
  """
  if not isinstance(keys, list):
    keys = [keys]
  current = json_obj
  try:
    for key in keys:
      if isinstance(current, list):
        key = int(key)
      current = current[key]
    return current
  # ValueError covers int("name") when a non-numeric key is applied to a list;
  # without it the "safe" getter would raise instead of returning None.
  except (TypeError, IndexError, KeyError, ValueError):
    return None

def isValid(value):
  """Tell whether `value` carries usable content.

  A list is valid when it has at least one element; any other value is valid
  when it is neither None nor the empty string.
  """
  if not isinstance(value, list):
    return value != None and value != ""
  return len(value) > 0

def getN(arr):
  """Return len(arr) when isValid(arr) holds, otherwise None."""
  if isValid(arr):
    return len(arr)
  return None

def extract_github_url(repo_url):
  """Convert a GitHub clone URL into its https://github.com/<owner>/<repo> form.

  Accepted inputs start with `https://github.com/`, `git://github.com/` or
  `git@github.com:` and contain `<owner>/<repo>.git`; anything after `.git`
  is ignored because only the start of the string is anchored.

  Returns:
    The normalised https URL, or None when `repo_url` fails isValid() or
    matches none of the accepted forms.
  """
  if not isValid(repo_url):
    return None

  # Every pattern captures "<owner>/<repo>" in group 1.
  clone_url_patterns = (
    r'https://github\.com/([^/]+/[^/]+)\.git',
    r'git://github\.com/([^/]+/[^/]+)\.git',
    r'git@github\.com:([^/]+/[^/]+)\.git',
  )
  for clone_url_pattern in clone_url_patterns:
    found = re.match(clone_url_pattern, repo_url)
    if found:
      return f"https://github.com/{found.group(1)}"
  return None

In [3]:
# fetch json from an API endpoint
def fetch_response_without_fail(url, params = {}, headers = {}, shouldScrap = False):
  """GET `url`, retrying on failure, and return a `(status, payload)` tuple.

  Args:
    url: Address to request.
    params: Query parameters forwarded to requests.get (never mutated here).
    headers: HTTP headers forwarded to requests.get (never mutated here).
    shouldScrap: When True, a 200 returns the raw response object instead of
      the decoded JSON body.

  Returns:
    ("200", data) on success; ("404", None) immediately on a 404 (no retry).
    After 15 failed attempts: (last_status, None), where last_status is the
    last HTTP status code as a string or an "Exception occurred: ..." message.
  """
  retry_count = 0
  max_retries = 15
  time_interval = 10
  status = ""
  while retry_count < max_retries:
    try:
      response = requests.get(url, params=params, headers=headers)
      status = str(response.status_code)

      if response.status_code == 200:
        if shouldScrap:
          return (status, response)
        else:
          return (status, response.json())
      elif response.status_code == 404:
        print(f"Error: {response.status_code}, skipping url {url}")
        return (status, None)
      else:
        print(f"Error: {response.status_code}")
        print(params)
        retry_count += 1

        wait_seconds = time_interval
        if response.status_code == 429:
          # Retry-After arrives as a header *string*. Convert it before doing
          # arithmetic; if it is absent or not an integer (e.g. an HTTP-date),
          # fall back to the fixed interval instead of raising.
          retry_after = response.headers.get('Retry-After')
          try:
            wait_seconds = max(int(retry_after), 0) + 5
          except (TypeError, ValueError):
            wait_seconds = time_interval

        print(f"Retrying in {wait_seconds} seconds... (Retry {retry_count}/{max_retries})")
        time.sleep(wait_seconds)

    except Exception as e:
      # Network errors and JSON decoding failures both land here and count as one retry.
      status = f"Exception occurred: {e}"
      print(status)
      retry_count += 1
      print(f"Retrying in {time_interval} seconds... (Retry {retry_count}/{max_retries})")
      time.sleep(time_interval)

  print(f"Max retries reached. Unable to fetch data from {url}.")
  return (status, None)

# Distribution

In [4]:
# Read the JSON-encoded list of package names straight from the file handle.
with open('final_packages.txt', "r") as file:
  final_packages = json.load(file)

In [5]:
# 1-based number of the partition this notebook instance should mine
idx = 1

# final_packages.txt holds a JSON dump of the list of npm package names to be mined
with open('final_packages.txt', "r") as file:
  final_packages = json.load(file)

# number of partitions the mining work is split into
total_parts = 6
i = idx - 1
part_size = len(final_packages) // total_parts
if i < total_parts - 1:
  all_packages = final_packages[i * part_size:(i + 1) * part_size]
else:
  # the last partition also takes whatever is left over by the integer division
  all_packages = final_packages[i * part_size:len(final_packages)]
print("Total rows to be mined: ", len(all_packages))

Total rows to be mined:  5000


In [6]:
# Month-start timestamps covering May 2018 through January 2024 (both inclusive)
start_date = datetime(year=2018, month=5, day=1)
end_date = datetime(year=2024, month=1, day=1)
date_range = pd.date_range(start_date, end_date, freq='MS')

# DB Init

In [7]:
# Each partition index gets its own SQLite file.
conn = sqlite3.connect(f"download_history_{idx}.db")
cursor = conn.cursor()

# One INTEGER column per month in date_range, named m_YYYYMM.
month_column_defs = [f"m_{month.strftime('%Y%m')} INTEGER" for month in date_range]
column_names = ", ".join(month_column_defs)
sql_query = f"""
        CREATE TABLE IF NOT EXISTS download_data (
            package TEXT,
            {column_names},
            average INTEGER
        )
    """

# The connection context manager commits once the statement succeeds.
with conn:
  cursor.execute(sql_query)

In [8]:
# Load whatever has been mined so far into a DataFrame and display it.
existing_rows_query = "SELECT * FROM download_data"
df = pd.read_sql_query(existing_rows_query, conn)
df

Unnamed: 0,package,m_201805,m_201806,m_201807,m_201808,m_201809,m_201810,m_201811,m_201812,m_201901,...,m_202305,m_202306,m_202307,m_202308,m_202309,m_202310,m_202311,m_202312,m_202401,average


# Mining

In [None]:
# Work out which packages still need mining so an interrupted run can resume.
# Only the package name is needed, so avoid pulling every monthly column.
cursor.execute("SELECT package FROM download_data")
rows = cursor.fetchall()
packages_done = [row[0] for row in rows]

# Keep the order of all_packages (dropping duplicates) instead of relying on set
# iteration order, which changes between kernel restarts for strings.
packages_done_lookup = set(packages_done)
package_list = [package for package in dict.fromkeys(all_packages) if package not in packages_done_lookup]
print(f"Rows done so far\t{len(rows)}/{len(all_packages)}")

Rows done so far	0/5000


In [None]:
def _split_by_scope(packages):
  """Partition package names into (scoped, non_scoped); scoped names start with '@'."""
  scoped, non_scoped = [], []
  for name in packages:
    target = scoped if name.startswith('@') else non_scoped
    target.append(name)
  return scoped, non_scoped

# remaining work (package_list) versus the whole partition (all_packages)
scoped_packages, non_scoped_packages = _split_by_scope(package_list)
scoped_package_list, non_scoped_package_list = _split_by_scope(all_packages)

print(f"Scoped packages to mine\t\t{len(scoped_packages)}/{len(scoped_package_list)}")
print(f"Non scoped packages to mine\t{len(non_scoped_packages)}/{len(non_scoped_package_list)}")

Scoped packages to mine		1673/1673
Non scoped packages to mine	3327/3327


## Bulk Mining

In [None]:
# bulk fetch for non-scoped packages
# Each request asks the npm "point" endpoint for one month of downloads for a whole
# comma-separated batch of package names; results are then written one row per package.
rows_fetched = 0
bulk_batch_size = 128
# Column names are the same for every batch, so build them once.
month_keys = [f"m_{month.strftime('%Y%m')}" for month in date_range]
for i in range(0, len(non_scoped_packages), bulk_batch_size):
  batch = non_scoped_packages[i:i + bulk_batch_size]
  bulk_package_list = ",".join(batch)
  api_call_start_time = time.time()
  # Size the buffers by the real batch length (the last batch may be shorter).
  total_counts = [None] * len(batch)
  bulk_package_data = {month_key: [None] * len(batch) for month_key in month_keys}

  for month, month_key in zip(date_range, month_keys):
    start_date_str = month.strftime("%Y-%m-%d")
    end_date_str = (month + pd.DateOffset(months=1) - timedelta(days=1)).strftime("%Y-%m-%d")

    bulk_download_api_url = f"https://api.npmjs.org/downloads/point/{start_date_str}:{end_date_str}/{bulk_package_list}"
    (bulk_response_status, bulk_response_data) = fetch_response_without_fail(bulk_download_api_url)

    if bulk_response_status == '200':
      # A batch with a single name gets the flat single-package reply
      # ({"downloads": ..., "package": name}) rather than a dict keyed by name.
      # Re-key it so the lookup below works for both shapes.
      if len(batch) == 1 and GET(bulk_response_data, "package") == batch[0]:
        bulk_response_data = {batch[0]: bulk_response_data}

      for package_idx, package in enumerate(batch):
        download_count = GET(bulk_response_data, [package, "downloads"])
        if download_count is not None:
          bulk_package_data[month_key][package_idx] = download_count

          if total_counts[package_idx] is None:
            total_counts[package_idx] = download_count
          else:
            total_counts[package_idx] += download_count
    else:
      print(f"Failed to fetch data for {bulk_package_list} in {start_date_str}")

  # Count what was actually processed so the progress line is right for the last batch.
  rows_fetched += len(batch)
  api_call_end_time = time.time()
  elapsed_api_call_time = api_call_end_time - api_call_start_time
  print("Rows fetched in " + str(elapsed_api_call_time) + "\tseconds")

  db_update_start_time = time.time()
  for package_idx, package in enumerate(batch):
    package_data = {"package": package}
    valid_data_len = 0
    for month_key in month_keys:
      package_data[month_key] = bulk_package_data[month_key][package_idx]
      if package_data[month_key] is not None:
        valid_data_len += 1
    # The average is taken over months that returned a value; it stays None when no month did.
    package_data["average"] = int(total_counts[package_idx] / valid_data_len) if total_counts[package_idx] is not None else None
    columns = ", ".join(package_data.keys())
    placeholders = ", ".join("?" for _ in package_data)
    sql_query = f"""
        INSERT INTO download_data ({columns})
        VALUES ({placeholders})
    """
    cursor.execute(sql_query, tuple(package_data.values()))
  # One commit per batch instead of one per row.
  conn.commit()
  db_update_end_time = time.time()
  elapsed_db_update_time = db_update_end_time - db_update_start_time

  print("==============================================================================")
  print("Database updated in " + str(elapsed_db_update_time) + " seconds")
  print(f"Rows fetched so far: {rows_fetched}/{len(non_scoped_packages)}")
  print("==============================================================================")
  print()
  print()

Rows fetched in 20.14516592025757	seconds
Database updated in 1.4343268871307373 seconds
Rows fetched so far: 128/3327


Rows fetched in 20.11069631576538	seconds
Database updated in 1.4965219497680664 seconds
Rows fetched so far: 256/3327


Rows fetched in 20.493934392929077	seconds
Database updated in 1.5455455780029297 seconds
Rows fetched so far: 384/3327


Rows fetched in 20.57086682319641	seconds
Database updated in 1.6286582946777344 seconds
Rows fetched so far: 512/3327


Rows fetched in 20.44608426094055	seconds
Database updated in 1.6562221050262451 seconds
Rows fetched so far: 640/3327


Rows fetched in 20.42780566215515	seconds
Database updated in 1.717111349105835 seconds
Rows fetched so far: 768/3327


Rows fetched in 20.94654870033264	seconds
Database updated in 1.7458183765411377 seconds
Rows fetched so far: 896/3327


Rows fetched in 20.71972346305847	seconds
Database updated in 1.8308556079864502 seconds
Rows fetched so far: 1024/3327


Rows fetched in 20.743690252304

## Independent mining

In [None]:
def _month_bounds(month_start):
  """Return the ("YYYY-MM-DD", "YYYY-MM-DD") first-day / last-day strings of a month."""
  month_end = month_start + pd.DateOffset(months=1) - timedelta(days=1)
  return (month_start.strftime("%Y-%m-%d"), month_end.strftime("%Y-%m-%d"))

full_date_range = list(map(_month_bounds, date_range))

# Group consecutive months into chunks of chunk_size and keep, for each chunk, only
# the first day of its first month and the last day of its last month.
chunk_size = 17
date_range_chunks = [
  (
    full_date_range[chunk_start][0],
    full_date_range[min(chunk_start + chunk_size, len(full_date_range)) - 1][1],
  )
  for chunk_start in range(0, len(full_date_range), chunk_size)
]

In [None]:
# Per-package fetch for scoped packages: daily counts are requested from the npm
# "range" endpoint one date chunk at a time and summed into monthly m_YYYYMM totals.
rows_fetched = 0
for package in scoped_packages:
  package_data = {"package": package}

  api_call_start_time = time.time()
  for date_range_chunk in date_range_chunks:
    start_date_str = date_range_chunk[0]
    end_date_str = date_range_chunk[1]

    download_api_url = f"https://api.npmjs.org/downloads/range/{start_date_str}:{end_date_str}/{package}"
    (response_status, response_data) = fetch_response_without_fail(download_api_url)

    if response_status == '200':
      # GET returns None when the reply has no "downloads" list; treat that as
      # "no days" rather than crashing the whole run on the for-loop.
      download_counts = GET(response_data, "downloads") or []
      for download_count in download_counts:
        curr_month = download_count['day'][:7]
        curr_downloads = download_count['downloads']
        curr_key = f"m_{curr_month.replace('-', '')}"
        package_data[curr_key] = package_data.get(curr_key, 0) + curr_downloads
    else:
      print(f"Failed to fetch data for {package} in {start_date_str}-{end_date_str}")

  rows_fetched += 1
  api_call_end_time = time.time()
  elapsed_api_call_time = api_call_end_time - api_call_start_time
  print("Rows fetched in " + str(int(elapsed_api_call_time)) + "\tseconds")

  # Average over the months that actually have a value; None when there are none.
  total_count = None
  valid_data_len = 0
  for month in date_range:
    download_count = package_data.get(f"m_{month.strftime('%Y%m')}", None)
    if download_count is not None:
      valid_data_len += 1
      if total_count is None:
        total_count = download_count
      else:
        total_count += download_count
  package_data["average"] = int(total_count / valid_data_len) if total_count is not None else None
  db_update_start_time = time.time()

  # Months missing from package_data are simply omitted from the INSERT and stay NULL.
  columns = ", ".join(package_data.keys())
  placeholders = ", ".join("?" for _ in package_data)
  sql_query = f"""
    INSERT INTO download_data ({columns})
    VALUES ({placeholders})
  """
  cursor.execute(sql_query, tuple(package_data.values()))
  conn.commit()

  db_update_end_time = time.time()
  elapsed_db_update_time = db_update_end_time - db_update_start_time
  print("==============================================================================")
  print("Database updated in " + str(int(elapsed_db_update_time)) + " seconds")
  print(f"Rows fetched so far: {rows_fetched}/{len(scoped_packages)}")
  print("==============================================================================")
  print()
  print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Rows fetched in 1	seconds
{'package': '@base-framework/atoms', 'm_201805': 0, 'm_201806': 0, 'm_201807': 0, 'm_201808': 0, 'm_201809': 0, 'm_201810': 0, 'm_201811': 0, 'm_201812': 0, 'm_201901': 0, 'm_201902': 0, 'm_201903': 0, 'm_201904': 0, 'm_201905': 0, 'm_201906': 0, 'm_201907': 0, 'm_201908': 0, 'm_201909': 0, 'm_201910': 0, 'm_201911': 0, 'm_201912': 0, 'm_202001': 0, 'm_202002': 0, 'm_202003': 0, 'm_202004': 0, 'm_202005': 0, 'm_202006': 0, 'm_202007': 0, 'm_202008': 0, 'm_202009': 0, 'm_202010': 0, 'm_202011': 0, 'm_202012': 0, 'm_202101': 0, 'm_202102': 0, 'm_202103': 0, 'm_202104': 0, 'm_202105': 0, 'm_202106': 0, 'm_202107': 0, 'm_202108': 0, 'm_202109': 0, 'm_202110': 0, 'm_202111': 0, 'm_202112': 0, 'm_202201': 0, 'm_202202': 0, 'm_202203': 0, 'm_202204': 0, 'm_202205': 0, 'm_202206': 0, 'm_202207': 0, 'm_202208': 0, 'm_202209': 0, 'm_202210': 0, 'm_202211': 0, 'm_202212': 0, 'm_202301': 0, 'm_202302': 0, 

In [None]:
# Close the SQLite connection that was opened in the "DB Init" section.
conn.close()