# Imports

In [None]:
import os
import numpy as np
import subprocess
import pandas as pd
import requests
from urllib.parse import urljoin
import sqlite3
import json
import time
from datetime import datetime, timedelta
import re
from google.colab import drive
import random
from bs4 import BeautifulSoup
import csv
from urllib.parse import urlparse

In [None]:
# Mount Google Drive and make the project folder the working directory.
# Adjust `project_dir` to suit your own setup:
#   - the sample_<idx>.txt files must be present in that folder
#   - the database is saved in that folder as well
mount_point = '/content/gdrive/'
project_dir = "/content/gdrive/Shareddrives/ECS 260/final"

drive.mount(mount_point, force_remount=True)
os.chdir(project_dir)

Mounted at /content/gdrive/


# Function Definitions

In [None]:
def _fold_sqlite_identifier(name):
    """Return a comparison key matching SQLite's ASCII-only case folding."""
    # bytes.lower() only touches ASCII letters, which mirrors how SQLite
    # compares identifiers (non-ASCII characters stay case-sensitive).
    return name.encode("utf-8").lower()


def add_column_if_not_exists(cursor, table_name, column_name, column_definition):
    """Add a column to a SQLite table unless the table already has it.

    Args:
        cursor: open sqlite3 cursor on the database holding the table.
        table_name: name of the table to alter. It is interpolated into the
            SQL text as-is, so it must be a trusted identifier.
        column_name: name of the column to add (also interpolated as-is).
        column_definition: type and constraints placed after the column
            name, e.g. 'TEXT DEFAULT NULL'.

    The change is not committed here; the caller commits on its connection.
    """
    # PRAGMA table_info yields one row per column; index 1 is the column name
    cursor.execute(f"PRAGMA table_info({table_name});")
    existing_columns = {_fold_sqlite_identifier(row[1]) for row in cursor.fetchall()}

    # SQLite treats 'Package' and 'package' as the same column, so compare
    # case-insensitively; otherwise ALTER TABLE fails with "duplicate column name"
    if _fold_sqlite_identifier(column_name) not in existing_columns:
        cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column_name} {column_definition};")

In [None]:
def fetch_package_data_from_db(db_path, package_name, table_name):
  """Fetch the first row stored for a package as a column-name -> value dict.

  Args:
    db_path: path of the SQLite database file to open.
    package_name: value matched against the table's `package` column
      (passed as a bound parameter).
    table_name: table to query. It is interpolated into the SQL text as-is,
      so it must be a trusted identifier.

  Returns:
    A dict mapping every column name of the table to the row's value, or
    None when no row has that package name.

  Raises:
    sqlite3.Error: if the query fails (e.g. the table does not exist).
  """
  curr_conn = sqlite3.connect(db_path)
  try:
    curr_cursor = curr_conn.cursor()
    curr_cursor.execute(f"SELECT * FROM {table_name} WHERE package=?", (package_name,))
    curr_row = curr_cursor.fetchone()

    if curr_row is None:
      return None

    # read the column names while the connection is still open
    column_names = [description[0] for description in curr_cursor.description]
    return dict(zip(column_names, curr_row))
  finally:
    # always release the connection, even when the query raises
    curr_conn.close()

# DB Init

In [None]:
# !rm final_database.db

In [None]:
# file names: the combined output database and the nine numbered source
# databases (database_1.db ... database_9.db) that get merged into it
destination_database = 'final_database.db'
source_databases = [f'database_{db_no}.db' for db_no in range(1, 10)]

# open the destination database and keep a cursor for the cells below
dest_conn = sqlite3.connect(destination_database)
dest_cursor = dest_conn.cursor()

In [None]:
# create the destination table if it is not there yet; it starts out with
# only the `package` column
create_table_sql = """
    CREATE TABLE IF NOT EXISTS package_data (
        package TEXT
    )
"""
dest_cursor.execute(create_table_sql)
dest_conn.commit()

In [None]:
# Schema of `package_data` apart from the `package` column.
# Each entry is [column_name, column_definition]; the comments below an entry
# describe that column.
columns = [
    ['npm_api_status', 'TEXT DEFAULT NULL'],
    # Def: response status from npm API
    #      possible values: 200 if everything is mined
    #                       404 if package is missing/not found
    #                       etc...
    # IMP: further mining is continued only if this status is 200

    ['latest_version', 'TEXT DEFAULT NULL'],
    # Def: latest version name of the package
    # Source: npm API

    ['no_of_versions', 'INT DEFAULT NULL'],
    # Def: total no. of versions released for this package
    # Source: npm API, derived

    ['keywords', 'TEXT DEFAULT NULL'],
    # Def: json dump of list of keywords
    # Source: npm API

    ['no_of_users', 'INT DEFAULT NULL'],
    # Def: no. of npm users who have starred this package on npm (not github)
    # Source: npm API, derived

    ['has_readme', 'INT DEFAULT NULL'],
    # Def: whether the package has a readme file provided with it
    # Source: npm API, derived

    ['has_homepage', 'INT DEFAULT NULL'],
    # Def: whether the package has a homepage
    # Source: npm API, derived

    ['last_modified_timestamp', 'TEXT DEFAULT NULL'],
    # Def: timestamp when the package was last modified in npm registry
    # Source: npm API

    ['created_timestamp', 'TEXT DEFAULT NULL'],
    # Def: timestamp when the package was first published in npm registry
    # Source: npm API

    ['unpublished_timestamp', 'TEXT DEFAULT NULL'],
    # Def: timestamp when the package was unpublished from npm registry
    # Source: npm API
    # IMP: further mining is discontinued if a valid value is found here,
    #      as we do not get any repository url from this response

    ['version_history_timestamps', 'TEXT DEFAULT NULL'],
    # Def: json dump of dictionary of version-release_timestamp pairs
    # Source: npm API

    ['is_deprecated', 'INT DEFAULT NULL'],
    # Def: flag to denote if package is deprecated
    # Source: npm API, derived

    ['no_of_dependencies', 'INT DEFAULT 0'],
    # Def: total no. of dependencies and devDependencies
    # Source: npm API, derived

    ['lagging_dependencies_count', 'INT DEFAULT NULL'],
    # Def: count of lagging dependencies
    #      lagging dependency: dependency that is not the latest version of itself
    #                          for each dependency listed in the package's metadata,
    #                          compare the version specified in the package with
    #                          the latest available version.
    # Source: npm API, derived

    ['lagging_dependencies_count_status', 'INT DEFAULT NULL'],
    # Def: response status on calling lagging dependencies count function
    # Source: npm API, derived

    ['unpack_size', 'INT DEFAULT NULL'],
    # Def: total size in bytes of the unpacked files in the tarball
    # Source: npm API

    ['file_count', 'INT DEFAULT NULL'],
    # Def: the number of files in the tarball, folders excluded
    # Source: npm API

    ['has_repository', 'INT DEFAULT NULL'],
    # Def: whether the package has a repository present (can be any type including github, bitbucket, etc.)
    # Source: npm API, derived

    ['repository_type', 'TEXT DEFAULT NULL'],
    # Def: type of repository as mentioned in metadata
    # Source: npm API

    ['repository_url', 'TEXT DEFAULT NULL'],
    # Def: url of repository as mentioned in metadata
    # Source: npm API

    ['git_repository_url', 'TEXT DEFAULT NULL'],
    # Def: github repository url, formatted properly
    # Source: npm API, derived using extract_github_url(repository_url)

    #####################################################################################################################

    ['li_api_status', 'TEXT DEFAULT NULL'],
    # Def: response status from calling libraries.io API
    #      possible values: 200 if API working fine
    #                       404 if API resource not found
    #                       etc...
    # Source: libraries.io API

    ['rank', 'TEXT DEFAULT NULL'],
    # Def: popularity rank
    # Source: libraries.io API

    ['dependants_count', 'TEXT DEFAULT NULL'],
    # Def: no. of packages that are dependent on current package
    #      the value is overwritten if it is obtained from github page scraping later on
    # Source: libraries.io API

    ['dependant_repos_count', 'TEXT DEFAULT NULL'],
    # Def: no. of repositories that are dependent on current package
    #      the value is overwritten if it is obtained from github page scraping later on
    # Source: libraries.io API

    #####################################################################################################################

    ['git_repository_url_final', 'TEXT DEFAULT NULL'],
    # Def: final github repository url after accounting for any redirects in git_repository_url
    # Source: git_repository_url, derived

    ['git_repository_status', 'TEXT DEFAULT NULL'],
    # Def: response status from calling git_repository_url_final
    #      possible values: 200 if repo working fine
    #                       404 if repo not found, possibly made private or removed
    #                       etc...
    # IMP: further github mining is continued only if this status is 200

    #####################################################################################################################

    ['gh_api_status', 'TEXT DEFAULT NULL'],
    # Def: response status from calling github API
    #      possible values: 200 if API working fine
    #                       404 if API resource not found
    #                       etc...
    # IMP: forks, stars, watchers, avg_commit_freq are mined correctly only if this status is 200
    # Source: github API

    ['forks', 'INT DEFAULT NULL'],
    # Def: no. of forks
    # Source: github API

    ['stars', 'INT DEFAULT NULL'],
    # Def: no. of stars
    # Source: github API

    ['watchers', 'INT DEFAULT NULL'],
    # Def: no. of watchers
    # Source: github API

    #####################################################################################################################

    ['gh_issue_api_status', 'TEXT DEFAULT NULL'],
    # Def: response status from calling github issues API
    #      possible values: 200 if API working fine
    #                       404 if API resource not found
    #                       etc...
    # IMP: issues is mined correctly only if this status is 200
    # Source: github API

    ['issues', 'INT DEFAULT NULL'],
    # Def: no. of issues of the repository
    #      NOTE(review): the previous text here was a copy-paste of the
    #      latest_version definition; whether this counts open or all issues
    #      is not visible in this notebook - confirm against the mining code
    # Source: github issues API

    #####################################################################################################################

    ['gh_pr_api_status', 'TEXT DEFAULT NULL'],
    # Def: response status from calling github pr API
    #      possible values: 200 if API working fine
    #                       404 if API resource not found
    #                       etc...
    # IMP: pr is mined correctly only if this status is 200
    # Source: github pr API

    ['pr', 'INT DEFAULT NULL'],
    # Def: no. of pull requests of the repository
    #      NOTE(review): the previous text here was a copy-paste of the
    #      latest_version definition; whether this counts open or all pull
    #      requests is not visible in this notebook - confirm against the mining code
    # Source: github API

    #####################################################################################################################

    ['gh_scrapping_status', 'TEXT DEFAULT NULL'],
    # Def: response status from calling github repository page of current package
    #      possible values: 200 if webpage working fine
    #                       404 if webpage not found
    #                       etc...
    # Source: github repository page scraping

    ['contributors', 'INT DEFAULT NULL'],
    # Def: no. of contributors in current package's repository
    # Source: github repository page scraping

    ['no_of_commits', 'INT DEFAULT NULL'],
    # Def: total no. of commits in current package's repository
    # Source: github repository page scraping

    ['avg_commit_freq', 'REAL DEFAULT NULL'],
    # Def: commit frequency over the lifespan of the repository, in commits per day
    # Source: github repository page scraping and created_at from github API, derived

    #####################################################################################################################

    ['gh_net_scrapping_status', 'TEXT DEFAULT NULL'],
    # Def: response status from calling github repository's network page scraping
    #      possible values: 200 if webpage working fine
    #                       404 if webpage not found
    #                       etc...
    # Source: github repository's network page scraping

    ['dependants_count', 'INT DEFAULT NULL'],
    # Def: no. of packages that are dependent on current package
    # Source: github repository's network page scraping
    # NOTE: duplicate of the 'dependants_count' entry in the libraries.io section.
    #       When the list is processed in order, the column already exists by the
    #       time this entry is reached, so add_column_if_not_exists skips it and
    #       the column keeps the earlier TEXT definition, not INT.

    ['dependant_repos_count', 'INT DEFAULT NULL'],
    # Def: no. of repositories that are dependent on current package
    # Source: github repository's network page scraping
    # NOTE: duplicate of the 'dependant_repos_count' entry in the libraries.io
    #       section; skipped for the same reason, so the column keeps the
    #       earlier TEXT definition, not INT.

    #####################################################################################################################

    ['total_lines_of_code', 'INT DEFAULT NULL'],
    # Def: LOC count
    # Source: pydriller

    ['filtered_lines_of_code', 'INT DEFAULT NULL'],
    # Def: filtered LOC count
    # Source: pydriller

    ['no_of_files', 'INT DEFAULT NULL'],
    # Def: no. of files
    # Source: pydriller

    #####################################################################################################################

    ['snyk_scrapping_status', 'TEXT DEFAULT NULL'],
    # Def: response status from calling snyk package advisor webpage
    #      possible values: 200 if webpage working fine
    #                       404 if webpage not found
    #                       etc...
    # Source: snyk package advisor webpage scraping

    ['health_score', 'INT DEFAULT NULL'],
    # Def: comprehensive evaluation of an npm package's overall health, considering multiple factors such as quality,
    #      security, and maintenance. It aids developers in assessing the package's overall quality and reliability
    # Source: snyk package advisor webpage scraping

    ['security', 'TEXT DEFAULT NULL'],
    # Def: evaluation of package vulnerability, considering the presence and severity of known security issues
    # Source: snyk package advisor webpage scraping

    ['popularity', 'TEXT DEFAULT NULL'],
    # Def: metric that represents the widespread use of a package in the development community, indicating stability and maintenance
    # Source: snyk package advisor webpage scraping

    ['maintenance', 'TEXT DEFAULT NULL'],
    # Def: metric to measure the responsiveness of package maintainers to issues, updates, and community interactions, enhancing stability
    # Source: snyk package advisor webpage scraping

    ['community', 'TEXT DEFAULT NULL'],
    # Def: metric to represent the level of community support and engagement, contributing to the overall health of the package
    # Source: snyk package advisor webpage scraping
]

# make sure every [name, definition] pair listed in `columns` exists as a
# column of package_data, then persist the schema change
for column_name, column_definition in columns:
  add_column_if_not_exists(dest_cursor, 'package_data', column_name, column_definition)

dest_conn.commit()

In [None]:
# load the current contents of package_data into a DataFrame and display it
# (plain string: the query has no placeholders, so no f-string is needed)
df = pd.read_sql_query("SELECT * FROM package_data", dest_conn)
df

Unnamed: 0,package,npm_api_status,latest_version,no_of_versions,keywords,no_of_users,has_readme,has_homepage,last_modified_timestamp,created_timestamp,...,gh_net_scrapping_status,total_lines_of_code,filtered_lines_of_code,no_of_files,snyk_scrapping_status,health_score,security,popularity,maintenance,community
0,@gerrico/react-components,200,0.1.27,27.0,,,1.0,1.0,2022-08-12T10:57:20.535Z,2022-08-07T04:17:08.603Z,...,200,1632.0,1604.0,31.0,200,30,SECURITY REVIEW NEEDED,LIMITED,INACTIVE,LIMITED
1,express-simple-app-generator,200,1.0.5,6.0,"[""generate"", ""express""]",,1.0,0.0,2023-09-26T09:25:54.330Z,2023-09-20T09:08:30.306Z,...,,,,,404,,,,,
2,generator-giraffe,200,1.5.11,143.0,"[""yeoman-generator""]",,1.0,1.0,2022-06-18T06:05:39.609Z,2013-10-22T13:10:45.431Z,...,200,3775.0,1332.0,77.0,200,36,SECURITY REVIEW NEEDED,LIMITED,INACTIVE,LIMITED
3,outdated-client,200,1.2.1,21.0,,,1.0,0.0,2022-05-12T09:34:06.984Z,2019-07-08T02:37:37.234Z,...,,,,,200,28,SECURITY REVIEW NEEDED,LIMITED,INACTIVE,LIMITED
4,@semi-bot/semi-theme-shopify,200,0.2.2,1.0,"[""semi-theme"", ""scss""]",,1.0,0.0,2022-08-10T01:57:06.698Z,2022-08-10T01:57:06.319Z,...,,,,,200,40,NO KNOWN SECURITY ISSUES,LIMITED,INACTIVE,LIMITED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,haribotify,200,1.0.0,4.0,"[""browserify"", ""html"", ""components"", ""browseri...",,1.0,1.0,2022-06-18T19:22:16.094Z,2016-01-27T10:34:18.517Z,...,,,,,200,42,NO KNOWN SECURITY ISSUES,LIMITED,INACTIVE,LIMITED
29996,eslint-config-sharecar,200,2.0.0,5.0,"[""eslint"", ""eslintconfig"", ""config"", ""airbnb"",...",,1.0,1.0,2022-06-17T20:06:55.433Z,2017-04-20T07:37:46.942Z,...,200,1712.0,7.0,56.0,200,42,NO KNOWN SECURITY ISSUES,LIMITED,INACTIVE,LIMITED
29997,webpack-to-ardoq,200,0.2.2,3.0,,,1.0,1.0,2023-10-11T07:55:07.798Z,2020-02-13T13:44:59.576Z,...,,,,,200,40,SECURITY REVIEW NEEDED,SMALL,INACTIVE,LIMITED
29998,zywave-content-search,200,0.0.6,6.0,,,1.0,0.0,2022-05-25T18:01:53.677Z,2022-03-04T19:23:37.896Z,...,,,,,200,28,SECURITY REVIEW NEEDED,LIMITED,INACTIVE,LIMITED


# Merging

In [None]:
# Append every row of `package_data` from each source database to the
# `package_data` table of the destination database.
# NOTE: rows are inserted with a plain INSERT and the table has no uniqueness
#       constraint, so running this cell twice stores every package twice.
for source_db in source_databases:
  # connect to the current source database
  source_conn = sqlite3.connect(source_db)
  try:
    source_cursor = source_conn.cursor()

    # get all rows from the source database
    source_cursor.execute("SELECT * FROM package_data")
    rows = source_cursor.fetchall()

    # take the column list from the source so the insert does not depend on
    # the source and destination tables sharing the same column order
    column_names = [description[0] for description in source_cursor.description]

    # quote the identifiers so unusual column names cannot break the statement
    quoted_columns = ', '.join('"' + name.replace('"', '""') + '"' for name in column_names)

    # generate the parameter placeholders (?, ?, ?, ...)
    placeholders = ', '.join('?' for _ in column_names)

    # insert rows into the destination database and persist them per source
    dest_cursor.executemany(f"INSERT INTO package_data ({quoted_columns}) VALUES ({placeholders})", rows)
    dest_conn.commit()
  finally:
    # the source is only read, so there is nothing to commit; just make sure
    # the connection is released even if the copy fails
    source_conn.close()


dest_conn.close()

print(f"Rows from {len(source_databases)} databases merged into {destination_database}.")

Rows from 9 databases merged into final_database.db.
