# Imports

In [None]:
import os
import numpy as np
import subprocess
import pandas as pd
import requests
from urllib.parse import urljoin
import sqlite3
import json
import time
from datetime import datetime, timedelta
import re
from google.colab import drive
import random
from bs4 import BeautifulSoup
import csv
from urllib.parse import urlparse

In [None]:
drive.mount('/content/gdrive/', force_remount=True)
# Change this location to suit your own setup.
# final_packages.txt (a JSON dump of the list of npm package names to be mined) should be present at this location.
# The database will be saved at this location.
# NOTE(review): later cells open metrics_for_categorization.txt and final_database.db
# through relative paths, so both resolve against the directory chosen here.
os.chdir("/content/gdrive/Shareddrives/ECS 260/final")

Mounted at /content/gdrive/


# Function Definitions

In [None]:
def add_column_if_not_exists(cursor, table_name, column_name, column_definition):
  """
  Add a column to a SQLite table unless a column of that name already exists.

  SQLite matches column names case-insensitively for ASCII letters, so the
  existence check folds ASCII case on both sides. Without that, a name that
  differs from an existing column only by case would reach ALTER TABLE and
  fail with "duplicate column name".

  Parameters:
    cursor (sqlite3.Cursor): Cursor on the database that holds the table.
    table_name (str): Table to alter. Interpolated into the SQL text as-is, so it must be a trusted identifier.
    column_name (str): Column to add. Interpolated as-is; must be a trusted identifier.
    column_definition (str): Text placed after the column name, e.g. 'TEXT DEFAULT NULL'.

  Returns:
    None. This function does not commit.
  """
  # Fold only A-Z, mirroring SQLite's ASCII-only case-insensitivity for identifiers
  ascii_fold = str.maketrans("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")

  # check if the column already exists (row[1] of PRAGMA table_info is the column name)
  cursor.execute(f"PRAGMA table_info({table_name});")
  existing_columns = {column[1].translate(ascii_fold) for column in cursor.fetchall()}

  if column_name.translate(ascii_fold) not in existing_columns:
    # add the column if it does not exist
    cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column_name} {column_definition};")

In [None]:
def get_column_names(cursor, table_name):
  """Return the column names of `table_name` in the order PRAGMA table_info lists them."""
  cursor.execute(f"PRAGMA table_info({table_name});")
  names = []
  for info_row in cursor.fetchall():
    # index 1 of each table_info row holds the column name
    names.append(info_row[1])
  return names

In [None]:
def categorize_metric(metric_name, conn, categorization_logic='soft'):
  """
  Load one metric from `package_metrics` and attach a `<metric_name>_valuetype` label column.

  Hard Categorization ('hard'):
    The label column is a verbatim copy of the metric values.

  Soft Categorization (any other value, default 'soft'):
    The metric is coerced to numeric and the observed [min, max] range is cut into
    three equal-width bands labelled 'LOW', 'MID' and 'HIGH'. Both band boundaries
    belong to 'MID'. Values that cannot be parsed as numbers get no label (None).

  Parameters:
    metric_name (str): Name of the metric column to be categorized. Interpolated into the SQL text as-is.
    conn (sqlite3.Connection): SQLite database connection object.
    categorization_logic: <'soft', 'hard'>

  Returns:
    DataFrame: columns `package`, `<metric_name>` and `<metric_name>_valuetype`,
    one row per package whose metric is not NULL.
  """
  label_column = f'{metric_name}_valuetype'
  frame = pd.read_sql_query(
    f"SELECT package, {metric_name} FROM package_metrics WHERE {metric_name} IS NOT NULL",
    conn,
  )

  if categorization_logic == 'hard':
    frame[label_column] = frame[metric_name]
    return frame

  # Non-numeric entries become NaN so they can be skipped when labelling
  frame[metric_name] = pd.to_numeric(frame[metric_name], errors='coerce')
  values = frame[metric_name]

  # Equal-width thirds of the observed range
  lowest = values.min()
  highest = values.max()
  span = highest - lowest
  lower_cut = lowest + (span / 3)
  upper_cut = lowest + (2 * span / 3)

  def to_label(value):
    """Map a single numeric value to its band label."""
    if pd.isna(value):
      return None
    if value > upper_cut:
      return 'HIGH'
    if lower_cut <= value <= upper_cut:
      return 'MID'
    return 'LOW'

  frame[label_column] = values.apply(to_label)
  return frame

In [None]:
def update_metric_categorization(df, table_name, column_name, identifier_column, conn, cursor):
  """
  Write the labels in `df` back to `table_name`, one UPDATE per DataFrame row.

  Parameters:
    df (DataFrame): Must contain `<column_name>_valuetype` (the labels) and `identifier_column`.
    table_name (str): Table to update. Interpolated into the SQL text as-is.
    column_name (str): Column that receives the labels. Interpolated as-is.
    identifier_column (str): Column used to match rows, both in `df` and in the table. Interpolated as-is.
    conn (sqlite3.Connection): Connection to commit on once all rows are written.
    cursor (sqlite3.Cursor): Cursor used to run the batched UPDATE.

  Returns:
    None. Changes are committed before returning.
  """
  # Series.tolist() yields plain Python scalars. Building the pairs column-wise
  # avoids iterrows(), which creates a Series per row and, on frames with a single
  # numeric dtype, hands NumPy scalars to sqlite3 instead of int/float.
  labels = df[f'{column_name}_valuetype'].tolist()
  identifiers = df[identifier_column].tolist()
  values_to_update = list(zip(labels, identifiers))

  update_query = f"UPDATE {table_name} SET {column_name} = ? WHERE {identifier_column} = ?"
  cursor.executemany(update_query, values_to_update)
  conn.commit()

# DB Init

In [None]:
# init database and cursor
# The path is relative, so the file is looked up in the current working directory.
conn = sqlite3.connect("final_database.db")
cursor = conn.cursor()

In [None]:
# Create the metric_analysis table if it is missing; at creation it holds only
# the `package` column.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS metric_analysis (
        package TEXT
    )
""")

conn.commit()

In [None]:
# List the columns metric_analysis currently has, one per line
current_columns = get_column_names(cursor, 'metric_analysis')
print("Current Columns present: \n-->", '\n--> '.join(current_columns))

Current Columns present: 
--> package
--> is_deprecated
--> dependants_count
--> forks
--> issues
--> pr
--> contributors
--> unpack_size
--> total_lines_of_code
--> sloc
--> days_since_last_modification
--> avg_commit_freq
--> threat_score
--> file_count
--> state


In [None]:
# getting final metrics to be used for categorization (the file holds a JSON list of metric names)
with open('metrics_for_categorization.txt', "r") as file:
  metrics_for_categorization = json.load(file)

n_cols = len(metrics_for_categorization)
print(f"{n_cols} metrics to be used for categorization:\n{metrics_for_categorization}")

# columns other than `package`, as [name, definition] pairs
columns = [[metric, 'TEXT DEFAULT NULL'] for metric in metrics_for_categorization]

# adding columns to the table
for column_name, column_definition in columns:
  add_column_if_not_exists(cursor, 'metric_analysis', column_name, column_definition)

conn.commit()

12 metrics to be used for categorization:
['is_deprecated', 'dependants_count', 'forks', 'issues', 'pr', 'contributors', 'unpack_size', 'avg_commit_freq', 'total_lines_of_code', 'sloc', 'days_since_last_modification', 'file_count']


In [None]:
# Copy into metric_analysis every package from package_data that has no row
# there yet: the LEFT JOIN leaves m.package NULL for packages without a match,
# and only those rows are inserted.
sql_query = """
    INSERT INTO metric_analysis (package)
    SELECT p.package
    FROM package_data p
    LEFT JOIN metric_analysis m ON p.package = m.package
    WHERE m.package IS NULL
"""
cursor.execute(sql_query)
conn.commit()

In [None]:
# Load the whole metric_analysis table for inspection; the bare `df` on the last
# line makes the notebook render it.
df = pd.read_sql_query("SELECT * FROM metric_analysis", conn)
df

Unnamed: 0,package,is_deprecated,dependants_count,forks,issues,pr,contributors,unpack_size,total_lines_of_code,sloc,days_since_last_modification,avg_commit_freq,threat_score,file_count,state
0,@gerrico/react-components,0,LOW,LOW,LOW,LOW,LOW,MID,MID,MID,LOW,MID,,MID,
1,express-simple-app-generator,0,LOW,,,,,HIGH,,,LOW,,,,
2,generator-giraffe,0,LOW,LOW,LOW,LOW,LOW,,MID,MID,LOW,MID,,,
3,outdated-client,0,LOW,,,,,MID,,,LOW,,,MID,
4,@semi-bot/semi-theme-shopify,0,,,,,,MID,,,LOW,,,MID,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,haribotify,0,LOW,,,,,,,,LOW,,,,
29996,eslint-config-sharecar,0,LOW,LOW,LOW,LOW,LOW,MID,MID,LOW,LOW,LOW,,LOW,
29997,webpack-to-ardoq,0,LOW,,,,,MID,,,LOW,,,MID,
29998,zywave-content-search,0,LOW,,,,,MID,,,LOW,,,MID,


# Labelling

In [None]:
table_name = 'metric_analysis'
identifier_column = 'package'
separator = '----------------------------------------------------------------------------------------------------------------------------'

for metric in metrics_for_categorization:
  print(separator)
  print(f"Now updating {metric} ...")

  # `is_deprecated` is copied as-is; every other metric is bucketed into LOW/MID/HIGH
  logic = 'hard' if metric == 'is_deprecated' else 'soft'
  labelled_df = categorize_metric(metric, conn, categorization_logic=logic)
  update_metric_categorization(labelled_df, table_name, metric, identifier_column, conn, cursor)

  print(f"Updated {metric}")
  print(separator)

----------------------------------------------------------------------------------------------------------------------------
Now updating sloc ...
Updated sloc
----------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------
Now updating total_lines_of_code ...
Updated total_lines_of_code
----------------------------------------------------------------------------------------------------------------------------


In [None]:
# Sanity check: show which distinct labels ended up stored for each metric
for metric in metrics_for_categorization:
  cursor.execute(f"SELECT DISTINCT {metric} FROM {table_name}")
  distinct_values = []
  for row in cursor.fetchall():
    distinct_values.append(row[0])
  print(f"Metric {metric} --> {distinct_values}")

Metric is_deprecated --> ['0', '1', None]
Metric dependants_count --> ['LOW', None, 'HIGH', 'MID']
Metric forks --> ['LOW', None, 'HIGH', 'MID']
Metric issues --> ['LOW', None, 'MID', 'HIGH']
Metric pr --> ['LOW', None, 'MID', 'HIGH']
Metric contributors --> ['LOW', None, 'HIGH', 'MID']
Metric unpack_size --> ['MID', 'HIGH', None, 'LOW']
Metric avg_commit_freq --> ['MID', None, 'LOW', 'HIGH']
Metric total_lines_of_code --> ['LOW', None]
Metric sloc --> ['LOW', None]
Metric days_since_last_modification --> ['LOW', None, 'HIGH', 'MID']
Metric file_count --> ['MID', None, 'LOW', 'HIGH']


In [None]:
# Close the database connection opened in the DB Init section.
conn.close()