# Imports

In [None]:
import os
import numpy as np
import subprocess
import pandas as pd
import requests
from urllib.parse import urljoin
import sqlite3
import json
import time
from datetime import datetime, timedelta
import re
from google.colab import drive
import random
from bs4 import BeautifulSoup
import csv
from urllib.parse import urlparse
import math

# DB Init

In [None]:
# Mount Google Drive inside the Colab runtime (remounting if it is already mounted).
mount_point = '/content/gdrive/'
drive.mount(mount_point, force_remount=True)

# Change this location as per your convenience.
# Relative paths used later in this notebook (final_database.db,
# metrics_for_categorization.txt) resolve against this directory, and the
# database is saved here as well.
# final_packages.txt (containing json dump of list of npm package names to be mined)
# should also be present at this location.
project_dir = "/content/gdrive/Shareddrives/ECS 260/final"
os.chdir(project_dir)

Mounted at /content/gdrive/


In [None]:
# Open the SQLite database file; the relative path resolves against the
# current working directory, and sqlite3 creates the file if it does not exist.
conn = sqlite3.connect('final_database.db')

In [None]:
# Load every row and column of the metric_analysis table into a DataFrame
df = pd.read_sql_query(sql="SELECT * FROM metric_analysis", con=conn)

# Categorizing all packages based on definition logic

In [None]:
# Parse the JSON document straight from the open file handle
with open('metrics_for_categorization.txt', "r") as metrics_file:
    metrics_for_categorization = json.load(metrics_file)

In [None]:
# Criteria tables: metric name -> bucket (or list of buckets) that counts as a
# vote for the state.
_DEAD_CRITERIA = {
    'forks': 'LOW',
    'issues': 'LOW',
    'pr': 'LOW',
    'avg_commit_freq': 'LOW',
    'days_since_last_modification': ['HIGH', 'MID'],
}

_TRIVIAL_CRITERIA = {
    'dependants_count': 'LOW',
    'forks': 'LOW',
    'issues': 'LOW',
    'pr': 'LOW',
    'contributors': 'LOW',
    'unpack_size': 'LOW',
    'total_lines_of_code': 'LOW',
    'sloc': 'LOW',
    'file_count': 'LOW',
}

_ACTIVE_CRITERIA = {
    'forks': ['MID', 'HIGH'],
    'issues': ['MID', 'HIGH'],
    'pr': ['MID', 'HIGH'],
    'contributors': 'HIGH',
    'avg_commit_freq': ['MID', 'HIGH'],
    'days_since_last_modification': 'LOW',
}


def _vote_for_state(state, criteria, metrics, tiebreaker):
    """Run the majority vote for one state.

    Args:
        state: name returned when the vote is won outright.
        criteria: mapping of metric name to the bucket(s) that count as a vote.
        metrics: mapping of metric name to the package's bucket value.
        tiebreaker: callable taking ``metrics``; consulted only on an exact tie.

    Returns:
        ``state`` when more than half of the available (non-missing) criteria
        metrics match, the tiebreaker's result on an exact tie, otherwise None.
        Also None when every criteria metric is missing.
    """
    # NaN never compares equal to anything (including itself), so a plain
    # `!= np.nan` test cannot detect it; pd.isna covers both None and NaN.
    available = sum(1 for metric in criteria if not pd.isna(metrics[metric]))
    if available == 0:
        return None

    matched = matches_criteria(criteria, metrics)
    threshold = available / 2
    if matched > threshold:
        return state
    if matched == threshold:
        return tiebreaker(metrics)
    return None


# Function to calculate state for a package
def calculate_state(row):
    """Classify one package row into zero or more states.

    DEAD, TRIVIAL and ACTIVE are each decided by a majority vote over their own
    criteria table, counting only metrics that are not missing; exact ties are
    delegated to the matching ``tiebreaker_for_*`` function. A package whose
    ``is_deprecated`` value equals the string '1' is marked DEAD without a
    vote and additionally DEPRECATED. ACTIVE is only considered when the
    package was not marked DEAD.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexable by every
            name listed in the module-level ``metrics_for_categorization``.

    Returns:
        A list of state names in the order DEAD, TRIVIAL, ACTIVE, DEPRECATED
        (only those that apply), or None when no state applies.
    """
    metrics = {str(metric): row[metric] for metric in metrics_for_categorization}
    states = []

    # NOTE(review): compared against the string '1'; an integer 1 would not
    # match - confirm the column's storage type.
    deprecated = metrics['is_deprecated'] == '1'

    # Deprecated packages are DEAD by definition; everything else is voted on.
    if deprecated:
        states.append('DEAD')
    else:
        verdict = _vote_for_state('DEAD', _DEAD_CRITERIA, metrics, tiebreaker_for_dead)
        if verdict:
            states.append(verdict)

    # TRIVIAL is independent of the other states.
    verdict = _vote_for_state('TRIVIAL', _TRIVIAL_CRITERIA, metrics, tiebreaker_for_trivial)
    if verdict:
        states.append(verdict)

    # ACTIVE and DEAD are mutually exclusive, with DEAD taking precedence.
    if 'DEAD' not in states:
        verdict = _vote_for_state('ACTIVE', _ACTIVE_CRITERIA, metrics, tiebreaker_for_active)
        if verdict:
            states.append(verdict)

    if deprecated:
        states.append('DEPRECATED')

    return states if states else None

# Function to check if a package matches criteria for a given state
def matches_criteria(criteria, metrics):
    """Count how many criteria the package's metrics satisfy.

    Args:
        criteria: mapping of metric name to either a single expected value or
            a list of acceptable values.
        metrics: mapping of metric name to the package's actual value; must
            contain every key of ``criteria``.

    Returns:
        The number of criteria entries whose metric value equals the expected
        value (or is a member of the expected list).
    """
    def satisfied(metric, expected):
        actual = metrics[metric]
        if isinstance(expected, list):
            return actual in expected
        return actual == expected

    return sum(1 for metric, expected in criteria.items() if satisfied(metric, expected))

# Function for tiebreaker for TRIVIAL state
def tiebreaker_for_trivial(metrics):
    """Break an exact tie in the TRIVIAL vote.

    Metrics are inspected in a fixed priority order; the first one that is not
    None decides the outcome on its own.

    Args:
        metrics: mapping of metric name to bucket value (or None when missing);
            must contain every metric named in the priority order.

    Returns:
        'TRIVIAL' if the first available metric is 'LOW', otherwise None
        (including when every metric is None).
    """
    tiebreaker_order = ['file_count', 'sloc', 'total_lines_of_code', 'dependants_count', 'unpack_size', 'issues', 'pr', 'forks', 'contributors']
    for metric in tiebreaker_order:
        value = metrics[metric]
        if value is None:
            # Missing data cannot break a tie; fall through to the next metric.
            continue
        return 'TRIVIAL' if value == 'LOW' else None
    return None

# Function for tiebreaker for ACTIVE state
def tiebreaker_for_active(metrics):
    """Break an exact tie in the ACTIVE vote.

    Metrics are inspected in a fixed priority order; the first one that is not
    None decides the outcome on its own.

    Args:
        metrics: mapping of metric name to bucket value (or None when missing);
            must contain every metric named in the rule table.

    Returns:
        'ACTIVE' if the first available metric holds one of its accepted
        buckets, otherwise None (including when every metric is None).
    """
    # Priority-ordered rules: metric -> buckets that count as evidence of activity.
    tiebreaker_rules = {
        'days_since_last_modification': ('LOW',),
        'avg_commit_freq': ('MID', 'HIGH'),
        'issues': ('MID', 'HIGH'),
        'pr': ('MID', 'HIGH'),
        'forks': ('MID', 'HIGH'),
        'contributors': ('HIGH',),
    }
    for metric, accepted in tiebreaker_rules.items():
        value = metrics[metric]
        if value is None:
            # Missing data cannot break a tie; fall through to the next metric.
            continue
        return 'ACTIVE' if value in accepted else None
    return None

# Function for tiebreaker for DEAD state
def tiebreaker_for_dead(metrics):
    """Break an exact tie in the DEAD vote.

    Metrics are inspected in a fixed priority order; the first one that is not
    None decides the outcome on its own.

    Args:
        metrics: mapping of metric name to bucket value (or None when missing);
            must contain every metric named in the rule table.

    Returns:
        'DEAD' if the first available metric holds one of its accepted
        buckets, otherwise None (including when every metric is None).
    """
    # Priority-ordered rules: metric -> buckets that count as evidence of death.
    # The third entry is 'pr'; a stray 'z' key here used to raise KeyError
    # whenever both preceding metrics were missing.
    tiebreaker_rules = {
        'days_since_last_modification': ('HIGH', 'MID'),
        'avg_commit_freq': ('LOW',),
        'pr': ('LOW',),
        'issues': ('LOW',),
        'forks': ('LOW',),
    }
    for metric, accepted in tiebreaker_rules.items():
        value = metrics[metric]
        if value is None:
            # Missing data cannot break a tie; fall through to the next metric.
            continue
        return 'DEAD' if value in accepted else None
    return None

# Apply the state calculation function to each row of the DataFrame
# (axis=1 passes one row at a time); each cell of the new 'state' column
# receives whatever calculate_state returns for that row - a list of state
# names, or None when no state applies.
df['state'] = df.apply(calculate_state, axis=1)


# Adding a new column `state` to the metric_analysis table





In [None]:
# Serialise each row's state (a list of names, or None) to JSON text so it can
# be stored in a single SQLite column
df['state'] = df['state'].map(json.dumps)

# Rewrite the whole 'metric_analysis' table from the DataFrame, which now
# carries the extra 'state' column (the existing table is dropped and replaced)
df.to_sql(name='metric_analysis', con=conn, if_exists='replace', index=False)

# Commit changes to the database
conn.commit()

# Distribution of States

In [None]:
# Tallies per state combination. Counts are inclusive: a package in states
# DEAD + DEPRECATED + TRIVIAL is counted under DEAD, DEAD_DEPRECATED,
# DEAD_DEPRECATED_TRIVIAL, and every other key whose states it all holds.
combination_counts = {
    'ACTIVE': 0,
    'DEAD': 0,
    'DEPRECATED': 0,
    'TRIVIAL': 0,
    'ACTIVE_DEAD': 0,
    'ACTIVE_DEPRECATED': 0,
    'ACTIVE_TRIVIAL': 0,
    'DEAD_DEPRECATED': 0,
    'DEAD_TRIVIAL': 0,
    'DEPRECATED_TRIVIAL': 0,
    'ACTIVE_DEAD_DEPRECATED': 0,
    'ACTIVE_DEAD_TRIVIAL': 0,
    'ACTIVE_DEPRECATED_TRIVIAL': 0,
    'DEAD_DEPRECATED_TRIVIAL': 0,
    'ACTIVE_DEAD_DEPRECATED_TRIVIAL': 0,
    'STATELESS': 0
}

# Every key except STATELESS spells out the states it requires, joined by '_'
# (state names themselves contain no underscore).
required_states_by_key = {
    key: set(key.split('_'))
    for key in combination_counts
    if key != 'STATELESS'
}

# Single pass over the column instead of one full DataFrame scan per key.
for raw_state in df['state']:
    # Once the column has been JSON-encoded it holds text such as
    # '["DEAD", "TRIVIAL"]' or 'null'; decode it so that a stateless package
    # is recognised as None rather than as the non-None string 'null'.
    states = json.loads(raw_state) if isinstance(raw_state, str) else raw_state

    if states is None:
        combination_counts['STATELESS'] += 1
        continue

    present = set(states)
    for key, required in required_states_by_key.items():
        if required <= present:
            combination_counts[key] += 1

# Print counts for each combination of states
print("Counts for each combination of states:")
for combination, count in combination_counts.items():
    print(f"{combination}: {count}")

Counts for each combination of states:
ACTIVE: 18675
DEAD: 11105
DEPRECATED: 1025
TRIVIAL: 19881
ACTIVE_DEAD: 0
ACTIVE_DEPRECATED: 0
ACTIVE_TRIVIAL: 9522
DEAD_DEPRECATED: 1025
DEAD_TRIVIAL: 10350
DEPRECATED_TRIVIAL: 670
ACTIVE_DEAD_DEPRECATED: 0
ACTIVE_DEAD_TRIVIAL: 0
ACTIVE_DEPRECATED_TRIVIAL: 0
DEAD_DEPRECATED_TRIVIAL: 670
ACTIVE_DEAD_DEPRECATED_TRIVIAL: 0
STATELESS: 211
