## Clone Repos

In [1]:
import os
import re
import subprocess
from tqdm import tqdm
import xml.etree.ElementTree as ET
from typing import List, Dict, Any
from collections import defaultdict
from git import Repo, BadName
from pathlib import Path
import json

In [2]:
# GitHub "owner/name" slug of the repository whose dependency history is mined.
repo_name = "Checkmk/checkmk"

# Clone source and local checkout location (relative to the working directory).
repo_url = "https://github.com/" + repo_name
destination_path = "./" + repo_name
repo_path = repo_name

# Commit SHA -> "YYYY-MM-DD HH:MM:SS" commit time; rebound as a global by
# get_all_relevant_commits().
commits_with_date = {}

In [3]:
if not os.path.exists(destination_path) or not os.listdir(destination_path):
	Repo.clone_from(repo_url, destination_path)
else:
	print(f"Directory '{destination_path}' already exists and is not empty. Skipping clone.")

In [4]:
repo = Repo(destination_path)

In [5]:
def find_all_files(repo_path):
    """Return the paths of all known dependency manifests below ``repo_path``.

    A file qualifies when its base name is exactly one of ``pom.xml``,
    ``requirements.txt``, ``package.json``, ``setup.py`` or ``pyproject.toml``.
    Paths are returned in ``os.walk`` order.
    """
    manifest_names = (
        "pom.xml",
        "requirements.txt",
        "package.json",
        "setup.py",
        "pyproject.toml",
    )
    return [
        os.path.join(current_dir, entry)
        for current_dir, _subdirs, entries in os.walk(repo_path)
        for entry in entries
        if entry in manifest_names
    ]


In [6]:
import fnmatch


def find_all_gradle_files(repo_path):
    """Return the paths of all Gradle build/settings/property files below ``repo_path``.

    A file is reported once if its base name matches any of the glob patterns
    below; paths are returned in ``os.walk`` order.
    """
    globs = (
        "build.gradle",
        "settings.gradle",
        "gradle.properties",
        "build.gradle.kts",
        "settings.gradle.kts",
        "*.gradle",
        "*.gradle.kts",
    )

    matches = []
    for current_dir, _subdirs, entries in os.walk(repo_path):
        matches.extend(
            os.path.join(current_dir, entry)
            for entry in entries
            if any(fnmatch.fnmatch(entry, glob) for glob in globs)
        )
    return matches

In [7]:
def find_commits_with_changes(repo_path, file_paths):
    """Map each file (relative to ``repo_path``) to the SHAs of commits touching it.

    The SHAs are listed in the order ``Repo.iter_commits(paths=...)`` yields them.
    """
    repository = Repo(repo_path)
    history_by_file = {}

    for absolute_path in file_paths:
        relative_path = os.path.relpath(absolute_path, repo_path)
        history_by_file[relative_path] = [
            commit.hexsha for commit in repository.iter_commits(paths=relative_path)
        ]

    return history_by_file

In [8]:
# Example usage
repo_path = "./" + repo_name  # Replace with your cloned repo path
# Collect every supported manifest: pom.xml / requirements.txt / package.json /
# setup.py / pyproject.toml first, then the Gradle files appended to the same list.
file_paths = find_all_files(repo_path)
file_paths_gradle = find_all_gradle_files(repo_path)
file_paths += file_paths_gradle

# Display all found manifest paths with forward slashes. Only the printed text
# is normalised; the entries stored in file_paths are left untouched.
for path in file_paths:
    path = path.replace('\\', '/')
    print(path)

print(len(file_paths))

./Checkmk/checkmk/pyproject.toml
./Checkmk/checkmk/requirements.txt
./Checkmk/checkmk/agents/modules/windows/tests/pyproject.toml
./Checkmk/checkmk/cmk/gui/quick_setup/handlers/setup.py
./Checkmk/checkmk/omd/requirements.txt
./Checkmk/checkmk/packages/cmk-agent-based/pyproject.toml
./Checkmk/checkmk/packages/cmk-agent-receiver/pyproject.toml
./Checkmk/checkmk/packages/cmk-ccc/pyproject.toml
./Checkmk/checkmk/packages/cmk-crypto/pyproject.toml
./Checkmk/checkmk/packages/cmk-events/pyproject.toml
./Checkmk/checkmk/packages/cmk-frontend/package.json
./Checkmk/checkmk/packages/cmk-frontend-vue/package.json
./Checkmk/checkmk/packages/cmk-graphing/pyproject.toml
./Checkmk/checkmk/packages/cmk-livestatus-client/pyproject.toml
./Checkmk/checkmk/packages/cmk-messaging/pyproject.toml
./Checkmk/checkmk/packages/cmk-mkp-tool/pyproject.toml
./Checkmk/checkmk/packages/cmk-rulesets/pyproject.toml
./Checkmk/checkmk/packages/cmk-server-side-calls/pyproject.toml
./Checkmk/checkmk/packages/cmk-shared-typ

In [9]:
def get_all_relevant_commits(repo_path: str, file_paths: List[str]) -> Dict[str, List[str]]:
    """Map every commit that touched one of ``file_paths`` to the files it changed.

    For each path, ``git log --follow --name-only`` is run inside ``repo_path``
    and its output (a 40-character SHA line followed by file names) is folded
    into a single mapping.

    Side effect: rebinds the module-level ``commits_with_date`` to a mapping of
    commit SHA -> commit time formatted as ``"%Y-%m-%d %H:%M:%S"``.

    Args:
        repo_path: Path of the local git checkout.
        file_paths: Manifest paths located below ``repo_path``.

    Returns:
        Dict of commit SHA -> sorted list of changed file names. Keys are
        inserted oldest commit first (ordered by ``committed_date``).

    Raises:
        subprocess.CalledProcessError: If ``git log`` exits with a non-zero status.
    """
    global commits_with_date

    repo = Repo(repo_path)
    files_by_commit = {}

    for file_path in file_paths:
        rel_path = os.path.relpath(file_path, repo_path)

        # One "<sha>" line per commit, followed by the file names it touched.
        result = subprocess.run(
            ["git", "-C", repo_path, "log", "--follow", "--pretty=format:%H", "--name-only", "--", rel_path],
            capture_output=True,
            text=True,
            check=True
        )

        current_commit = None
        for line in result.stdout.strip().split('\n'):
            entry = line.strip()
            if not entry:
                continue

            # A 40-character lowercase hex line starts a new commit section.
            if len(entry) == 40 and all(c in '0123456789abcdef' for c in entry):
                current_commit = entry
                files_by_commit.setdefault(current_commit, set())  # set: de-duplicate across paths
            elif current_commit:
                files_by_commit[current_commit].add(entry)

    # Resolve each hash once; the commit object serves both the sort key and the date.
    resolved = []
    for commit_hash in files_by_commit:
        try:
            resolved.append((repo.commit(commit_hash), commit_hash))
        except BadName:
            print(f"Skipping invalid commit hash: {commit_hash}")  # Warn if a bad commit hash is found

    resolved.sort(key=lambda pair: pair[0].committed_date)

    commits_with_date = {
        commit_hash: commit_obj.committed_datetime.strftime("%Y-%m-%d %H:%M:%S")
        for commit_obj, commit_hash in resolved
    }

    # sorted() makes the per-commit file list deterministic (set order is arbitrary).
    return {
        commit_hash: sorted(files_by_commit[commit_hash])
        for _, commit_hash in resolved
    }

In [10]:
# Example usage
# Build the commit -> changed-manifest mapping for all discovered files.
# The call also rebinds the global commits_with_date.
sorted_commit_hashes = get_all_relevant_commits(repo_path, file_paths)

# Display results
print(f"Total number of commits: {len(sorted_commit_hashes)}")
print("Commits (earliest to latest):")
for commit_hash, files in list(sorted_commit_hashes.items())[:10]:  # Displaying the first 10 for brevity
    print(f"{commit_hash}: {files}")

Total number of commits: 601
Commits (earliest to latest):
81726bfbdcf08fdf2fde2517545bfbd2ee36095e: ['package.json']
2f1798417e3ee1e1d589242cd61cf52ab09e8864: ['package.json']
990bd85041ce07c7aa4f6dcede434c2ce0ea1d80: ['package.json']
af2297ea7460cb8e7524db1568920e5a7078d671: ['package.json']
a9fe31ca0fe39a757742f8a20c23e6b63118f8b9: ['package.json']
90704ae43d88910336c0e6e7e28cc2193f0c205e: ['package.json']
980368866ce18f56cac24b6bd0deb8226ea1d36d: ['package.json']
1c67595631f14ebd0f488aaf8a437d8768f208a6: ['package.json']
cf01ef03d72a2e345f19ade4335b286a1ed253c1: ['package.json']
e279d5960e21773b5f2d8aeb7f5f633bd843baf8: ['package.json']


## Java pom.xml parsing

In [11]:
# Initialize file cache: repository-relative path -> last loaded file content.
# Written by load_file_at_commit(), read by resolve_from_parent().
file_cache = {}
# Property name -> value, accumulated over every POM parsed so far.
# Written by parse_pom_xml(), read by resolve_cached().
properties_cache = {}
# Prefix map used for the "mvn:" ElementTree find()/findall() queries.
NAMESPACE = {'mvn': 'http://maven.apache.org/POM/4.0.0'}
# Seed properties handed to parse_pom_xml() by process_commit(); nothing in
# this cell adds entries to it (parse_pom_xml works on a copy).
properties = {}

def parse_pom_xml(content: str, file_path: str, properties: Dict[str, str] = None) -> Dict[str, str]:
    """Extract ``groupId:artifactId -> version`` pairs from a Maven POM.

    Versions written as ``${property}`` are resolved, in order, from the POM's
    own properties (plus ``project.version`` / ``parent.version``), from the
    parent POM via ``resolve_from_parent``, and from the global
    ``properties_cache``. Unresolvable ones are reported as ``"UNRESOLVED"``.
    Dependencies without a ``<version>`` element are omitted.

    Side effect: every property found in this POM is also stored in the global
    ``properties_cache``.

    Args:
        content: XML text of the POM.
        file_path: Repository-relative path of the POM (used to locate the parent).
        properties: Optional seed properties; the mapping is copied, not mutated.

    Returns:
        Dict of dependency key -> version string. Empty when the XML is malformed.
    """
    properties = {} if properties is None else properties.copy()  # avoid side effects
    dependencies = {}

    try:
        root = ET.fromstring(content)
    except ET.ParseError as e:
        print(f"XML parsing error: {e}")
        return dependencies

    # ElementTree folds xmlns declarations into the tag ("{uri}project") and
    # never lists them in root.attrib, so the namespace is read from the tag.
    # An empty prefix covers POMs that declare no namespace at all.
    ns = root.tag[: root.tag.index("}") + 1] if root.tag.startswith("{") else ""

    def text_of(element):
        """Return the stripped text of ``element`` or '' when missing/empty."""
        if element is None or element.text is None:
            return ""
        return element.text.strip()

    # Load project properties
    for prop in root.findall(f".//{ns}properties/*"):
        if prop.tag and prop.text:
            prop_name = prop.tag.split('}')[-1]
            properties[prop_name] = prop.text.strip()
            properties_cache[prop_name] = properties[prop_name]

    # project.version is the project's own <version> (a direct child of the
    # root); Maven inherits the parent's version when it is absent.
    project_version = text_of(root.find(f"{ns}version"))
    parent_version = text_of(root.find(f".//{ns}parent/{ns}version"))
    if project_version or parent_version:
        properties["project.version"] = project_version or parent_version
    if parent_version:
        properties["parent.version"] = parent_version

    parent_info = {}
    relative_path = text_of(root.find(f".//{ns}parent/{ns}relativePath"))
    if relative_path:
        parent_info = {"parent_pom_path": relative_path}

    # Read dependencies
    for dependency in root.findall(f".//{ns}dependency"):
        group_id = text_of(dependency.find(f"{ns}groupId"))
        artifact_id = text_of(dependency.find(f"{ns}artifactId"))
        if not group_id or not artifact_id:
            continue  # incomplete coordinates cannot form a key

        dep_key = f"{group_id}:{artifact_id}"
        if "${" in dep_key and "}" in dep_key:
            dep_key = resolve_cached(dep_key)

        version_text = text_of(dependency.find(f"{ns}version"))
        if not version_text:
            continue  # version managed elsewhere; nothing to record

        if version_text.startswith("${") and version_text.endswith("}"):
            prop_name = version_text[2:-1]
            resolved_version = properties.get(prop_name, "UNRESOLVED")
            if resolved_version == "UNRESOLVED":
                resolved_version = resolve_from_parent(prop_name, file_path, parent_info)
                if resolved_version == "UNRESOLVED":
                    resolved_version = properties_cache.get(prop_name, "UNRESOLVED")
            # A property may itself point at another property.
            if resolved_version.startswith("${") and resolved_version.endswith("}"):
                resolved_version = resolve_cached(resolved_version)
            dependencies[dep_key] = resolved_version
        else:
            dependencies[dep_key] = version_text

    return dependencies




def resolve_cached(prop_name, visited=None):
    """Resolve ``${name}`` placeholders in ``prop_name`` via ``properties_cache``.

    Without any placeholder the string itself is looked up in the cache (the
    result is ``None`` when it is unknown). With placeholders, each one is
    replaced by its cached value; as soon as one is not cached, the bare name
    of that placeholder is returned instead of the partially resolved text.
    ``visited`` is accepted but not used.
    """
    placeholder_names = re.findall(r"\$\{([^}]+)\}", prop_name)
    if placeholder_names:
        text = prop_name
        for name in placeholder_names:
            # A captured name cannot contain "}", so it never holds a complete
            # placeholder itself: resolving it is a plain cache lookup.
            replacement = properties_cache.get(name)
            if replacement is None:
                return name
            text = text.replace("${" + name + "}", replacement)
        return text

    return properties_cache.get(prop_name)  # nothing to resolve, return as is



def resolve_from_parent(prop_name: str, file_path: str, properties: Dict[str, str]) -> str:
    """Look up ``prop_name`` in the parent POM held in ``file_cache``.

    ``properties`` may carry ``"parent_pom_path"`` (the ``<relativePath>`` of
    the child POM). Without it the root ``pom.xml`` is used; with it the path
    is joined onto the directory of ``file_path`` and ``.``/``..`` segments
    are collapsed. Returns ``"UNRESOLVED"`` when that file is not cached or
    does not define the property.
    """
    relative_parent = properties.get("parent_pom_path")

    if relative_parent is None:
        final_path = "pom.xml"
    else:
        child = file_path if isinstance(file_path, Path) else Path(file_path)
        relative = relative_parent if isinstance(relative_parent, Path) else Path(relative_parent)

        # Collapse "." and ".." by hand; leading ".." segments are kept.
        segments = []
        for part in (child.parent / relative).parts:
            if part == ".":
                continue
            if part == ".." and segments and segments[-1] != "..":
                segments.pop()
            else:
                segments.append(part)
        final_path = Path(*segments)

    cache_key = str(final_path).replace("\\", "/")
    if cache_key not in file_cache:
        return "UNRESOLVED"
    return get_all_properties(file_cache[cache_key], prop_name, final_path, {})

def get_all_properties(content: str, target_prop: str, file_path: str, properties: Dict[str, str] = None):
    """Return the value of ``target_prop`` from the ``<properties>`` of a POM.

    Args:
        content: XML text of the POM.
        target_prop: Property name to look up (without ``${}``).
        file_path: Path of the POM; accepted for symmetry with the callers, not read.
        properties: Optional seed properties; copied, and overridden by the POM.

    Returns:
        The stripped property value, or ``"UNRESOLVED"`` when the property is
        not defined or ``content`` cannot be parsed.
    """
    properties = {} if properties is None else properties.copy()  # avoid side effects

    try:
        root = ET.fromstring(content)
    except (ET.ParseError, TypeError, ValueError):
        # Malformed or non-text content: best effort, report as unresolved.
        return "UNRESOLVED"

    # ElementTree never lists xmlns declarations in root.attrib; the namespace
    # is part of the tag ("{uri}project"). Empty prefix = no namespace declared.
    ns = root.tag[: root.tag.index("}") + 1] if root.tag.startswith("{") else ""

    for prop in root.findall(f".//{ns}properties/*"):
        if prop.tag and prop.text:
            key = prop.tag.split('}')[-1]
            properties[key] = prop.text.strip()

    return properties.get(target_prop, "UNRESOLVED")

def load_file_at_commit(repo, commit_hash, file_path):
    """Read ``file_path`` as UTF-8 text from the tree of ``commit_hash``.

    On success the text is also stored in ``file_cache[file_path]``. Any
    failure is printed and reported as ``None``.
    """
    try:
        tree = repo.commit(commit_hash).tree
        raw_bytes = (tree / file_path).data_stream.read()
        text = raw_bytes.decode('utf-8')
        file_cache[file_path] = text
    except Exception as e:
        print(f"[DEBUG]Error loading file '{file_path}' at commit '{commit_hash}': {e}")
        return None
    return text


def process_commit(repo, commit_hash, changed_files, dependencies_snapshot):
    """Re-parse every changed ``pom.xml`` of one commit into the snapshot.

    ``dependencies_snapshot`` (path -> dependency dict) is updated in place
    and returned; files that cannot be loaded keep their previous entry.
    """
    pom_paths = (path for path in changed_files if path.endswith("pom.xml"))
    for pom_path in pom_paths:
        content = load_file_at_commit(repo, commit_hash, pom_path)
        if not content:
            continue
        dependencies_snapshot[pom_path] = parse_pom_xml(content, pom_path, properties)
    return dependencies_snapshot


def process_commits(repo, commits_with_changes):
    """Replay the commits in order and record the POM snapshot after each one.

    Returns a dict of commit SHA -> shallow copy of the path -> dependencies
    snapshot as it stands after that commit.
    """
    history = {}
    snapshot = {}

    progress = tqdm(commits_with_changes.items(), desc="Processing commits")
    for commit_hash, changed_files in progress:
        snapshot = process_commit(repo, commit_hash, changed_files, snapshot)
        history[commit_hash] = dict(snapshot)

    return history

# Build the per-commit pom.xml snapshots for the whole history.
dependencies_over_time = process_commits(repo, sorted_commit_hashes)

import json

# Persist the snapshots; the merge cell reads this file back as its "java" input.
with open("java_pom.json", "w") as f:
    json.dump(dependencies_over_time, f, indent=4)

Processing commits: 100%|██████████| 601/601 [00:00<00:00, 599899.26it/s]


## Python and JavaScript parsing

In [12]:
import tomli
import ast


# Initialize cache and data structures
file_cache = {}
# NOTE(review): unique_entries is not referenced by any function in this cell.
unique_entries = set()
# Commit SHA -> {"filename", "dependencies", "ecosystem"}; written by parse_file_content().
dependency_changes = defaultdict(list)

# Supported version specifiers in requirements.txt
VERSION_SPECIFIERS = [
    '==', '>=', '<=', '~=', '!=', '>', '<'
]


def load_file_at_commit(repo_path, commit_hash, file_path):
    """Return the text of ``file_path`` at ``commit_hash`` via ``git show``.

    When git exits with an error (for example because the path does not exist
    in that commit), its stderr is printed and ``None`` is returned.
    """
    command = ["git", "-C", repo_path, "show", f"{commit_hash}:{file_path}"]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error loading file '{file_path}' at commit '{commit_hash}': {e.stderr}")
        return None
    return completed.stdout


def process_commit_file_content(repo_path, sha, file_path, type):
    """Load one manifest at commit ``sha`` and parse it if it has content.

    ``type`` is the ecosystem tag (``"py"`` or ``"js"``) passed on to
    ``parse_file_content``. Missing or empty files are skipped silently.
    """
    content = load_file_at_commit(repo_path, sha, file_path)
    if not content:
        return
    parse_file_content(content, sha, file_path, type)

def parse_pyproject_toml(content: str) -> dict:
    """Extract ``name -> version spec`` pairs from a ``pyproject.toml``.

    PEP 621 ``[project].dependencies`` are used when present. Otherwise the
    Poetry table ``[tool.poetry.dependencies]`` is read (``python`` itself is
    skipped). Entries without a version are mapped to
    ``"latest-version-available"``.

    Args:
        content: TOML text.

    Returns:
        Dict of package name -> version specifier; empty when the TOML is invalid.
    """
    try:
        import tomllib as toml_parser  # standard library from Python 3.11
    except ImportError:
        import tomli as toml_parser  # backport with the same API

    try:
        data = toml_parser.loads(content)
    except toml_parser.TOMLDecodeError:
        return {}

    result = {}

    # PEP 621 style: a list of PEP 508 requirement strings.
    pep621_deps = data.get("project", {}).get("dependencies", [])
    if pep621_deps:
        for dep in pep621_deps:
            if not isinstance(dep, str):
                continue
            # Drop the environment marker so it is not mistaken for the
            # name or the version ("pkg; python_version >= '3.8'").
            requirement = dep.split(";", 1)[0]
            parts = re.split(r"([<>=!~]+)", requirement, maxsplit=1)
            name = parts[0].strip()
            version = ''.join(parts[1:]).strip() if len(parts) > 1 else "latest-version-available"
            result[name] = version
        return result

    # Poetry style: a table of name -> spec. The spec is either a string
    # ("^1.2", "1.2.3", ">=1.0") or a table with an optional "version" key.
    # Keeping name and spec apart avoids gluing "^1.2" onto the name.
    poetry_deps = data.get("tool", {}).get("poetry", {}).get("dependencies", {})
    for pkg, spec in poetry_deps.items():
        if pkg.lower() == "python":
            continue
        if isinstance(spec, dict):
            spec = spec.get("version")
        if isinstance(spec, str) and spec.strip():
            result[pkg] = spec.strip()
        else:
            result[pkg] = "latest-version-available"

    return result


# === ADDED ===
def parse_setup_py(content: str) -> dict:
    """Extract ``install_requires`` of a ``setup.py`` without executing it.

    The source is parsed with ``ast``; every call to ``setup(...)`` or
    ``<module>.setup(...)`` whose ``install_requires`` keyword is a literal
    list/tuple contributes its string elements. Requirements built
    dynamically (variables, file reads) are not visible to this parser.

    Args:
        content: Python source text of the ``setup.py``.

    Returns:
        Dict of package name -> version specifier
        (``"latest-version-available"`` when none is given); empty when the
        source does not parse.
    """
    try:
        tree = ast.parse(content)
    except SyntaxError:
        return {}

    requirements = []
    for node in ast.walk(tree):
        if not isinstance(node, ast.Call):
            continue

        # Accept both `setup(...)` and `setuptools.setup(...)`.
        func = node.func
        if isinstance(func, ast.Name):
            callee = func.id
        elif isinstance(func, ast.Attribute):
            callee = func.attr
        else:
            callee = None
        if callee != "setup":
            continue

        for keyword in node.keywords:
            if keyword.arg != "install_requires":
                continue
            if not isinstance(keyword.value, (ast.List, ast.Tuple)):
                continue
            for elt in keyword.value.elts:
                # ast.Str was removed in Python 3.12; string literals are
                # ast.Constant nodes holding a str value.
                if isinstance(elt, ast.Constant) and isinstance(elt.value, str):
                    requirements.append(elt.value)

    result = {}
    for dep in requirements:
        # Drop the environment marker before splitting name and specifier.
        requirement = dep.split(";", 1)[0]
        parts = re.split(r"([<>=!~]+)", requirement, maxsplit=1)
        name = parts[0].strip()
        version = ''.join(parts[1:]).strip() if len(parts) > 1 else "latest-version-available"
        result[name] = version

    return result

def parse_file_content(content, sha, filename, type):
    """Parse one manifest and record its dependencies for commit ``sha``.

    Supported inputs: ``requirements.txt``, ``setup.py`` and
    ``pyproject.toml`` (``type == "py"``) and ``package.json``
    (``type == "js"``, ``dependencies`` plus ``devDependencies``).

    The result is stored in the global ``dependency_changes[sha]`` as
    ``{"filename", "dependencies", "ecosystem"}``; nothing is returned.

    NOTE(review): the store is keyed by commit only, so when one commit
    changes several manifests, only the one parsed last is kept.

    Args:
        content: Text of the manifest.
        sha: Commit hash the content belongs to.
        filename: Path of the manifest inside the repository.
        type: Ecosystem tag, ``"py"`` or ``"js"``.
    """
    dependencies = {}

    if type == "py" and "requirements.txt" in filename:
        for raw_line in content.splitlines():
            line = raw_line.strip()

            # Ignore blanks, comments and option lines (-r, -e, --hash, --index-url, ...).
            if not line or line.startswith('#') or line.startswith('-'):
                continue

            line = line.split('#', 1)[0]                    # inline comment
            line = re.split(r"\s--", line, maxsplit=1)[0]   # per-requirement options such as --hash
            line = line.split(';', 1)[0]                    # environment marker
            line = line.rstrip('\\').strip()                # line-continuation backslash
            if not line:
                continue

            # Split at the operator that appears first in the line; on a tie
            # the longer operator wins, so ">=" is not read as ">".
            hits = [
                (line.find(spec), -len(spec), spec)
                for spec in VERSION_SPECIFIERS
                if spec in line
            ]
            if hits:
                position, _, specifier = min(hits)
                package = line[:position].strip()
                version = line[position + len(specifier):].strip()
            else:  # No version specifier found
                package = line
                version = "latest-version-available"
            dependencies[package] = version

    elif type == "js" and "package.json" in filename:
        import json
        try:
            parsed_json = json.loads(content)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON file '{filename}': {e}")
        else:
            if isinstance(parsed_json, dict):
                for section in ("dependencies", "devDependencies"):
                    entries = parsed_json.get(section)
                    if isinstance(entries, dict):
                        dependencies.update(entries)

    elif type == "py" and "setup.py" in filename:
        dependencies = parse_setup_py(content)
    elif type == "py" and "pyproject.toml" in filename:
        dependencies = parse_pyproject_toml(content)

    # Store the parsed dependencies for this commit
    dependency_changes[sha] = {
        "filename": filename,
        "dependencies": dependencies,
        "ecosystem": type
    }


def process_commits(repo_path, commits_with_files):
    """Parse every Python/JavaScript manifest touched by each commit.

    The file-name suffix selects the ecosystem tag handed to
    ``process_commit_file_content``; files with other names are ignored.
    """
    ecosystem_by_suffix = (
        ("requirements.txt", "py"),
        ("package.json", "js"),
        ("setup.py", "py"),
        ("pyproject.toml", "py"),
    )

    for sha, files in tqdm(commits_with_files.items(), desc="Processing commits"):
        for file_path in files:
            for suffix, ecosystem in ecosystem_by_suffix:
                if file_path.endswith(suffix):
                    process_commit_file_content(repo_path, sha, file_path, ecosystem)
                    break


def save_results(output_file="python_and_javascript.json"):
    """Write the global ``dependency_changes`` to ``output_file`` as indented JSON."""
    import json

    with open(output_file, "w") as out_handle:
        json.dump(dependency_changes, out_handle, indent=4)


process_commits(repo_path, sorted_commit_hashes)
save_results()

Processing commits:   9%|▉         | 55/601 [00:04<00:44, 12.17it/s]

Error loading file 'pyproject.toml' at commit '46620c5fda243677e84b160fbbed24a2bdddec46': fatal: path 'pyproject.toml' exists on disk, but not in '46620c5fda243677e84b160fbbed24a2bdddec46'



Processing commits: 100%|██████████| 601/601 [00:43<00:00, 13.87it/s]


In [13]:
file_cache = {}

def is_gradle_related(file_path):
    """Tell whether ``file_path`` ends like a Gradle build, settings or properties file."""
    gradle_suffixes = (
        "build.gradle",
        "build.gradle.kts",
        "settings.gradle",
        "settings.gradle.kts",
        "gradle.properties",
        ".gradle",
        ".gradle.kts",
    )
    # str.endswith accepts a tuple and is true if any suffix matches.
    return file_path.endswith(gradle_suffixes)

def parse_gradle_properties(content):
    """Parse ``key=value`` lines of a ``gradle.properties`` file into a dict.

    Blank lines, ``#`` comments and lines without ``=`` are ignored; the split
    happens at the first ``=`` and both sides are stripped.
    """
    parsed = {}
    for raw_line in content.splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        name, separator, value = entry.partition("=")
        if separator:
            parsed[name.strip()] = value.strip()
    return parsed


def extract_dependencies_block(content: str) -> str:
    """Return only the content inside ``dependencies { ... }`` block(s).

    A block starts on a line that begins with ``dependencies`` and contains
    ``{``; it ends when the braces opened since then are balanced. Text after
    the opening brace on the header line is kept, so one-line blocks such as
    ``dependencies { implementation 'g:a:1' }`` are handled. For multi-line
    blocks the line holding the closing brace is part of the result.

    Args:
        content: Text of a Gradle build script.

    Returns:
        The collected lines joined with newlines ('' when there is no block).
    """
    inside = False
    depth = 0
    collected = []

    for line in content.splitlines():
        stripped = line.strip()

        if not inside and stripped.startswith("dependencies") and "{" in stripped:
            # Count only what follows the opening brace of the header.
            remainder = stripped.partition("{")[2]
            depth = 1 + remainder.count("{") - remainder.count("}")
            inside = depth > 0
            if not inside:
                # Block closed on the same line: drop its closing brace.
                remainder = remainder[:remainder.rfind("}")]
            if remainder.strip():
                collected.append(remainder)
            continue

        if inside:
            depth += stripped.count("{") - stripped.count("}")
            collected.append(line)
            if depth <= 0:
                inside = False

    return "\n".join(collected)


def parse_gradle_dependencies(file_content: str, properties: Dict[str, str], commit_sha: str, file_path: str) -> List[Dict[str, Any]]:
    """Extract ``group:artifact:version`` coordinates from a Gradle script.

    Only text inside ``dependencies { ... }`` is scanned. A version written as
    ``$name`` / ``${name}`` is replaced by ``properties[name]`` when known.
    Entries that still contain ``$`` in group, artifact or version are dropped.

    Returns:
        A list of dicts with the keys ``commit``, ``file``, ``group``,
        ``artifact`` and ``version``: all Kotlin-DSL matches first, then all
        Groovy-DSL matches.
    """
    block = extract_dependencies_block(file_content)

    # Patterns for Kotlin + Groovy
    patterns = [
        re.compile(r'(?P<config>\w+)\s*\(\s*["\'](?P<group>[^:\'"]+):(?P<artifact>[^:\'"]+):(?P<version>[^\'"]+)["\']\s*\)'),  # Kotlin DSL
        re.compile(r'(?P<config>\w+)\s+["\'](?P<group>[^:\'"]+):(?P<artifact>[^:\'"]+):(?P<version>[^\'"]+)["\']')  # Groovy DSL
    ]

    found = []
    for pattern in patterns:
        for match in pattern.finditer(block):
            group, artifact, version = match.group("group", "artifact", "version")

            if version.startswith("$") or "${" in version:
                # "$name" / "${name}" -> "name"; keep the raw text when unknown.
                key = version.replace("$", "").replace("{", "").replace("}", "")
                if key in properties:
                    version = properties[key]

            if any("$" in part for part in (group, artifact, version)):
                continue  # still contains an unresolved placeholder

            found.append({
                "commit": commit_sha,
                "file": file_path,
                "group": group,
                "artifact": artifact,
                "version": version,
            })

    return found


def load_file_at_commit(repo, commit_hash, file_path):
    """Read ``file_path`` as UTF-8 text from the tree of ``commit_hash``.

    On success the text is also stored in ``file_cache[file_path]``. Any
    failure is printed and reported as ``None``.
    """
    try:
        tree = repo.commit(commit_hash).tree
        raw_bytes = (tree / file_path).data_stream.read()
        text = raw_bytes.decode('utf-8')
        file_cache[file_path] = text
    except Exception as e:
        print(f"[DEBUG] Error loading file '{file_path}' at commit '{commit_hash}': {e}")
        return None
    return text
    
def parse_local_kotlin_variables(content: str) -> Dict[str, str]:
    """Collect ``val``/``var`` declarations that are assigned a quoted string.

    Example: ``val log4jVersion = "2.20.0"`` yields
    ``{"log4jVersion": "2.20.0"}``. A later declaration of the same name
    overrides an earlier one.
    """
    declaration = re.compile(r'(val|var)\s+(\w+)\s*=\s*["\']([^"\']+)["\']')
    return {found.group(2): found.group(3) for found in declaration.finditer(content)}

def process_commit(repo, commit_hash, changed_files, dependencies_snapshot, all_dependencies, properties_by_commit):
    """Process the Gradle files changed by one commit.

    For every Gradle-related file that can be loaded:
      * its raw content is stored in ``dependencies_snapshot[file_path]``;
      * ``gradle.properties`` replaces ``properties_by_commit[commit_hash]``;
      * any other ``.gradle`` / ``.gradle.kts`` file adds its local
        ``val``/``var`` strings to the commit's properties and appends its
        parsed dependencies to ``all_dependencies``.

    All three containers are mutated in place.

    Returns:
        The tuple ``(dependencies_snapshot, all_dependencies)``.
    """
    for file_path in changed_files:
        if not is_gradle_related(file_path):
            continue

        content = load_file_at_commit(repo, commit_hash, file_path)
        if not content:
            continue

        dependencies_snapshot[file_path] = content

        if file_path.endswith("gradle.properties"):
            properties_by_commit[commit_hash] = parse_gradle_properties(content)
            continue

        if not file_path.endswith((".gradle", ".gradle.kts")):
            continue

        # Properties known for this commit so far, extended by the script's
        # own val/var = "..." declarations.
        commit_props = properties_by_commit.setdefault(commit_hash, {})
        commit_props.update(parse_local_kotlin_variables(content))

        all_dependencies.extend(
            parse_gradle_dependencies(content, commit_props, commit_hash, file_path)
        )

    return dependencies_snapshot, all_dependencies

def process_commits(repo, commits_with_changes):
    """Replay the commits in order and collect Gradle snapshots and dependencies.

    Returns:
        ``(history, dependencies)``: ``history`` maps each commit SHA to a
        shallow copy of the path -> raw content snapshot after that commit,
        and ``dependencies`` is the flat list of parsed dependency records
        across all commits.
    """
    history = {}
    snapshot = {}
    collected = []
    props_per_commit = {}

    progress = tqdm(commits_with_changes.items(), desc="Processing commits")
    for commit_hash, changed_files in progress:
        snapshot, collected = process_commit(
            repo, commit_hash, changed_files, snapshot, collected, props_per_commit
        )
        history[commit_hash] = dict(snapshot)

    return history, collected


# Run the Gradle pass over the same commit list; this rebinds
# dependencies_over_time, previously holding the pom.xml snapshots.
dependencies_over_time, all_dependencies = process_commits(repo, sorted_commit_hashes)

# Optionally print the parsed dependencies
#for dep in all_dependencies:
    #print(dep)

# Keep only fully resolved coordinates (no "$" placeholder left).
# parse_gradle_dependencies() already applies the same filter.
cleaned_dependencies = [
    dep for dep in all_dependencies
    if "$" not in dep["group"]
    and "$" not in dep["artifact"]
    and "$" not in dep["version"]
]

# Preview the first ten records.
for dep in cleaned_dependencies [:10]:
    print(dep)

Processing commits: 100%|██████████| 601/601 [00:00<00:00, 301204.05it/s]


In [14]:
def group_gradle_dependencies(dependencies):
    """Nest flat dependency records as commit -> "gradle" -> file -> {key: version}.

    The key is ``"group:artifact"``. The result is built from nested
    ``defaultdict`` objects; later records override earlier ones with the same
    commit, file and key.
    """
    grouped = defaultdict(lambda: {"gradle": defaultdict(dict)})

    for record in dependencies:
        commit, file = record["commit"], record["file"]
        coordinate = f"{record['group']}:{record['artifact']}"
        version = record["version"]
        grouped[commit]["gradle"][file][coordinate] = version

    return grouped

grouped_dependencies = group_gradle_dependencies(cleaned_dependencies)
import json
# Convert the defaultdict to a normal dict recursively
def to_dict(d):
    """Recursively rebuild dict-like values (including ``defaultdict``) as plain dicts.

    Anything that is not a dict, lists included, is returned unchanged.
    """
    if not isinstance(d, dict):
        return d
    # defaultdict is a dict subclass, so one branch covers both cases.
    return {key: to_dict(value) for key, value in d.items()}

#print(json.dumps(to_dict(grouped_dependencies), indent=2))

def save_results(data, output_file="dependencies_over_time.json"):
    """Write ``data`` to ``output_file`` as JSON indented by four spaces."""
    import json

    with open(output_file, "w") as out_handle:
        json.dump(data, out_handle, indent=4)

save_results(to_dict(grouped_dependencies), "java_gradle.json")

In [15]:
def merge_dependencies(python_and_javascript: Dict, java: Dict, gradle: Dict) -> Dict:
    """Combine the three per-commit result sets into one structure.

    Every commit present in any input gets the four sections ``java``,
    ``gradle``, ``python`` and ``javascript`` (file -> dependencies); sections
    without data stay empty dicts.

    Args:
        python_and_javascript: commit -> {"filename", "dependencies", "ecosystem"}.
        java: commit -> {pom path -> dependencies}.
        gradle: commit -> {"gradle": {file -> dependencies}}.
    """
    section_for_ecosystem = {"py": "python", "js": "javascript"}
    merged = {}

    for commit in set(python_and_javascript) | set(java) | set(gradle):
        sections = {"java": {}, "gradle": {}, "python": {}, "javascript": {}}
        merged[commit] = sections

        if commit in java:
            sections["java"].update(java[commit])

        if commit in gradle:
            # Only the inner "gradle" mapping is carried over.
            sections["gradle"].update(gradle[commit].get("gradle", {}))

        if commit in python_and_javascript:
            record = python_and_javascript[commit]
            target = section_for_ecosystem.get(record.get("ecosystem"))
            if target is not None:
                sections[target][record["filename"]] = record["dependencies"]

    return merged

def remove_empty_objects(data):
    """Drop dict entries whose value is an empty dict, recursing into dicts and lists.

    The emptiness test runs before recursion, so a dict that only becomes
    empty after its own children were pruned is kept (as ``{}``). List items
    are processed but never removed.
    """
    if isinstance(data, list):
        return [remove_empty_objects(item) for item in data]
    if not isinstance(data, dict):
        return data

    pruned = {}
    for key, value in data.items():
        if isinstance(value, dict) and not value:
            continue  # remove empty dicts
        pruned[key] = remove_empty_objects(value)
    return pruned

def save_merged_data(merged_data: Dict, output_file: str):
    """Write ``merged_data`` to ``output_file`` as JSON indented by four spaces."""
    with open(output_file, 'w') as out_handle:
        json.dump(merged_data, out_handle, indent=4)


# Reload the three intermediate result files written by the earlier cells.
with open('python_and_javascript.json', 'r') as f:
    python_and_javascript = json.load(f)

with open('java_pom.json', 'r') as f:
    java = json.load(f)

with open('java_gradle.json', 'r') as f:
    gradle = json.load(f)

# Merge per commit, drop empty sections, and persist the combined structure.
merged_data = merge_dependencies(python_and_javascript, java, gradle)
new_merged_data = remove_empty_objects(merged_data)
save_merged_data(new_merged_data, 'merged_dependencies.json')


In [16]:
import json
import copy

# Load input data: commit -> system -> filename -> {dependency: version}
with open("merged_dependencies.json", "r") as f:
    data = json.load(f)

# Your commit-to-date mapping
commit_dates = commits_with_date  # assumed to be defined already

# Ensure chronological commit order. The key is the "YYYY-MM-DD HH:MM:SS"
# string; a commit missing from commit_dates yields None as key, which
# cannot be compared with the strings.
sorted_commits = sorted(data.keys(), key=lambda c: commit_dates.get(c))

# Global latest version per dependency
latest_versions = {}

# Last known dependency set per file
last_file_state = {}  # (system, filename) -> {dep: version}

# All detected changes. Only "added" and "updated" entries are produced below;
# dependencies that disappear from a file are not recorded.
changes = []

for commit in sorted_commits:
    date = commit_dates.get(commit)
    commit_data = data[commit]

    for system, files in commit_data.items():
        for filename, curr_deps in files.items():
            file_key = (system, filename)
            prev_deps = last_file_state.get(file_key, {})

            # Check for added/updated (relative to the previous state of the same file)
            for dep, new_version in curr_deps.items():
                # NOTE(review): old_version is assigned but never read; the
                # "updated" entry takes its old version from prev_deps.
                old_version = latest_versions.get(dep)
                if dep not in prev_deps:
                    changes.append({
                        "commit": commit,
                        "date": date,
                        "filename": filename,
                        "system": system,
                        "dependency": dep,
                        "change_type": "added",
                        "old_version": None,
                        "new_version": new_version
                    })
                elif prev_deps[dep] != new_version:
                    changes.append({
                        "commit": commit,
                        "date": date,
                        "filename": filename,
                        "system": system,
                        "dependency": dep,
                        "change_type": "updated",
                        "old_version": prev_deps[dep],
                        "new_version": new_version
                    })

                # Always update the global version
                latest_versions[dep] = new_version

            # Save current state for next round
            last_file_state[file_key] = copy.deepcopy(curr_deps)

# Save all detected changes
with open("dependency_changes_with_removed.json", "w") as f:
    json.dump(changes, f, indent=2)


In [22]:
import json
import requests
import urllib.parse
from datetime import datetime
from collections import defaultdict
from packaging.version import Version, InvalidVersion


def normalize_system(system):
    """Translate a language label into the package-system name used for lookups.

    Known labels ("java", "javascript", "python", "go") are mapped to
    "maven", "npm", "pypi" and "go"; any other value is returned unchanged.
    """
    aliases = dict(
        java="maven",
        javascript="npm",
        python="pypi",
        go="go",
    )
    try:
        return aliases[system]
    except KeyError:
        # Unknown label: hand it back untouched.
        return system


def get_major_version(version_str):
    """Return the major component of *version_str*, or None if it cannot be parsed.

    Parsing is delegated to ``packaging.version.Version``; only
    ``InvalidVersion`` is treated as "not parseable".
    """
    try:
        parsed = Version(version_str)
    except InvalidVersion:
        return None
    return parsed.major

# In-memory cache of package metadata, indexed as cached_versions[system][package_name].
# Failed lookups are stored as an empty dict by get_cached_package_data.
cached_versions = defaultdict(dict)

def fetch_package_data(package_name, system, timeout=30):
    """Download the metadata of one package from the deps.dev v3alpha API.

    Args:
        package_name: Package name; it is fully percent-encoded (including
            "/" and "@") before being placed in the URL path.
        system: Package system segment of the URL (e.g. "npm", "pypi").
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would block the whole run.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    encoded_name = urllib.parse.quote(package_name, safe="")
    url = f"https://api.deps.dev/v3alpha/systems/{system}/packages/{encoded_name}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

def get_cached_package_data(package_name, system):
    """Return package metadata, fetching it at most once per (system, package).

    A failed fetch is reported on stdout and remembered as an empty dict, so
    the same package is not requested again.
    """
    per_system = cached_versions[system]
    if package_name in per_system:
        return per_system[package_name]

    try:
        payload = fetch_package_data(package_name, system)
    except requests.RequestException as e:
        print(f"❌ Failed to fetch data for {package_name} ({system}): {e}")
        payload = {}  # fallback to empty

    per_system[package_name] = payload
    return payload

def analyze_entry(entry):
    """Enrich one dependency-change record with release information.

    The record is modified in place and also returned. Keys added, in order:

    * ``newest_version_at_commit`` - highest parseable version published at
      or before the commit (highest major first, then highest version within
      it), or None when there is none.
    * ``released_date`` - ISO timestamp at which exactly ``new_version``
      (leading "^" stripped) was published, or None if it was not found.
    * ``latest_at_commit_time`` - the most recently published version at or
      before the commit. For npm, parseable versions whose major differs from
      that of ``new_version`` are ignored.
    * ``latency`` - how many parseable versions published by commit time are
      strictly greater than the baseline, or -1 when that cannot be computed.
      The baseline is ``new_version``; for a caret range it is raised to the
      highest same-major version available at commit time.

    Args:
        entry: Dict with at least "dependency", "new_version", "date"
            (formatted "%Y-%m-%d %H:%M:%S") and "system".

    Returns:
        The same ``entry`` dict, enriched.
    """
    package_name = entry["dependency"]
    # Tolerate a missing/None version instead of failing on ``.startswith``;
    # an empty string simply yields no release date and a latency of -1.
    raw_new = entry["new_version"] or ""
    has_caret = raw_new.startswith("^")
    new_version = raw_new.lstrip("^")
    commit_date = datetime.strptime(entry["date"], "%Y-%m-%d %H:%M:%S")
    system = normalize_system(entry["system"])

    # Cached per package, so repeated entries do not trigger new requests.
    package_data = get_cached_package_data(package_name, system)

    # Parse every usable release exactly once as
    # (version string, parsed Version or None, publish datetime).
    # Records lacking a version or a parseable publish date are skipped in
    # one place, so both computations below treat malformed data the same way.
    releases = []
    for v in package_data.get("versions", []):
        ver_str = v.get("versionKey", {}).get("version")
        pub_str = v.get("publishedAt")
        if not ver_str or not pub_str:
            continue
        try:
            pub_date = datetime.strptime(pub_str, "%Y-%m-%dT%H:%M:%SZ")
        except ValueError:
            continue
        try:
            parsed = Version(ver_str)
        except InvalidVersion:
            parsed = None
        releases.append((ver_str, parsed, pub_date))

    # Parseable versions that already existed when the commit was made.
    commit_versions = [
        parsed
        for _, parsed, pub_date in releases
        if parsed is not None and pub_date <= commit_date
    ]

    if commit_versions:
        # highest major first, then the highest full version within it
        max_major = max(v.major for v in commit_versions)
        highest = max(v for v in commit_versions if v.major == max_major)
        entry["newest_version_at_commit"] = str(highest)
    else:
        entry["newest_version_at_commit"] = None

    is_npm = (system == "npm")
    target_major = get_major_version(new_version) if is_npm else None

    released_date = None
    latest_at_commit_time = None
    latest_publish_date = None

    for ver_str, parsed, pub_date in releases:
        # npm: only consider releases with the same major version; releases
        # whose version string cannot be parsed are not filtered out.
        if is_npm and parsed is not None and parsed.major != target_major:
            continue

        # Publication time of the exact version introduced by the change.
        if ver_str == new_version:
            released_date = pub_date.isoformat()

        # Most recently published release at or before the commit.
        if pub_date <= commit_date:
            if latest_publish_date is None or pub_date > latest_publish_date:
                latest_publish_date = pub_date
                latest_at_commit_time = ver_str

    entry["released_date"] = released_date
    entry["latest_at_commit_time"] = latest_at_commit_time

    try:
        baseline = Version(new_version)
    except InvalidVersion:
        baseline = None

    # A caret range floats within its major, so measure from the highest
    # same-major version that was available at commit time.
    if has_caret and baseline is not None:
        same_major = [v for v in commit_versions if v.major == baseline.major]
        if same_major:
            baseline = max(same_major)

    if baseline is not None and commit_versions:
        # count how many versions published <= commit_date are strictly > baseline
        entry["latency"] = sum(1 for v in commit_versions if v > baseline)
    else:
        entry["latency"] = -1
    return entry

def process_json_file(input_path, output_path):
    """Enrich every change record in *input_path* and write them to *output_path*.

    The input is a JSON list of records; each one is passed through
    ``analyze_entry`` and a progress line is printed per record. The enriched
    list is written as indented JSON.
    """
    with open(input_path, "r") as src:
        records = json.load(src)

    count = len(records)
    results = []
    position = 0
    for record in records:
        position += 1
        analyzed = analyze_entry(record)
        results.append(analyzed)
        dep = analyzed.get("dependency", "???")
        print(f"[{position}/{count}] {dep} → done")

    with open(output_path, "w") as dst:
        json.dump(results, dst, indent=2)

# Run: read the detected dependency changes, enrich each record via
# analyze_entry, and write the result to a separate JSON file.
process_json_file("dependency_changes_with_removed.json", "dependency_changes_enriched.json")


[1/1663] webpack → done
[2/1663] webpack-cli → done
[3/1663] @babel/core → done
[4/1663] @babel/preset-env → done
[5/1663] babel-loader → done
[6/1663] eslint → done
[7/1663] eslint-plugin-compat → done
[8/1663] script-loader → done
[9/1663] jquery → done
[10/1663] select2 → done
[11/1663] css-loader → done
[12/1663] mini-css-extract-plugin → done
[13/1663] style-loader → done
[14/1663] autoprefixer → done
[15/1663] extract-loader → done
[16/1663] file-loader → done
[17/1663] node-sass → done
[18/1663] postcss-clean → done
[19/1663] postcss-loader → done
[20/1663] sass-lint → done
[21/1663] sass-loader → done
[22/1663] webpack-fix-style-only-entries → done
[23/1663] @yaireo/tagify → done
[24/1663] sass-lint → done
[25/1663] @yaireo/tagify → done
[26/1663] node-sass → done
[27/1663] jquery → done
[28/1663] sass-lint → done
[29/1663] d3 → done
[30/1663] d3-flextree → done
[31/1663] @yaireo/tagify → done
[32/1663] d3 → done
[33/1663] d3-flextree → done
[34/1663] select2 → done
[35/1663] @

In [None]:
import json
import requests
import urllib.parse
from datetime import datetime
from collections import defaultdict
from packaging.version import Version, InvalidVersion

def normalize_system(system):
    """Map a language label to its package-system name; pass other values through.

    "java" -> "maven", "javascript" -> "npm", "python" -> "pypi", "go" -> "go".
    """
    known = {"java": "maven", "javascript": "npm", "python": "pypi", "go": "go"}
    return known[system] if system in known else system

def get_major_version(version_str):
    """Return the major number of *version_str*, or None when it is not a valid version.

    Validity is decided by ``packaging.version.Version``.
    """
    major = None
    try:
        major = Version(version_str).major
    except InvalidVersion:
        pass  # leave the result as None
    return major

# Per-system cache of fetched package metadata: cached_versions[system][package_name].
# get_cached_package_data stores an empty dict here when a fetch fails.
cached_versions = defaultdict(dict)

def fetch_package_data(package_name, system, timeout=30):
    """Fetch one package's metadata from the deps.dev v3alpha API.

    Args:
        package_name: Package name; every reserved character (including "/"
            and "@") is percent-encoded before it is put in the URL path.
        system: Package system used in the URL path (e.g. "npm", "pypi").
        timeout: Seconds to wait for the server. ``requests`` has no default
            timeout, so without one a stalled connection never returns.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.RequestException: For network failures, timeouts, and
            non-success HTTP statuses (raised by ``raise_for_status``).
    """
    encoded_name = urllib.parse.quote(package_name, safe="")
    url = f"https://api.deps.dev/v3alpha/systems/{system}/packages/{encoded_name}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()

def get_cached_package_data(package_name, system):
    """Look up package metadata in the cache, fetching it on first use.

    When the fetch raises ``requests.RequestException`` the failure is printed
    and an empty dict is cached and returned instead.
    """
    bucket = cached_versions[system]
    if package_name not in bucket:
        try:
            fetched = fetch_package_data(package_name, system)
        except requests.RequestException as e:
            print(f"❌ Failed to fetch data for {package_name} ({system}): {e}")
            fetched = {}  # fallback to empty
        bucket[package_name] = fetched
    return bucket[package_name]

def analyze_entry(entry):
    """Add release and latency information to one dependency-change record.

    The record is updated in place and returned. Keys written, in order:

    * ``newest_version_at_commit`` - highest parseable version published at
      or before the commit (highest major, then highest version in it), or
      None.
    * ``released_date`` - ISO publish timestamp of exactly ``new_version``
      (without a leading "^"), or None if no such release is listed.
    * ``latest_at_commit_time`` - the release with the latest publish time
      that is not after the commit. For npm, parseable releases with a major
      different from ``new_version``'s are skipped.
    * ``latency`` - number of parseable versions available at commit time
      that are strictly newer than the baseline; -1 if it cannot be computed.
      With a caret range the baseline becomes the highest same-major version
      available at commit time.

    Args:
        entry: Dict containing "dependency", "new_version", "date" (in
            "%Y-%m-%d %H:%M:%S" format) and "system".

    Returns:
        The enriched ``entry`` (same object that was passed in).
    """
    package_name = entry["dependency"]
    # A None/missing version would crash on ``.startswith``; treat it as an
    # empty string, which results in no release date and latency -1.
    raw_new = entry["new_version"] or ""
    has_caret = raw_new.startswith("^")
    new_version = raw_new.lstrip("^")
    commit_date = datetime.strptime(entry["date"], "%Y-%m-%d %H:%M:%S")
    system = normalize_system(entry["system"])

    pkg_data = get_cached_package_data(package_name, system)

    # Normalise the raw release list once: (version string, Version or None,
    # publish datetime). Records without a version or with a publish date
    # that does not match the expected format are dropped here, so the later
    # steps can no longer fail on malformed records.
    parsed_releases = []
    for raw in pkg_data.get("versions", []):
        ver = raw.get("versionKey", {}).get("version")
        pub_str = raw.get("publishedAt")
        if not ver or not pub_str:
            continue
        try:
            pub_dt = datetime.strptime(pub_str, "%Y-%m-%dT%H:%M:%SZ")
        except ValueError:
            continue
        try:
            parsed = Version(ver)
        except InvalidVersion:
            parsed = None
        parsed_releases.append((ver, parsed, pub_dt))

    # Parseable versions published at or before the commit.
    commit_versions = [
        parsed
        for _, parsed, pub_dt in parsed_releases
        if parsed is not None and pub_dt <= commit_date
    ]

    # Newest overall at commit time (highest major, then highest within it).
    if commit_versions:
        max_major = max(v.major for v in commit_versions)
        highest = max(v for v in commit_versions if v.major == max_major)
        entry["newest_version_at_commit"] = str(highest)
    else:
        entry["newest_version_at_commit"] = None

    is_npm = (system == "npm")
    target_major = get_major_version(new_version) if is_npm else None

    released_date = None
    latest_publish_date = None
    latest_at_commit_time = None

    for ver, parsed, pub_dt in parsed_releases:
        # npm-only: restrict to the same major; unparseable versions pass.
        if is_npm and parsed is not None and parsed.major != target_major:
            continue

        if ver == new_version:
            released_date = pub_dt.isoformat()

        if pub_dt <= commit_date and (
            latest_publish_date is None or pub_dt > latest_publish_date
        ):
            latest_publish_date = pub_dt
            latest_at_commit_time = ver

    entry["released_date"] = released_date
    entry["latest_at_commit_time"] = latest_at_commit_time

    # Caret-aware latency.
    try:
        baseline = Version(new_version)
    except InvalidVersion:
        baseline = None

    # With "^", measure from the highest same-major version at commit time.
    if has_caret and baseline is not None:
        same_major = [v for v in commit_versions if v.major == baseline.major]
        if same_major:
            baseline = max(same_major)

    if baseline is not None and commit_versions:
        entry["latency"] = sum(1 for v in commit_versions if v > baseline)
    else:
        entry["latency"] = -1

    return entry

def process_json_file(input_path, output_path):
    """Run ``analyze_entry`` over a JSON list of records and save the results.

    Reads the list from *input_path*, prints one progress line per record,
    and writes the enriched list to *output_path* as indented JSON.
    """
    with open(input_path, "r") as infile:
        entries = json.load(infile)

    total = len(entries)
    enriched = []
    for idx in range(total):
        entry = entries[idx]
        enriched.append(analyze_entry(entry))
        i = idx + 1  # progress counter is 1-based
        print(f"[{i}/{total}] {entry['dependency']} → done")

    with open(output_path, "w") as outfile:
        json.dump(enriched, outfile, indent=2)

# Run: load the detected dependency changes, enrich every record via
# analyze_entry, and write the enriched list to a separate JSON file.
process_json_file(
    "dependency_changes_with_removed.json",
    "dependency_changes_enriched.json"
)


[1/1663] webpack → done
[2/1663] webpack-cli → done
[3/1663] @babel/core → done
[4/1663] @babel/preset-env → done
[5/1663] babel-loader → done
[6/1663] eslint → done
[7/1663] eslint-plugin-compat → done
[8/1663] script-loader → done
[9/1663] jquery → done
[10/1663] select2 → done
[11/1663] css-loader → done
[12/1663] mini-css-extract-plugin → done
[13/1663] style-loader → done
[14/1663] autoprefixer → done
[15/1663] extract-loader → done
[16/1663] file-loader → done
[17/1663] node-sass → done
[18/1663] postcss-clean → done
[19/1663] postcss-loader → done
[20/1663] sass-lint → done
[21/1663] sass-loader → done
[22/1663] webpack-fix-style-only-entries → done
[23/1663] @yaireo/tagify → done
[24/1663] sass-lint → done
[25/1663] @yaireo/tagify → done
[26/1663] node-sass → done
[27/1663] jquery → done
[28/1663] sass-lint → done
[29/1663] d3 → done
[30/1663] d3-flextree → done
[31/1663] @yaireo/tagify → done
[32/1663] d3 → done
[33/1663] d3-flextree → done
[34/1663] select2 → done
[35/1663] @