# Clone Repos

In [1]:
from git import Repo
import os
import re
import subprocess
from tqdm import tqdm
import xml.etree.ElementTree as ET
from typing import List, Dict
import os
from collections import defaultdict

In [2]:
repo_name = "intelowlproject/IntelOwl"

repo_url = "https://github.com/" + repo_name
destination_path = "./" + repo_name
repo_path = repo_name

In [10]:
Repo.clone_from(repo_url, destination_path)

GitCommandError: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v -- https://github.com/intelowlproject/IntelOwl ./intelowlproject/IntelOwl
  stderr: 'fatal: destination path './intelowlproject/IntelOwl' already exists and is not an empty directory.
'

In [19]:
repo = Repo(destination_path)

In [11]:
def find_all_files(repo_path):
    pom_files = []
    for root, dirs, files in os.walk(repo_path):
        for file in files:
            if file == "pom.xml" or file == "requirements.txt" or file =="package.json":
                full_path = os.path.join(root, file)
                pom_files.append(full_path)
    return pom_files

In [12]:
def find_commits_with_changes(repo_path, file_paths):
    repo = Repo(repo_path)
    commits_with_changes = {}

    for file_path in file_paths:
        rel_path = os.path.relpath(file_path, repo_path)
        commits = list(repo.iter_commits(paths=rel_path))
        
        commits_with_changes[rel_path] = [commit.hexsha for commit in commits]
    
    return commits_with_changes

In [13]:
# Example usage
repo_path = "./" + repo_name  # Replace with your cloned repo path
file_paths = find_all_files(repo_path)

# Display all found pom.xml paths
for path in file_paths:
    print(path)

print(len(file_paths))

./intelowlproject/IntelOwl\frontend\package.json
./intelowlproject/IntelOwl\integrations\nuclei_analyzer\requirements.txt
./intelowlproject/IntelOwl\integrations\pcap_analyzers\requirements.txt
./intelowlproject/IntelOwl\integrations\phishing_analyzers\requirements.txt
./intelowlproject/IntelOwl\integrations\thug\requirements.txt
./intelowlproject/IntelOwl\integrations\tor_analyzers\requirements.txt
6


In [14]:
from typing import List, Dict
from git import Repo
import os
import subprocess

def get_all_relevant_commits(repo_path: str, file_paths: List[str]) -> Dict[str, List[str]]:
    repo = Repo(repo_path)
    commits_with_changes = {}

    for file_path in file_paths:
        rel_path = os.path.relpath(file_path, repo_path)

        # Get all commits that modified the file
        result = subprocess.run(
            ["git", "-C", repo_path, "log", "--follow", "--pretty=format:%H", "--name-only", "--", rel_path],
            capture_output=True,
            text=True,
            check=True
        )

        lines = result.stdout.strip().split('\n')
        current_commit = None

        for line in lines:
            if len(line) == 40:  # SHA-1 hash length
                current_commit = line.strip()
                if current_commit not in commits_with_changes:
                    commits_with_changes[current_commit] = set()  # Using set to avoid duplicates
            elif current_commit:
                modified_file = line.strip()
                if modified_file:  # Ensure we don't add empty lines
                    commits_with_changes[current_commit].add(modified_file)

    # Convert commits to a sorted list based on commit date
    commit_objects = [(repo.commit(commit_hash), commit_hash) for commit_hash in commits_with_changes.keys()]
    sorted_commits = sorted(commit_objects, key=lambda x: x[0].committed_date)
    
    # Creating a sorted dictionary of commits with their modified files
    sorted_commits_with_changes = {commit_hash: list(commits_with_changes[commit_hash]) for _, commit_hash in sorted_commits}

    return sorted_commits_with_changes


# Example usage
sorted_commit_hashes = get_all_relevant_commits(repo_path, file_paths)

# Display results
print(f"Total number of commits: {len(sorted_commit_hashes)}")
print("Commits (earliest to latest):")
for commit_hash, files in list(sorted_commit_hashes.items())[:10]:  # Displaying the first 10 for brevity
    print(f"{commit_hash}: {files}")


Total number of commits: 122
Commits (earliest to latest):
4ca770ba9f2aa37f47d1d79816cb8196982f65e3: ['integrations/peframe/requirements.txt']
d8c20f24b56cf7aafa18ccff9e577b3d72ba959b: ['integrations/peframe/requirements.txt']
9715a01c6032f78a3bef6c5d6e8ad922ab79491e: ['integrations/thug/requirements.txt']
3a0cb6e5f5a8db1e7f3e5a09291e402ee8ad2a4d: ['integrations/thug/requirements.txt', 'integrations/peframe/requirements.txt']
6aaa418fdc7845d1dc0adf4f485d217e62e027c5: ['integrations/box-js/requirements.txt']
c4e8f88327fc829d47c02217bf6ec4bfa9c8d025: ['integrations/capa/requirements.txt']
625c1c14623dbf0b800c94dc4ac93f90dd514a32: ['integrations/thug/requirements.txt', 'integrations/box-js/requirements.txt']
77898109061ffa186eaf3d7033088335aa930635: ['integrations/apk_analyzers/requirements.txt']
74878c7636d573819825834544bd5547a1a4c2e0: ['integrations/thug/requirements.txt', 'integrations/apk_analyzers/requirements.txt']
ee0627edb006b1ec083cbadb82251415c1d17a23: ['integrations/thug/requi

In [17]:
import subprocess
import os
from collections import defaultdict
import re
import requests

# Initialize cache and data structures
file_cache = {}
unique_entries = set()
dependency_changes = defaultdict(list)

# Supported version specifiers in requirements.txt
VERSION_SPECIFIERS = [
    '==', '>=', '<=', '~=', '!=', '>', '<'
]


def load_file_at_commit(repo_path, commit_hash, file_path):
    try:
        result = subprocess.run(
            ["git", "-C", repo_path, "show", f"{commit_hash}:{file_path}"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True
        )
        return result.stdout
    except subprocess.CalledProcessError as e:
        print(f"Error loading file '{file_path}' at commit '{commit_hash}': {e.stderr}")
        return None


def process_commit_file_content(repo_path, sha, file_path, type):
    content = load_file_at_commit(repo_path, sha, file_path)

    if content:
        parse_file_content(content, sha, file_path, type)


def parse_file_content(content, sha, filename, type):
    dependencies = {}
    hash_pattern = re.compile(r"--hash=sha256:[a-fA-F0-9]{64}")

    if type == "py" and "requirements.txt" in filename:
        for line in content.splitlines():
            line = line.strip()

            if not line or line.startswith('#') or line.startswith("--"):  # Ignore comments and empty lines
                continue

            # Check if line is a hash
            if hash_pattern.match(line):
                continue

            # Remove inline comments
            if '#' in line:
                line = line.split('#', 1)[0].strip()

            # Find the version specifier used
            version = "UNSPECIFIED"
            for specifier in VERSION_SPECIFIERS:
                if specifier in line:
                    package, version = line.split(specifier, 1)
                    package = package.strip()
                    version = version.strip()
                    dependencies[package] = version
                    break
            else:  # No version specifier found
                package = line
                dependencies[package] = "latest-version-available"

    elif type == "js" and "package.json" in filename:
        import json
        try:
            parsed_json = json.loads(content)
            if "dependencies" in parsed_json:
                dependencies.update(parsed_json["dependencies"])
            if "devDependencies" in parsed_json:
                dependencies.update(parsed_json["devDependencies"])
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON file '{filename}': {e}")

    # Store the parsed dependencies for this commit
    dependency_changes[sha] = {
        "filename": filename,
        "dependencies": dependencies,
        "ecosystem": type
    }


def process_commits(repo_path, commits_with_files):
    for sha, files in commits_with_files.items():
        for file_path in files:
            if file_path.endswith("requirements.txt"):
                process_commit_file_content(repo_path, sha, file_path, "py")
            elif file_path.endswith("package.json"):
                process_commit_file_content(repo_path, sha, file_path, "js")


def save_results(output_file="dependencies_over_time_py.json"):
    import json
    with open(output_file, "w") as f:
        json.dump(dependency_changes, f, indent=4)


# Example usage
# repo_path = '/path/to/your/local/repo'
# commits_with_files = {
#     '1726bfbdcf08fdf2fde2517545bfbd2ee36095e': ['package.json'],
#     '2f1798417e3ee1e1d589242cd61cf52ab09e8864': ['package.json']
# }
process_commits(repo_path, sorted_commit_hashes)
save_results()


Error loading file 'integrations/thug/requirements.txt' at commit '04ed7af5fd10dff0d163e6975939ae0842253f3d': fatal: path 'integrations/thug/requirements.txt' exists on disk, but not in '04ed7af5fd10dff0d163e6975939ae0842253f3d'

Error loading file 'integrations/apk_analyzers/requirements.txt' at commit '04ed7af5fd10dff0d163e6975939ae0842253f3d': fatal: path 'integrations/apk_analyzers/requirements.txt' does not exist in '04ed7af5fd10dff0d163e6975939ae0842253f3d'

Error loading file 'integrations/thug/requirements.txt' at commit '0035119b3610aa52dfe1846c82fe14a0feff142c': fatal: path 'integrations/thug/requirements.txt' exists on disk, but not in '0035119b3610aa52dfe1846c82fe14a0feff142c'



In [20]:
# Initialize file cache
file_cache = {}


NAMESPACE = {'mvn': 'http://maven.apache.org/POM/4.0.0'}


def parse_pom_xml(content: str, properties: Dict[str, str] = None) -> Dict[str, str]:
    if properties is None:
        properties = {}
    dependencies = {}

    try:
        root = ET.fromstring(content)

        # Register namespace if needed
        if 'xmlns' in root.attrib:
            namespace_uri = root.attrib['xmlns']
            NAMESPACE['mvn'] = namespace_uri

        # Load project properties
        for prop in root.findall(".//mvn:properties/*", NAMESPACE):
            if prop.tag and prop.text:
                prop_name = prop.tag.split('}')[-1]  # Remove namespace part
                properties[prop_name] = prop.text.strip()

        project_version = root.find(".//mvn:version", NAMESPACE)
        if project_version is not None and project_version.text:
            properties["project.version"] = project_version.text.strip()

        parent_version = root.find(".//mvn:parent/mvn:version", NAMESPACE)
        if parent_version is not None and parent_version.text:
            properties["parent.version"] = parent_version.text.strip()

        for dependency in root.findall(".//mvn:dependency", NAMESPACE):
            group_id = dependency.find("mvn:groupId", NAMESPACE)
            artifact_id = dependency.find("mvn:artifactId", NAMESPACE)
            version = dependency.find("mvn:version", NAMESPACE)

            if group_id is not None and artifact_id is not None:
                group_id = group_id.text.strip()
                artifact_id = artifact_id.text.strip()
                dep_key = f"{group_id}:{artifact_id}"

                if version is not None:
                    version_text = version.text.strip()

                    if version_text.startswith("${") and version_text.endswith("}"):
                        prop_name = version_text[2:-1]
                        resolved_version = properties.get(prop_name, "UNRESOLVED")
                        if resolved_version == "UNRESOLVED":
                            resolved_version = resolve_from_parent(dep_key, properties)
                        dependencies[dep_key] = resolved_version
                    else:
                        dependencies[dep_key] = version_text

    except ET.ParseError as e:
        print(f"XML parsing error: {e}")

    return dependencies


def resolve_from_parent(dep_key: str, properties: Dict[str, str]) -> str:
    parent_pom_path = properties.get("parent_pom_path")
    if parent_pom_path and os.path.exists(parent_pom_path):
        with open(parent_pom_path, "r") as f:
            parent_content = f.read()
            parent_properties = parse_pom_xml(parent_content, properties)
            return parent_properties.get(dep_key, "UNRESOLVED")
    return "UNRESOLVED"


def load_file_at_commit(repo, commit_hash, file_path):
    cache_key = (commit_hash, file_path)
    if cache_key in file_cache:
        return file_cache[cache_key]

    try:
        commit = repo.commit(commit_hash)
        blob = commit.tree / file_path
        content = blob.data_stream.read().decode('utf-8')
        file_cache[cache_key] = content
        return content
    except Exception as e:
        file_cache[cache_key] = None
        print(f"Error loading file '{file_path}' at commit '{commit_hash}': {e}")
        return None


def process_commit(repo, commit_hash, changed_files, dependencies_snapshot):
    for file_path in changed_files:
        if file_path.endswith("pom.xml"):
            content = load_file_at_commit(repo, commit_hash, file_path)
            if content:
                properties = {"parent_pom_path": "../pom.xml"}  # Provide the parent path if known
                new_dependencies = parse_pom_xml(content, properties)
                dependencies_snapshot[file_path] = new_dependencies
    return dependencies_snapshot


def process_commits(repo, commits_with_changes):
    dependencies_over_time = {}
    dependencies_snapshot = {}

    for commit_hash, changed_files in tqdm(commits_with_changes.items(), desc="Processing commits"):
        dependencies_snapshot = process_commit(repo, commit_hash, changed_files, dependencies_snapshot)
        dependencies_over_time[commit_hash] = dependencies_snapshot.copy()

    return dependencies_over_time


dependencies_over_time = process_commits(repo, sorted_commit_hashes)

import json

with open("dependencies_over_time.json", "w") as f:
    json.dump(dependencies_over_time, f, indent=4)


Processing commits: 100%|██████████| 122/122 [00:00<?, ?it/s]
