In [100]:
import json
import subprocess
import time

from icecream import ic
from collections import deque


In [101]:
class SystemMapping:
    settings = json.load(open("../settings/settings.json"))

    @classmethod
    def map_all(cls):
        psql_results = cls.map_postgres()
        os_results = cls.map_os()

        return os_results, psql_results

    @classmethod
    def map_postgres(cls):
        postgres_settings = cls.settings.get("tools", "").get("postgres", "")
        username = postgres_settings.get("username", "")
        databases = postgres_settings.get("databases", [])  # list of Databses
        mapping_tables_command = postgres_settings.get("mapping_tables_command", [])  # list of command parts

        table_results = []

        mapping_tables_command[2] = username


        if databases and mapping_tables_command and username:
            for database in databases:
                mapping_tables_command[5] = database

                try:
                    # Store the result of subprocess.run()
                    result = subprocess.run(
                        mapping_tables_command,
                        capture_output=True,
                        text=True,
                        check=True
                    )

                    # Append both database name and command output to psql_results
                    table_results.append({
                        "database": database,
                        "tables_table": result.stdout
                    })

                except subprocess.CalledProcessError as e:
                    ic()
                    ic(f"stdout: {e.stdout}")
                    ic(f"stderr: {e.stderr}")
                    return []
                except Exception as e:
                    ic()
                    ic(e)
                    return []

        return table_results

    @classmethod
    def map_os(cls):
        os_mapping_vars = cls.settings.get("os_mapping")
        tree_command = os_mapping_vars.get("tree_command", None)
        tree_file_path = os_mapping_vars.get("tree_file_path", "../database/system_tree.json")
        delete_tree_command = os_mapping_vars.get("delete_tree_command", None)

        if any(var is None for var in [tree_command, tree_file_path, delete_tree_command]):
            ic()
            ic("Fucking Issues!!!")
            return {}

        tree_command.append(tree_file_path)
        delete_tree_command.append(tree_file_path)

        if tree_command:
            try:
                # Try to delete existing tree file
                subprocess.run(
                    delete_tree_command,
                    capture_output=True,
                    text=True,
                    check=True
                )

                try:
                    # Generate new tree file
                    subprocess.run(
                        tree_command,
                        capture_output=True,
                        text=True,
                        check=True
                    )
                    # Process the generated tree file
                    return cls.process_os_mapping(tree_file_path)

                except subprocess.CalledProcessError as e:
                    ic()
                    ic(f"stdout: {e.stdout}")
                    ic(f"stderr: {e.stderr}")
                    return {}
                except Exception as e:
                    ic()
                    ic(f"error: {e}")
                    return {}

            except subprocess.CalledProcessError as e:
                # If deletion fails, try to generate anyway
                try:
                    subprocess.run(
                        tree_command,
                        capture_output=True,
                        text=True,
                        check=True
                    )
                    # Process the generated tree file
                    return cls.process_os_mapping(tree_file_path)

                except subprocess.CalledProcessError as e:
                    ic()
                    ic(f"stdout: {e.stdout}")
                    ic(f"stderr: {e.stderr}")
                    return {}
                except Exception as e:
                    ic()
                    ic(f"error: {e}")
                    return {}

            except Exception as e:
                # If another exception occurs during deletion, try to generate anyway
                try:
                    subprocess.run(
                        tree_command,
                        capture_output=True,
                        text=True,
                        check=True
                    )
                    # Process the generated tree file
                    return cls.process_os_mapping(tree_file_path)

                except Exception as e:
                    ic()
                    ic(f"error: {e}")
                    return {}

        else:
            ic()
            ic("Fucking Else!#############################")
            return {}

    @classmethod
    def process_os_mapping_old(cls, tree_file_path=None):
        if tree_file_path is None:
            tree_file_path = cls.settings.get("tree_file_path", "../database/system_tree.json")

        try:
            with open(tree_file_path, 'r') as file:
                json_output = json.load(file)

            base_tree, report = json_output
            root_dirs = base_tree.get('contents', None)

            if not root_dirs:
                ic()
                ic(f"Empty root directory list: {root_dirs}")
                return {}

            directory_dict = {}
            empty_directories = 0
            directories = []

            for root_dir in root_dirs:
                if root_dir["type"] == "directory":
                    directories.append(root_dir["name"])
                if root_dir["type"] == "directory" and root_dir.get("contents", False):
                    directory_dict[root_dir["name"]] = [item["name"] for item in root_dir["contents"] if
                                                        item["type"] == "file"]
                    root_dirs.extend([item for item in root_dir["contents"] if item["type"] == "directory"])

                elif root_dir["type"] == "directory" and not root_dir.get("contents", False):
                    directory_dict[root_dir["name"]] = []
                    empty_directories += 1

            biggie_count = 0
            for path, content in directory_dict.items():
                if len(content) > 2000:
                    biggie_count += 1

            ic()
            ic(biggie_count)

            return directory_dict

        except Exception as e:
            ic()
            ic(f"error: {e}")
            return {}

    @classmethod
    def process_os_mapping(cls, tree_file_path=None):
        if tree_file_path is None:
            tree_file_path = cls.settings.get("tree_file_path", "./database/system_tree.json")

        try:
            with open(tree_file_path, 'r') as file:
                json_output = json.load(file)

            base_tree, report = json_output
            root_dirs = base_tree.get('contents', None)

            if not root_dirs:
                ic()
                ic(f"Empty root directory list: {root_dirs}")
                return {}

            directory_dict = {}
            empty_directories = 0
            directories = []

            for directory in root_dirs:
                if type(directory) == dict:
                    if directory["type"] == "directory":
                        directories.append(directory["name"])
                    if directory["type"] == "directory" and directory.get("contents", False):
                        for item in directory["contents"]:
                            if item["type"] == "file":
                                item_name = item["name"].split("/")[-1]
                                directory_dict[item["name"]] = {"filetype": item["type"], "item": item_name}
                            elif item["type"] == "directory":
                                root_dirs.append(item)


                elif directory["type"] == "directory" and not directory.get("contents", False):
                        item_name = directory["name"].split("/")[-1]
                        directory_dict[directory["name"]] = {"filetype": directory["type"], "item": item_name}
                        empty_directories += 1

            # Remove Big Items for now. Split up later on.
            for path, content in directory_dict.items():
                if len(content) > 2000:
                    del directory_dict[path]

            ic()
            ic(len(directory_dict))

            return directory_dict

        except Exception as e:
            ic()
            ic(f"error: {e}")
            return {}

    def fast_process_os_mapping(self, tree_file_path):
        data, _ = json.loads(open(tree_file_path, 'rb').read())
        queue = deque(data.get("contents", []))
        directory_dict = {}

        while queue:
            node = queue.popleft()
            if node["type"] != "directory":
                continue

            # collect file names in this directory
            files = [c["name"] for c in node.get("contents", [])
                     if c["type"] == "file"]
            directory_dict[node["name"]] = files

            # enqueue subdirectories
            for c in node.get("contents", []):
                if c["type"] == "directory":
                    queue.append(c)

        return directory_dict


In [102]:
# Path to your system tree file (adjust if necessary)
tree_file_path = "../database/system_tree.json"

# Measure time for process_os_mapping
print("Start process_os_mapping...")
start_time = time.time()
result_process_os = SystemMapping.process_os_mapping(tree_file_path)
end_time = time.time()
process_os_mapping_time = end_time - start_time

# Measure time for fast_process_os_mapping
print("Start fast_process_os_mapping...")
system_mapping_instance = SystemMapping()  # fast_process_os_mapping is an instance method
start_time = time.time()
result_process_os_old = system_mapping_instance.process_os_mapping_old(tree_file_path)
end_time = time.time()
fast_process_os_mapping_time = end_time - start_time

# Print the timing results
print(f"Time taken by process_os_mapping: {process_os_mapping_time:.6f} seconds")
print(f"Time taken by fast_process_os_mapping: {fast_process_os_mapping_time:.6f} seconds")

# Optional: Check if results are identical
print(f"Results are identical: {result_process_os == result_process_os_old}")

Start process_os_mapping...


ic| 572392395.py:236 in process_os_mapping() at 10:12:05.941
ic| len(directory_dict): 877198


Start fast_process_os_mapping...


ic| 572392395.py:182 in process_os_mapping_old() at 10:12:09.974
ic| biggie_count: 10


Time taken by process_os_mapping: 5.050888 seconds
Time taken by fast_process_os_mapping: 4.328471 seconds
Results are identical: False


In [103]:
result_process_os
print(type(result_process_os))

<class 'dict'>


In [85]:
import chromadb
from chromadb.config import Settings

client = chromadb.PersistentClient(
    path="/home/m/PycharmProjects/terminAl/database/vector_db",
    settings=Settings(anonymized_telemetry=False)
)

collection = client.get_collection("Main_Collection")
num_entries = collection.count()
print(f"Number of entries: {num_entries}")


Number of entries: 1


In [104]:
import torch
from tqdm import tqdm
from chromadb.utils import embedding_functions
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Set up embedding function
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_function = SentenceTransformerEmbeddingFunction(
    model_name="intfloat/multilingual-e5-small",
    device=device
)

# Prepare list of keys (file paths)
file_paths = list(result_process_os.keys())
print(len(file_paths))

# Embed with progress bar
embeddings = []
for path in tqdm(file_paths, desc="Embedding paths"):
    embed = embedding_function([path])[0]  # Model returns a list
    embeddings.append((path, embed))

# Example output
print(f"\nEmbedded {len(embeddings)} paths. First 2 results:")
for path, vector in embeddings[:2]:
    print(f"{path} → Vector dim: {len(vector)}")


877198


Embedding paths:   8%|▊         | 72066/877198 [11:37<2:09:57, 103.25it/s]


KeyboardInterrupt: 

In [None]:
import os
import torch
from tqdm import tqdm
import chromadb
from chromadb.config import Settings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Your input data
# result_process_os = {'/path/file1': {...}, '/path/file2': {...}, ...}
# Replace this with your actual data source
result_process_os = {
    "/etc/nginx/nginx.conf": {"filetype": "file", "item": "nginx.conf"},
    "/usr/bin/python3": {"filetype": "file", "item": "python3"},
    "/var/log/syslog": {"filetype": "file", "item": "syslog"},
}

# -------------------
# 1. Set up embedding
# -------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_function = SentenceTransformerEmbeddingFunction(
    model_name="intfloat/multilingual-e5-small",
    device=device
)

file_paths = list(result_process_os.keys())
print(f"Total files: {len(file_paths)}")

# --------------------------
# 2. Compute embeddings only
# --------------------------
embeddings = []
for path in tqdm(file_paths, desc="Embedding paths"):
    emb = embedding_function([path])[0]
    embeddings.append(emb)

# -----------------------------
# 3. Insert into ChromaDB cleanly
# -----------------------------
# Set ChromaDB persistent path
CHROMA_DB_PATH = "./chroma_store"

client = chromadb.PersistentClient(path=CHROMA_DB_PATH, settings=Settings(anonymized_telemetry=False))

# Recreate the collection cleanly
collection_name = "test_collection"
try:
    client.delete_collection(collection_name)
except:
    pass  # Collection may not exist yet

collection = client.create_collection(name=collection_name)

# --------------------------
# 4. Prepare metadata & IDs
# --------------------------
metadatas = [
    {
        "filetype": meta["filetype"],
        "item": meta["item"],
        "tool": "bash"
    } for meta in result_process_os.values()
]

ids = [str(i) for i in range(len(file_paths))]

# --------------------------
# 5. Batch insert (recommended)
# --------------------------
batch_size = 1000
for i in tqdm(range(0, len(file_paths), batch_size), desc="Uploading to ChromaDB"):
    start, end = i, i + batch_size
    collection.add(
        documents=file_paths[start:end],
        embeddings=embeddings[start:end],
        metadatas=metadatas[start:end],
        ids=ids[start:end]
    )

print(f"\n✅ Successfully inserted {len(file_paths)} documents into '{collection_name}'.")
