In [1]:
import pandas as pd
import networkx as nx

In [2]:
from iac_sketch import data, etl, sketch, transform

In [3]:
# DEBUG
import importlib

importlib.reload(data)
importlib.reload(etl)
importlib.reload(transform)
importlib.reload(sketch)
# architect = sketch.Architect("../public/components")
# registry = architect.parse()

<module 'iac_sketch.sketch' from '/Users/zhafen/repos/iac-sketch/iac_sketch/sketch.py'>

In [4]:
from sklearn.preprocessing import FunctionTransformer

In [5]:
extract_sys = etl.ExtractSystem()
registry = extract_sys.extract_entities("../public/components/*yaml")
registry

<iac_sketch.data.Registry at 0x11a31ecf0>

In [6]:
transform_sys = etl.TransformSystem()
# Normalize components in the registry
excluded_components = ["fields"]
registry = transform_sys.apply_transform(
    registry,
    transform.ComponentNormalizer(),
    components_mapping={comp: data.View(comp) for comp in registry.components if comp not in excluded_components},
)

In [7]:
# We do an outer join here because we count "fields" as an implicit flag for a component.
X = registry.view(["component", "fields"], join_on="entity", join_how="outer")
X

Unnamed: 0,entity,comp_ind,multiplicity,flag,comp_ind.fields,component
0,alternative,0.0,,,,
1,case,0.0,,,2.0,"{'case [dict[str, str]]': 'Case statement. Can..."
2,code_location_format,,,,1.0,{'code_type [str]': 'What type of code this is...
3,cohort,,,,1.0,"{'config_id [str]': 'Unique ID for a cohort.',..."
4,cohort_patient,,,,1.0,"{'patient [entity]': None, 'cohort [entity]': ..."
5,cohort_patients,,,,2.0,"{'cohort [entity]': None, 'patient [entity]': ..."
6,cohort_patients_history,,,,2.0,"{'cohort [entity]': None, 'patient [entity]': ..."
7,component,0.0,1.0,,2.0,{'multiplicity [str]': {'description': 'Allowe...
8,databricks_workspace,,,,1.0,"{'url [str]': 'Unique URL for the workspace.',..."
9,dependency,0.0,,,,


In [8]:
# Add in all components defined in the registry
# and mark the ones that are not defined
X["defined"] = True
registry_comps = pd.DataFrame({"entity": registry.keys()})
X = X.merge(registry_comps, how="outer", on="entity")
X.loc[X["defined"].isna(), "defined"] = False
X["defined"] = X["defined"].astype(bool)
X

Unnamed: 0,entity,comp_ind,multiplicity,flag,comp_ind.fields,component,defined
0,algorithm,,,,,,False
1,alternative,0.0,,,,,True
2,alternative_to,,,,,,False
3,append_to,,,,,,False
4,associated_components,,,,,,False
...,...,...,...,...,...,...,...
56,url,,,,,,False
57,used_in,,,,,,False
58,website,,,,,,False
59,work_item,,,,1.0,"{'azdo_id [int]': 'The ID provided by AzDO.', ...",True


In [9]:
def parse_fields(row):
    if pd.isna(row["component"]):
        # Flags are components without any fields. Think of them like tags.
        if row["flag"]:
            row["valid"] = True
            row["errors"] = ""
        else:
            row["valid"] = False
            row["errors"] = "Fields definition is missing."
        return row

    fields_i = {}
    valid_fields = True
    valid_message = ""
    for field_key, field_value in row["component"].items():
        try:
            field = data.Field.from_kv_pair(field_key, field_value)
            fields_i[field.name] = field
        except ValueError:
            valid_fields = False
            valid_message = (
                f"Field {field_key} is incorrectly formatted: {field_value}. "
            )
            break

    row["fields"] = fields_i
    row["valid"] = valid_fields
    row["errors"] = valid_message

    return row

In [None]:
from pydoc import locate
dependencies = [
    "PyYAML",
    "pandas",
    "seaborn",
    "jupyterlab",
    "networkx",
    "scipy",
    "pandera",
]

In [23]:
import sys

print(sys.version)

3.13.2 | packaged by Anaconda, Inc. | (main, Feb  6 2025, 12:54:57) [Clang 14.0.6 ]


In [22]:
from annotationlib import Format

ModuleNotFoundError: No module named 'annotationlib'

In [20]:
dict[str, str]

dict[str, str]

In [19]:
print(locate("dict[str, str]"))

None


In [11]:
import pandera as pa

In [13]:
pa.DataType("int")

TypeError: DataType.__init__() takes 1 positional argument but 2 were given

In [10]:
X.apply(parse_fields, axis="columns")

Unnamed: 0,comp_ind,comp_ind.fields,component,defined,entity,errors,fields,flag,multiplicity,valid
0,,,,False,algorithm,,,,,True
1,0.0,,,True,alternative,,,,,True
2,,,,False,alternative_to,,,,,True
3,,,,False,append_to,,,,,True
4,,,,False,associated_components,,,,,True
...,...,...,...,...,...,...,...,...,...,...
56,,,,False,url,,,,,True
57,,,,False,used_in,,,,,True
58,,,,False,website,,,,,True
59,,1.0,"{'azdo_id [int]': 'The ID provided by AzDO.', ...",True,work_item,,"{'azdo_id': Field(name='azdo_id', type='int', ...",,,True


In [None]:
# Create a directed graph from the DataFrame
graph = nx.from_pandas_edgelist(
    registry["link"],
    source="source",
    target="target",
    edge_key="link_type",
    create_using=nx.DiGraph,
)

# Visualize the graph
nx.draw(graph, with_labels=True)

In [None]:
graph.add_nodes_from(registry["metadata"]["entity"].values)

In [None]:
connected_components = [_ for _ in nx.connected_components(graph.to_undirected())]
metadata = registry["metadata"].set_index("entity")
metadata["connected_component_category"] = -1
for i, comps in enumerate(connected_components):
    metadata.loc[list(comps), "connected_component_category"] = i

In [None]:
pair_connectivity = nx.all_pairs_node_connectivity(graph)

In [None]:
sum(pair_connectivity["accept_request"].values()) > 0

In [None]:
# Find invalid requirements
reqs = registry.view(["requirement", "status", "task", "input"])
is_valid = reqs["status"].isin(["closed", "removed"])
invalid_reqs = reqs.loc[~is_valid]
invalid_reqs

In [None]:
# Find invalid testcases
registry.view(["testcase", "status"])

In [None]:
import base64
import io, requests
from IPython.display import Image, display
from PIL import Image as im
import matplotlib.pyplot as plt

In [None]:


def mm(graph):
    graphbytes = graph.encode("utf8")
    base64_bytes = base64.urlsafe_b64encode(graphbytes)
    base64_string = base64_bytes.decode("ascii")
    img = im.open(io.BytesIO(requests.get('https://mermaid.ink/img/' + base64_string).content))
    plt.imshow(img)
    plt.axis('off') # allow to hide axis

In [None]:
graph = """
graph LR;
    A--> B & C & D
    B--> A & E
    C--> A & E
    D--> A
    E--> B & C & D
"""

In [None]:
graphbytes = graph.encode("utf8")
base64_bytes = base64.urlsafe_b64encode(graphbytes)
base64_string = base64_bytes.decode("ascii")
result = requests.get('https://mermaid.ink/img/' + base64_string).content
parsed_result = io.BytesIO(result)
img = im.open(parsed_result)
img

In [None]:
mm("""
flowchart LR;
    A--> B & C & D
    B--> A & E
    C--> A & E
    D--> A & E
    E--> B & C & D
""")