In [0]:
import joblib
import polars as pl
import re
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import sys
sys.path.append(r"/Workspace/9900-f18a-cake/working_branch/src")


# --- Input / output locations -------------------------------------------------
CSV_PATH = "/Volumes/cb_prod/comp9300-9900-f18a-cake/9900-f18a-cake/data/mvalue_outputs_masked/MValue_concat.csv"
# CSV_PATH = "/Volumes/cb_prod/comp9300-9900-f18a-cake/9900-f18a-cake/data/mvalue_outputs_masked_subset_leukaemia_subsampled/MValue_polaris_pivot_0.csv"
# CSV_PATH = "/Volumes/cb_prod/comp9300-9900-f18b-cake/9900-f18b-cake/data/mvalue_outputs_masked/MValue_concat.csv"
JOBLIB_PATH = "/Workspace/9900-f18a-cake/working_branch/data/freeze0525/diseaseTree_mapped.joblib"
OUTPUT_PATH = "/Workspace/9900-f18a-cake/working_branch/data/freeze0525/biosample_alignment.csv"

# Strings shaped like "2J0D2U4J_T_XJAFP6HU_M" are treated as biosample IDs.
ID_PATTERN = re.compile(r".*_T_.*_M$")

# First attempt: read only the "biosample_id" column. If that raises, read the
# whole file and rename the first column whose lower-cased name is
# "biosample_id".
try:
    df_csv = pl.read_csv(CSV_PATH, columns=["biosample_id"], ignore_errors=True)
except Exception:
    df_csv = pl.read_csv(CSV_PATH, ignore_errors=True)
    if "biosample_id" not in df_csv.columns:
        id_column = next(
            (name for name in df_csv.columns if name.lower() == "biosample_id"),
            None,
        )
        if id_column is not None:
            df_csv = df_csv.rename({id_column: "biosample_id"})

# Keep the stripped string form of every truthy cell that is not blank.
csv_ids = []
for raw_value in df_csv["biosample_id"].to_list():
    if not raw_value:
        continue
    cleaned = str(raw_value).strip()
    if cleaned:
        csv_ids.append(cleaned)

print(f"read csv done {len(csv_ids)} samples total")
print(csv_ids[:10])

def deep_scan_ids(obj, ids: set, _depth=0, _maxdepth=8, pattern=None):
    """Recursively collect ID-like strings from a nested object into ``ids``.

    Strings, the items of lists/tuples/sets/frozensets, dict keys and values,
    and the ``__dict__`` of arbitrary objects are visited. A string is added
    to ``ids`` when ``pattern.match`` accepts it.

    Args:
        obj: Object to scan; ``None`` is ignored.
        ids: Set that is mutated in place with every matching string.
        _depth: Current recursion depth (internal; callers leave the default).
        _maxdepth: Deepest level that is still inspected. The limit is passed
            down to every recursive call, so a caller-supplied value applies
            to the whole traversal.
        pattern: Compiled regular expression used to recognise IDs. Defaults
            to the module-level ``ID_PATTERN``.

    Returns:
        None. Results are accumulated in ``ids``.
    """
    if obj is None or _depth > _maxdepth:
        return
    if pattern is None:
        pattern = ID_PATTERN
    if isinstance(obj, str):
        if pattern.match(obj):
            ids.add(obj)
        return
    if isinstance(obj, (list, tuple, set, frozenset)):
        for it in obj:
            deep_scan_ids(it, ids, _depth + 1, _maxdepth, pattern)
        return
    if isinstance(obj, dict):
        for k, v in obj.items():
            # Keys can be IDs too; only string keys are tested.
            if isinstance(k, str) and pattern.match(k):
                ids.add(k)
            deep_scan_ids(v, ids, _depth + 1, _maxdepth, pattern)
        return
    if hasattr(obj, "__dict__"):
        # Arbitrary objects are scanned through their instance attributes.
        deep_scan_ids(vars(obj), ids, _depth + 1, _maxdepth, pattern)

from pprint import pprint

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only point JOBLIB_PATH at files you trust.
obj = joblib.load(JOBLIB_PATH)

# Gather every ID-like string found anywhere inside the loaded object, then
# keep them as a sorted list.
_found_ids = set()
deep_scan_ids(obj, _found_ids)
joblib_ids = sorted(_found_ids)

print(type(obj))
pprint(obj)

print(f"read Joblib done {len(joblib_ids)} samples total")
print(joblib_ids[:10])


# --- Compare the two ID collections --------------------------------------------
csv_set = set(csv_ids)
joblib_set = set(joblib_ids)

intersection = csv_set & joblib_set
csv_only = csv_set - joblib_set
joblib_only = joblib_set - csv_set

# "\U0001F539" is the small blue diamond bullet; it is written as an escape so
# the source stays ASCII regardless of the file's encoding.
print(f"\U0001F539 CSV count: {len(csv_set)}")
print(f"\U0001F539 Joblib count: {len(joblib_set)}")
print(f"\U0001F539 joint count: {len(intersection)}")
print(f"\U0001F539 CSV only: {len(csv_only)}")
print(f"\U0001F539 Joblib only: {len(joblib_only)}")


# --- Bar chart of the three groups ---------------------------------------------
bar_labels = ["CSV only", "Joblib only", "Align"]
bar_counts = [len(csv_only), len(joblib_only), len(intersection)]

fig, ax = plt.subplots(figsize=(6, 5))
ax.bar(bar_labels, bar_counts, color=["#66c2a5", "#fc8d62", "#8da0cb"])
ax.set_ylabel("Sample cnt")
ax.set_title("Biosample ID joint")
for position, count in enumerate(bar_counts):
    # Write each count just above its bar.
    ax.text(position, count + 2, str(count), ha="center", fontweight="bold")
plt.show()


# --- Export: one row per CSV ID, flagged by membership in the joblib IDs --------
# Sorting makes the row order of the exported file identical between runs
# (set iteration order is not).
csv_ids_sorted = sorted(csv_set)
alignment_df = pd.DataFrame({
    "biosample_id_csv": csv_ids_sorted,
    "in_joblib": [sample_id in joblib_set for sample_id in csv_ids_sorted],
})

Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
alignment_df.to_csv(OUTPUT_PATH, index=False)
print(f"Align output: {OUTPUT_PATH}")
alignment_df.head(10)


In [0]:
from pprint import pprint
import joblib
import polars as pl
import re
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import sys
sys.path.append(r"/Workspace/9900-f18a-cake/working_branch/src")

# Load the object stored at JOBLIB_PATH and print its type and contents.
# NOTE(review): JOBLIB_PATH is not assigned in this cell; it must already be
# defined by another cell, and which file is loaded depends on the value it
# holds at the time this cell runs.
obj = joblib.load(JOBLIB_PATH)

print(type(obj))
pprint(obj)

In [0]:
# Source directory and pickled tree of the "mt-method2" checkout.
# NOTE(review): JOBLIB_PATH is also assigned earlier in this notebook with a
# path under "working_branch"; this assignment rebinds it, so any cell reading
# JOBLIB_PATH loads a different file depending on whether it runs before or
# after this one.
REPO_SRC = "/Workspace/9900-f18a-cake/mt-method2/src"
JOBLIB_PATH = "/Workspace/9900-f18a-cake/mt-method2/data/freeze0525/diseaseTree_mapped.joblib"

In [0]:
import sys, joblib, inspect, types

# Put REPO_SRC on the import path once; re-running the cell does not add it again.
if REPO_SRC not in sys.path:
    sys.path.append(REPO_SRC)

tree = joblib.load(JOBLIB_PATH)

# Use the loaded object's `root` attribute when it has one; otherwise the
# loaded object itself is used as the root.
root = getattr(tree, "root", tree)

print("Loaded:", type(tree))

# Display name of the root: `name`, else `label`, else a placeholder.
root_label_fallback = getattr(root, "label", "<unknown>")
print("Root:", getattr(root, "name", root_label_fallback))

In [0]:
from collections import deque

def node_name(n):
    """Return the node's ``name``, else its ``label``, else ``"UNKNOWN"``."""
    fallback = getattr(n, "label", "UNKNOWN")
    return getattr(n, "name", fallback)

def node_children(n):
    """Return the node's children as a list, or ``[]`` when it has none.

    The attributes ``children``, ``child_nodes``, ``kids`` and ``subnodes``
    are tried in that order; the first one holding a truthy value is used.
    A dict is read through its values (assumes the mapping goes from some
    key to the child object - TODO confirm against the tree class); any other
    value is turned into a list by iterating it.
    """
    # support a few common attribute names
    for attr in ("children", "child_nodes", "kids", "subnodes"):
        v = getattr(n, attr, None)
        if not v:
            continue
        if isinstance(v, dict):
            # list(dict) would yield the keys, not the child objects.
            return list(v.values())
        return list(v)
    return []

def node_child_names(n):
    """Return the set of display names of the node's direct children."""
    names = set()
    for child in node_children(n):
        names.add(node_name(child))
    return names

def walk_nodes(root):
    """Yield every node reachable from ``root`` in breadth-first order.

    Nodes are tracked by object identity, so an object that can be reached
    along several paths is yielded only the first time it is dequeued.
    """
    visited_ids = set()
    pending = deque((root,))
    while pending:
        current = pending.popleft()
        key = id(current)
        if key not in visited_ids:
            visited_ids.add(key)
            yield current
            pending.extend(node_children(current))

In [0]:
import re

# Attribute names that are looked up on a node when searching for a dict of
# string labels.
LIKELY_LABEL_ATTRS = [
    "labels",
    "label_map",
    "sample_labels",
    "labels_by_sample",
    "y",
    "targets",
    "classes_by_sample",
]

# Attribute names that are looked up on a node when searching for a list or
# tuple of class-name strings used to decode integer class ids.
LIKELY_CLASS_LISTS = [
    "classes",
    "class_names",
    "child_names",
    "labels_list",
]

def try_extract_labels_from_node(n):
    """
    Heuristically recover per-sample labels stored on a tree node.

    The node's instance attributes are inspected with four strategies, tried
    in order; the first strategy that produces at least one row wins:

      1. a dict attribute whose values are all strings and whose first 50
         values are all names of this node's children;
      2. a dict attribute of integer class ids, decoded through a non-empty
         list/tuple of class-name strings stored under one of
         LIKELY_CLASS_LISTS;
      3. a dict attribute named in LIKELY_LABEL_ATTRS whose values are all
         strings (accepted even when they are not child names);
      4. two different list/tuple attributes of equal length whose first 50
         entries are strings, read as (sample ids, labels). When the node has
         children, a pairing whose label array matches the child names is
         preferred; otherwise the first usable pairing is taken.

    A key/id is only kept when it is a string of at least 6 characters.

    Returns list[(biosample_id, node_id, direct_label)] or [] if not found.
    node_id is node_name(n).
    """
    parent = node_name(n)
    child_names = node_child_names(n)
    out = []
    sample_size = 50  # how many leading values are inspected by the type/membership checks

    # quick helpers
    def looks_like_id(k):
        return isinstance(k, str) and len(k) >= 6  # biosample ids are strings like NLTJ7CGJ..., adjust if needed

    def commit_map(m):
        for k, v in m.items():
            if looks_like_id(k) and isinstance(v, str):
                out.append((k, parent, v))
        return out

    def all_str(seq):
        return all(isinstance(x, str) for x in seq)

    # node's attribute dict; objects without an instance __dict__ simply have
    # nothing to inspect instead of raising from vars().
    g = dict(getattr(n, "__dict__", None) or {})

    # 1) direct dict attribute whose values are known child names
    for v in g.values():
        if not isinstance(v, dict):
            continue
        vals = list(v.values())
        if vals and all_str(vals):
            # if these labels are among this parent's children, accept
            if child_names and all((x in child_names) for x in vals[:sample_size]):
                rows = commit_map(v)
                # An accepted mapping with no id-like keys must not stop the
                # remaining strategies from being tried.
                if rows:
                    return rows

    # 2) numeric class ids + a classes list we can map
    classes_list = None
    for key in LIKELY_CLASS_LISTS:
        cl = g.get(key, None)
        # Skip empty candidates so a later, populated list can still be used.
        if isinstance(cl, (list, tuple)) and cl and all_str(cl):
            classes_list = list(cl)
            break

    if classes_list:
        for v in g.values():
            # Empty dicts carry no labels; skipping them avoids returning [] early.
            if not isinstance(v, dict) or not v:
                continue
            if not all(isinstance(x, int) for x in list(v.values())[:sample_size]):
                continue
            # map ints to class strings if in range
            mapped = {}
            for sid, cid in v.items():
                # Only the first values were type-checked above, so re-check
                # each id here before the range comparison.
                if not (looks_like_id(sid) and isinstance(cid, int) and 0 <= cid < len(classes_list)):
                    break
                mapped[sid] = classes_list[cid]
            else:
                rows = commit_map(mapped)
                if rows:
                    return rows

    # 3) attributes named like labels that are dicts
    for key in LIKELY_LABEL_ATTRS:
        v = g.get(key, None)
        if isinstance(v, dict):
            vals = list(v.values())
            if vals and all_str(vals):
                # accepted whether or not the strings are child names
                rows = commit_map(v)
                if rows:
                    return rows

    # 4) pairs of arrays (sample_ids, labels)
    # try to detect two arrays of same length; one stringy ids, the other strings that match children
    arrays = {k: v for k, v in g.items() if isinstance(v, (list, tuple))}

    def rows_from_array_pair(require_child_match):
        """Rows from the first usable (ids, labels) pair, or [] when none qualifies."""
        for k1, a1 in arrays.items():
            for k2, a2 in arrays.items():
                if k1 == k2 or len(a1) != len(a2) or len(a1) == 0:
                    continue
                if not (all_str(a1[:sample_size]) and all_str(a2[:sample_size])):
                    continue
                if require_child_match and not all(x in child_names for x in a2[:sample_size]):
                    continue
                # use them as (ids, labels)
                pair_rows = [(sid, parent, lab) for sid, lab in zip(a1, a2) if looks_like_id(sid)]
                if pair_rows:
                    return pair_rows
        return []

    if child_names:
        # Two string arrays of equal length qualify in either orientation;
        # the child names tell us which of the two holds the labels.
        rows = rows_from_array_pair(require_child_match=True)
        if rows:
            return rows

    return rows_from_array_pair(require_child_match=False)  # may be empty

In [0]:
# Collect label rows from every node reachable from `root`, and remember how
# many rows each contributing node produced.
all_rows = []
per_node_counts = []

for n in walk_nodes(root):
    rows = try_extract_labels_from_node(n)
    if rows:
        all_rows.extend(rows)
        per_node_counts.append((node_name(n), len(rows)))

# Deduplicate (same sample might appear more than once per node).
# dict.fromkeys keeps the first occurrence of each row in traversal order, so
# the result is ordered the same way on every run (a set is not).
all_rows = list(dict.fromkeys(all_rows))

print(f"Found rows: {len(all_rows)}")
# Sort descending through the `ascending` keyword instead of F.desc(...):
# `F` (pyspark.sql.functions) is not imported in any of the cells above, so
# F.desc would raise NameError on a fresh kernel.
display(
    spark.createDataFrame(per_node_counts, schema="node string, n int")
    .orderBy("n", ascending=False)
)

In [0]:
# Nodes to inspect: the root itself and, when it has children, its first child.
n0 = root
root_children = node_children(root)
n1 = root_children[0] if root_children else None

def dump_attrs(n, max_vals=5):
    """Print the node's name followed by one summary line per instance attribute.

    Dicts show their size and the first ``max_vals`` items, lists/tuples show
    their size and the first ``max_vals`` elements, and anything else shows
    its type name and the first 120 characters of ``str(value)``.
    """
    print("Node:", node_name(n))
    for attr, value in vars(n).items():
        kind = type(value).__name__
        # preview small containers
        if isinstance(value, dict):
            preview = list(value.items())[:max_vals]
            line = f"  {attr}: dict[{len(value)}] sample={preview}"
        elif isinstance(value, (list, tuple)):
            line = f"  {attr}: {kind}[{len(value)}] sample={value[:max_vals]}"
        else:
            line = f"  {attr}: {kind} -> {str(value)[:120]}"
        print(line)

dump_attrs(n0)
# n1 is None when the root has no children. Compare against None explicitly so
# that a child object which happens to evaluate as falsy is still dumped.
if n1 is not None:
    dump_attrs(n1)