In [3]:
import os
from pathlib import Path
import scanpy as sc
import pandas as pd
import numpy as np

os.chdir("/Users/travishong/aml-lsc-hierarchy")
base = Path("data/Lambo_GSE235063")

mtx_files = sorted(base.glob("*_processed_matrix.mtx.gz"))
print("Found processed matrices:", len(mtx_files))
mtx_files[:3]

Found processed matrices: 0


[]

In [5]:
import os
from pathlib import Path

print(os.getcwd())
print(os.listdir())


/Users/travishong/aml-lsc-hierarchy
['.DS_Store', 'gene_sets', 'models', 'figures', 'data', 'notebooks', 'src']


In [6]:
print(os.listdir())

['.DS_Store', 'gene_sets', 'models', 'figures', 'data', 'notebooks', 'src']


In [7]:
if "data" in os.listdir():
    print("data/ contents:", os.listdir("data"))
if "Lambo_GSE235063" in os.listdir():
    print("top-level Lambo_GSE235063 contents:", os.listdir("Lambo_GSE235063"))


data/ contents: ['.DS_Store', 'scAML_LSPC_scran.h5ad', 'Fig3_Relapse_Deconvolution', 'abbas_patient_hierarchy_composition_from_scarches.csv', 'Lambo_GSE235063']


In [8]:
base = Path("data/Lambo_GSE235063/GSE235063_RAW")

In [9]:
print("Files in base:", len(list(base.iterdir())))
for f in list(base.iterdir())[:10]:
    print(f.name)


Files in base: 525
GSM7494271_AML7_DX_processed_matrix.mtx.gz
GSM7494316_AML25_DX_processed_metadata.tsv.gz
GSM7494327_AML12_REL_raw_genes.tsv.gz
GSM7494264_AML2_REL_raw_genes.tsv.gz
GSM7494302_AML22_REM_processed_metadata.tsv.gz
GSM7494288_AML27_REM_processed_genes.tsv.gz
GSM7494286_AML27_DX_raw_genes.tsv.gz
GSM7494314_AML14_DX_raw_genes.tsv.gz
GSM7494266_AML15_DX_processed_metadata.tsv.gz
GSM7494308_AML24_REM_processed_metadata.tsv.gz


In [10]:
mtx_files = sorted(base.glob("*_processed_matrix.mtx.gz"))
print("Found processed matrices:", len(mtx_files))
for f in mtx_files[:5]:
    print(f.name)

Found processed matrices: 75
GSM7494257_AML16_DX_processed_matrix.mtx.gz
GSM7494258_AML16_REL_processed_matrix.mtx.gz
GSM7494259_AML16_REM_processed_matrix.mtx.gz
GSM7494260_AML6_DX_processed_matrix.mtx.gz
GSM7494261_AML6_REL_processed_matrix.mtx.gz


In [11]:
import os
from pathlib import Path
import scanpy as sc
import pandas as pd

os.chdir("/Users/travishong/aml-lsc-hierarchy")

base = Path("data/Lambo_GSE235063/GSE235063_RAW")  # adjust if you moved it under data/
mtx_files = sorted(base.glob("*_processed_matrix.mtx.gz"))
print("Found processed matrices:", len(mtx_files))


Found processed matrices: 75


In [12]:
adatas = []

for mtx_path in mtx_files:
    prefix = mtx_path.name.replace("_processed_matrix.mtx.gz", "")
    genes_path = base / f"{prefix}_processed_genes.tsv.gz"
    barcodes_path = base / f"{prefix}_processed_barcodes.tsv.gz"
    meta_path = base / f"{prefix}_processed_metadata.tsv.gz"

    print("Loading", prefix)

    # 1) counts (cells x genes)
    a = sc.read_mtx(mtx_path).T

    # 2) gene names
    genes = pd.read_csv(genes_path, sep="\t", header=None)
    a.var_names = genes.iloc[:, 0].astype(str).values

    # 3) cell barcodes
    barcodes = pd.read_csv(barcodes_path, sep="\t", header=None)
    a.obs_names = barcodes.iloc[:, 0].astype(str).values

    # 4) metadata
    meta = pd.read_csv(meta_path, sep="\t")
    print("  metadata columns:", list(meta.columns))

    # pick the barcode column
    if "Cell_Barcode" in meta.columns:
        idxcol = "Cell_Barcode"
    elif "Barcode" in meta.columns:
        idxcol = "Barcode"
    else:
        # fall back to first column if needed
        idxcol = meta.columns[0]

    meta = meta.set_index(idxcol)

    # align to barcodes (same order)
    meta = meta.loc[a.obs_names]

    # attach
    a.obs = meta

    # keep the GSM/patient/timepoint string
    a.obs["sample_raw"] = prefix

    adatas.append(a)

# 5) concatenate all samples
adata = sc.concat(adatas, join="outer", label="sample_raw", keys=None)
print(adata)


Loading GSM7494257_AML16_DX
  metadata columns: ['Cell_Barcode', 'GEO_ID', 'Lambo_et_al_ID', 'Patient_Sample', 'Library_ID', 'Counts', 'Features', 'Mitochondria_percent', 'Classified_Celltype', 'Seurat_Cluster', 'Malignant', 'Patient_ID', 'Biopsy_Origin', 'Age_Months', 'Disease_free_days', 'Clinical_Blast_Percent', 'Expected_Driving_Aberration', 'Subgroup', 'Color_Subgroup', 'Known_CNVs', 'Treatment_Outcome', 'nCount_RNA', 'nFeature_RNA']
Loading GSM7494258_AML16_REL
  metadata columns: ['Cell_Barcode', 'GEO_ID', 'Lambo_et_al_ID', 'Patient_Sample', 'Library_ID', 'Counts', 'Features', 'Mitochondria_percent', 'Classified_Celltype', 'Seurat_Cluster', 'Malignant', 'Patient_ID', 'Biopsy_Origin', 'Age_Months', 'Disease_free_days', 'Clinical_Blast_Percent', 'Expected_Driving_Aberration', 'Subgroup', 'Color_Subgroup', 'Known_CNVs', 'Treatment_Outcome', 'nCount_RNA', 'nFeature_RNA']
Loading GSM7494259_AML16_REM
  metadata columns: ['Cell_Barcode', 'GEO_ID', 'Lambo_et_al_ID', 'Patient_Sample', '

  utils.warn_names_duplicates("obs")


In [13]:
import numpy as np

def parse_patient_from_sample_raw(s: str) -> str:
    parts = str(s).split("_")
    # look for token like 'AML16'
    for p in parts:
        if p.upper().startswith("AML"):
            return p
    # fallback: second token if it exists
    if len(parts) >= 2:
        return parts[1]
    return parts[0]

adata.obs["patient_id"] = adata.obs["sample_raw"].apply(parse_patient_from_sample_raw)
adata.obs["patient_id"].value_counts()[:10]

patient_id
26    9167
12    8524
39    8365
53    8058
62    7915
43    7887
32    7836
69    7615
67    7368
38    7335
Name: count, dtype: int64

In [14]:
def infer_timepoint_from_sample_raw(s: str) -> str:
    s = str(s).upper()
    if "_DX" in s:
        return "diagnosis"
    if "_REM" in s:
        return "remission"
    if "_REL" in s:
        return "relapse"
    return "other"

adata.obs["timepoint"] = adata.obs["sample_raw"].apply(infer_timepoint_from_sample_raw)
print(adata.obs["timepoint"].value_counts())

timepoint
other    353948
Name: count, dtype: int64


In [15]:
for c in adata.obs.columns:
    if "outcome" in c.lower() or "relapse" in c.lower():
        print(c, adata.obs[c].unique()[:10])

Treatment_Outcome ['Relapsed' 'Censored']


In [16]:
def map_outcome(val) -> str:
    if pd.isna(val):
        return np.nan
    s = str(val).lower()
    if "relapse" in s:
        return "relapse"
    if "no relapse" in s or "censored" in s or "cr" in s:
        return "no_relapse"
    return np.nan

import pandas as pd

if "Treatment_Outcome" in adata.obs.columns:
    adata.obs["outcome"] = adata.obs["Treatment_Outcome"].apply(map_outcome)
elif "Relapse" in adata.obs.columns:
    adata.obs["outcome"] = adata.obs["Relapse"].apply(map_outcome)
else:
    adata.obs["outcome"] = np.nan  # adjust once you see the real name

print(adata.obs["outcome"].value_counts(dropna=False))


outcome
relapse       319440
no_relapse     34508
Name: count, dtype: int64


In [17]:
from src.hierarchy_classifier import load_hierarchy_model, annotate_hierarchy
from src.lsc_scoring import score_signature
from src.patient_features import features_from_sc_remission

# 1. Hierarchy classifier
clf, ref_genes = load_hierarchy_model()
adata_h = annotate_hierarchy(adata, clf, ref_genes, label_key="hierarchy")

# 2. LSC17 score
lsc17_genes = [
    "DNMT3B","ZBTB46","NYNRIN","ARHGAP22","LAPTM4B","MMRN1",
    "DPYSL3","KIAA0125","CDK6","CPXM1","SOCS2","SMIM24",
    "EMP1","NGFRAP1","CD34","AKR1C3","GPR56",
]
adata_h = score_signature(adata_h, lsc17_genes, "LSC17_score")

# 3. LSC_high Primitive cells
prim = adata_h[adata_h.obs["hierarchy"] == "Primitive"]
thr = prim.obs["LSC17_score"].quantile(0.9)
adata_h.obs["LSC_high"] = (
    (adata_h.obs["hierarchy"] == "Primitive") &
    (adata_h.obs["LSC17_score"] >= thr)
)

# 4. Build remission patient-level features
feat = features_from_sc_remission(
    adata_h,
    patient_key="patient_id",
    timepoint_key="timepoint",
    outcome_key="outcome",
    hierarchy_key="hierarchy",
    lsc_flag_key="LSC_high",
    remission_label="remission",
    min_cells=50,
)

feat

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  for pid, sub in rem.obs.groupby(patient_key):


In [22]:
import numpy as np
import pandas as pd

print("Has Treatment_Outcome?", "Treatment_Outcome" in adata_h.obs.columns)
print(adata_h.obs["Treatment_Outcome"].value_counts(dropna=False).head())


Has Treatment_Outcome? True
Treatment_Outcome
Relapsed    319440
Censored     34508
Name: count, dtype: int64


In [23]:
def map_outcome(val):
    if pd.isna(val):
        return np.nan
    s = str(val).lower()
    if "relapse" in s:
        return "relapse"
    if "censored" in s or "no relapse" in s or "event-free" in s:
        return "no_relapse"
    return np.nan

adata_h.obs["outcome"] = adata_h.obs["Treatment_Outcome"].apply(map_outcome)
print(adata_h.obs["outcome"].value_counts(dropna=False))


outcome
relapse       319440
no_relapse     34508
Name: count, dtype: int64


In [25]:
print("feat shape:", feat.shape)
print("feat columns:", list(feat.columns))
print(feat.head())

feat shape: (0, 0)
feat columns: []
Empty DataFrame
Columns: []
Index: []


In [26]:
import pandas as pd
import numpy as np

print("'outcome' in adata_h.obs?", "outcome" in adata_h.obs.columns)
if "outcome" in adata_h.obs.columns:
    print(adata_h.obs["outcome"].value_counts(dropna=False))

'outcome' in adata_h.obs? True
outcome
relapse       319440
no_relapse     34508
Name: count, dtype: int64


In [27]:
for c in adata_h.obs.columns:
    if "outcome" in c.lower() or "relapse" in c.lower():
        print(c, ":", adata_h.obs[c].unique()[:10])


Treatment_Outcome : ['Relapsed' 'Censored']
outcome : ['relapse' 'no_relapse']


In [28]:
def map_outcome(val):
    if pd.isna(val):
        return np.nan
    s = str(val).lower()
    if "relapse" in s:
        return "relapse"
    if "censored" in s or "no relapse" in s or "event-free" in s:
        return "no_relapse"
    return np.nan

adata_h.obs["outcome"] = adata_h.obs["Treatment_Outcome"].apply(map_outcome)
print(adata_h.obs["outcome"].value_counts(dropna=False))

outcome
relapse       319440
no_relapse     34508
Name: count, dtype: int64


In [29]:
print(adata_h.obs["timepoint"].value_counts())

timepoint
other    353948
Name: count, dtype: int64


In [30]:
print(adata_h.obs[["sample_raw", "timepoint"]].head())

                   sample_raw timepoint
AAACCCACACAAGTGG-1          0     other
AAACCCACAGAAGTGC-1          0     other
AAACCCAGTACCACGC-1          0     other
AAACCCAGTACCCACG-1          0     other
AAACCCAGTGATACCT-1          0     other


In [31]:
import numpy as np
import pandas as pd

def features_from_sc_remission(
    adata,
    patient_key="patient_id",
    timepoint_key="timepoint",
    outcome_key="outcome",
    hierarchy_key="hierarchy",
    lsc_flag_key="LSC_high",
    remission_label="remission",
    min_cells=50,
):
    rows = []
    rem = adata[adata.obs[timepoint_key] == remission_label].copy()

    for pid, sub in rem.obs.groupby(patient_key):
        # need a well-defined outcome per patient
        if outcome_key not in sub or sub[outcome_key].nunique() != 1:
            continue

        outcome = sub[outcome_key].iloc[0]
        if outcome not in ["relapse", "no_relapse"]:
            continue

        total = sub.shape[0]
        if total < min_cells:
            continue

        frac_prim = (sub[hierarchy_key] == "Primitive").mean()
        frac_prog = (sub[hierarchy_key] == "Progenitor").mean()
        frac_mat  = (sub[hierarchy_key] == "Mature").mean()

        if lsc_flag_key in sub:
            frac_lsc = sub[lsc_flag_key].mean()
            prim_mask = sub[hierarchy_key] == "Primitive"
            prim_lsc_density = (
                sub.loc[prim_mask, lsc_flag_key].mean() if prim_mask.any() else 0.0
            )
        else:
            frac_lsc = 0.0
            prim_lsc_density = 0.0

        freq = sub[hierarchy_key].value_counts(normalize=True)
        entropy = -np.sum(freq * np.log2(freq + 1e-9))

        rows.append(dict(
            patient_id=pid,
            outcome=outcome,                 # <--- THIS MUST BE PRESENT
            n_cells=total,
            frac_Primitive=frac_prim,
            frac_Progenitor=frac_prog,
            frac_Mature=frac_mat,
            frac_LSC_high=frac_lsc,
            prim_LSC_density=prim_lsc_density,
            entropy_hierarchy=entropy,
        ))

    return pd.DataFrame(rows)


In [35]:
import os
from pathlib import Path
import scanpy as sc
import pandas as pd

os.chdir("/Users/travishong/aml-lsc-hierarchy")

# 1. Set the base path EXACTLY to where your screenshot shows:
base = Path("data/Lambo_GSE235063/GSE235063_RAW")  # if you moved it under data/, use data/Lambo_GSE235063/...

print("Base exists?", base.exists())
print("Example files:", list(base.iterdir())[:5])

# 2. Collect processed matrix files
mtx_files = sorted(base.glob("*_processed_matrix.mtx.gz"))
print("Found processed matrices:", len(mtx_files))
for f in mtx_files[:5]:
    print(" ", f.name)

adatas = []

for mtx_path in mtx_files:
    prefix = mtx_path.name.replace("_processed_matrix.mtx.gz", "")
    genes_path = base / f"{prefix}_processed_genes.tsv.gz"
    barcodes_path = base / f"{prefix}_processed_barcodes.tsv.gz"
    meta_path = base / f"{prefix}_processed_metadata.tsv.gz"

    # sanity: skip if any companion file missing
    if not (genes_path.exists() and barcodes_path.exists() and meta_path.exists()):
        print(f"SKIP {prefix} (missing genes/barcodes/meta)")
        continue

    print(f"Loading {prefix}")

    # 1) counts (genes x cells -> transpose)
    a = sc.read_mtx(mtx_path).T

    # 2) gene names
    genes = pd.read_csv(genes_path, sep="\t", header=None)
    a.var_names = genes.iloc[:, 0].astype(str).values

    # 3) barcodes
    barcodes = pd.read_csv(barcodes_path, sep="\t", header=None)
    a.obs_names = barcodes.iloc[:, 0].astype(str).values

    # 4) metadata
    meta = pd.read_csv(meta_path, sep="\t")

    if "Cell_Barcode" in meta.columns:
        idxcol = "Cell_Barcode"
    elif "Barcode" in meta.columns:
        idxcol = "Barcode"
    else:
        idxcol = meta.columns[0]

    meta = meta.set_index(idxcol)

    # align rows
    missing = set(a.obs_names) - set(meta.index)
    if missing:
        # if this happens, you can inspect, but for now align with intersection
        print(f"  Warning: {len(missing)} barcodes missing in meta for {prefix}, aligning on intersection.")
        common = pd.Index(a.obs_names).intersection(meta.index)
        a = a[common, :].copy()
        meta = meta.loc[common]

    a.obs = meta

    # retain the GSM/sample label so we can parse patient & timepoint later
    a.obs["sample_raw"] = prefix

    adatas.append(a)

print("Assembled AnnData objects:", len(adatas))

if not adatas:
    raise RuntimeError("No AnnData objects were created: check base path and filename patterns above.")

# 5) Concatenate all samples
adata = sc.concat(adatas, join="outer", index_unique=None)
print(adata)
print(adata.obs[["sample_raw"]].head())


Base exists? True
Example files: [PosixPath('data/Lambo_GSE235063/GSE235063_RAW/GSM7494271_AML7_DX_processed_matrix.mtx.gz'), PosixPath('data/Lambo_GSE235063/GSE235063_RAW/GSM7494316_AML25_DX_processed_metadata.tsv.gz'), PosixPath('data/Lambo_GSE235063/GSE235063_RAW/GSM7494327_AML12_REL_raw_genes.tsv.gz'), PosixPath('data/Lambo_GSE235063/GSE235063_RAW/GSM7494264_AML2_REL_raw_genes.tsv.gz'), PosixPath('data/Lambo_GSE235063/GSE235063_RAW/GSM7494302_AML22_REM_processed_metadata.tsv.gz')]
Found processed matrices: 75
  GSM7494257_AML16_DX_processed_matrix.mtx.gz
  GSM7494258_AML16_REL_processed_matrix.mtx.gz
  GSM7494259_AML16_REM_processed_matrix.mtx.gz
  GSM7494260_AML6_DX_processed_matrix.mtx.gz
  GSM7494261_AML6_REL_processed_matrix.mtx.gz
Loading GSM7494257_AML16_DX
Loading GSM7494258_AML16_REL
Loading GSM7494259_AML16_REM
Loading GSM7494260_AML6_DX
Loading GSM7494261_AML6_REL
Loading GSM7494262_AML6_REM
Loading GSM7494263_AML2_DX
Loading GSM7494264_AML2_REL
Loading GSM7494265_AML2_RE

  utils.warn_names_duplicates("obs")


In [36]:
def parse_patient_from_sample_raw(s: str) -> str:
    parts = str(s).split("_")
    for p in parts:
        if p.upper().startswith("AML"):
            return p
    return parts[0]

def infer_timepoint_from_sample_raw(s: str) -> str:
    s = str(s).upper()
    if "_DX" in s:
        return "diagnosis"
    if "_REM" in s:
        return "remission"
    if "_REL" in s:
        return "relapse"
    return "other"

adata.obs["patient_id"] = adata.obs["sample_raw"].apply(parse_patient_from_sample_raw)
adata.obs["timepoint"] = adata.obs["sample_raw"].apply(infer_timepoint_from_sample_raw)

import numpy as np
import pandas as pd

def map_outcome(val):
    if pd.isna(val):
        return np.nan
    s = str(val).lower()
    if "relapse" in s:
        return "relapse"
    if "censored" in s or "no relapse" in s or "event-free" in s:
        return "no_relapse"
    return np.nan

adata.obs["outcome"] = adata.obs["Treatment_Outcome"].apply(map_outcome)

print(adata.obs["timepoint"].value_counts())
print(adata.obs["outcome"].value_counts(dropna=False))


timepoint
diagnosis    153367
relapse      119723
remission     80858
Name: count, dtype: int64
outcome
relapse       319440
no_relapse     34508
Name: count, dtype: int64


In [37]:
from src.hierarchy_classifier import load_hierarchy_model, annotate_hierarchy
from src.lsc_scoring import score_signature
from src.patient_features import features_from_sc_remission
from src.relapse_lsc_model import RelapseLSCModel

clf, ref_genes = load_hierarchy_model()
adata_h = annotate_hierarchy(adata, clf, ref_genes, label_key="hierarchy")

lsc17_genes = [
    "DNMT3B","ZBTB46","NYNRIN","ARHGAP22","LAPTM4B","MMRN1",
    "DPYSL3","KIAA0125","CDK6","CPXM1","SOCS2","SMIM24",
    "EMP1","NGFRAP1","CD34","AKR1C3","GPR56",
]
adata_h = score_signature(adata_h, lsc17_genes, "LSC17_score")

prim = adata_h[adata_h.obs["hierarchy"] == "Primitive"]
thr = prim.obs["LSC17_score"].quantile(0.9)
adata_h.obs["LSC_high"] = (
    (adata_h.obs["hierarchy"] == "Primitive") &
    (adata_h.obs["LSC17_score"] >= thr)
)

feat = features_from_sc_remission(
    adata_h,
    patient_key="patient_id",
    timepoint_key="timepoint",
    outcome_key="outcome",
    hierarchy_key="hierarchy",
    lsc_flag_key="LSC_high",
    remission_label="remission",
    min_cells=50,
)

print("feat shape:", feat.shape)
print("feat columns:", list(feat.columns))
print(feat["outcome"].value_counts())

rlsc = RelapseLSCModel()
rlsc.fit(feat)
print(rlsc.explain())

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


feat shape: (24, 9)
feat columns: ['patient_id', 'outcome', 'n_cells', 'frac_Primitive', 'frac_Progenitor', 'frac_Mature', 'frac_LSC_high', 'prim_LSC_density', 'entropy_hierarchy']
outcome
relapse       20
no_relapse     4
Name: count, dtype: int64
             feature    weight
2   prim_LSC_density  0.031950
1      frac_LSC_high -0.013502
3  entropy_hierarchy -0.319491
0     frac_Primitive -0.345348


  utils.warn_names_duplicates("obs")


In [38]:
from src.patient_features import features_from_sc_remission

feat = features_from_sc_remission(
    adata_h,
    patient_key="patient_id",
    timepoint_key="timepoint",
    outcome_key="outcome",
    hierarchy_key="hierarchy",
    lsc_flag_key="LSC_high",
    remission_label="remission",  # matches how we defined REM above
    min_cells=50,                 # can lower to 30 if few cells per patient
)

print("feat shape:", feat.shape)
print("feat columns:", list(feat.columns))
print(feat["outcome"].value_counts())
feat.head()


feat shape: (24, 9)
feat columns: ['patient_id', 'outcome', 'n_cells', 'frac_Primitive', 'frac_Progenitor', 'frac_Mature', 'frac_LSC_high', 'prim_LSC_density', 'entropy_hierarchy']
outcome
relapse       20
no_relapse     4
Name: count, dtype: int64


  utils.warn_names_duplicates("obs")


Unnamed: 0,patient_id,outcome,n_cells,frac_Primitive,frac_Progenitor,frac_Mature,frac_LSC_high,prim_LSC_density,entropy_hierarchy
0,AML1,no_relapse,3706,0.624933,0.375067,0.0,0.023745,0.037997,0.954484
1,AML10,relapse,2695,0.519852,0.480148,0.0,0.020408,0.039258,0.998863
2,AML11,relapse,5525,0.58914,0.41086,0.0,0.007602,0.012903,0.97695
3,AML12,relapse,3278,0.372788,0.627212,0.0,0.008237,0.022095,0.952789
4,AML13,relapse,6577,0.150829,0.849171,0.0,0.008667,0.05746,0.61191


In [39]:
from src.relapse_lsc_model import RelapseLSCModel

rlsc = RelapseLSCModel()
rlsc.fit(feat)

print("Feature weights:")
print(rlsc.explain())


Feature weights:
             feature    weight
2   prim_LSC_density  0.031950
1      frac_LSC_high -0.013502
3  entropy_hierarchy -0.319491
0     frac_Primitive -0.345348


In [40]:
print(feat["outcome"].value_counts())
feat

outcome
relapse       20
no_relapse     4
Name: count, dtype: int64


Unnamed: 0,patient_id,outcome,n_cells,frac_Primitive,frac_Progenitor,frac_Mature,frac_LSC_high,prim_LSC_density,entropy_hierarchy
0,AML1,no_relapse,3706,0.624933,0.375067,0.0,0.023745,0.037997,0.954484
1,AML10,relapse,2695,0.519852,0.480148,0.0,0.020408,0.039258,0.998863
2,AML11,relapse,5525,0.58914,0.41086,0.0,0.007602,0.012903,0.97695
3,AML12,relapse,3278,0.372788,0.627212,0.0,0.008237,0.022095,0.952789
4,AML13,relapse,6577,0.150829,0.849171,0.0,0.008667,0.05746,0.61191
5,AML14,no_relapse,1521,0.579224,0.420776,0.0,0.011177,0.019296,0.981813
6,AML15,relapse,1808,0.253872,0.746128,0.0,0.006637,0.026144,0.817357
7,AML16,relapse,5153,0.663303,0.336697,0.0,0.002329,0.003511,0.921623
8,AML2,relapse,1840,0.426087,0.573913,0.0,0.009783,0.022959,0.984179
9,AML20,no_relapse,1990,0.408543,0.591457,0.0,0.01005,0.0246,0.975729


In [41]:
feat.groupby("outcome")[[
    "frac_Primitive",
    "frac_LSC_high",
    "prim_LSC_density",
    "entropy_hierarchy",
]].mean()


Unnamed: 0_level_0,frac_Primitive,frac_LSC_high,prim_LSC_density,entropy_hierarchy
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no_relapse,0.491316,0.017425,0.038008,0.962091
relapse,0.370762,0.013338,0.05105,0.850593


In [42]:
print(adata_h.obs["timepoint"].value_counts())

timepoint
diagnosis    153367
relapse      119723
remission     80858
Name: count, dtype: int64


In [43]:
Treatment_Outcome : ['Relapsed' 'Censored']
outcome : ['relapse' 'no_relapse']