In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc

  from pkg_resources import get_distribution, DistributionNotFound


In [14]:
adata = sc.read_h5ad("CTR9_snRNASeq/CTR9_snRNASeq_full.h5ad")
print(f"Loaded data: {adata.shape[0]} cells x {adata.shape[1]} genes")
print(adata)

Loaded data: 9869 cells x 33696 genes
AnnData object with n_obs × n_vars = 9869 × 33696
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.0.1', 'RNA_snn_res.1', 'RNA_snn_res.0.2', 'cluster_annot'


In [15]:
print("Sample distribution:")
print(adata.obs['sample'].value_counts())
print()
# Check unique values
print("Unique sample values:", adata.obs['sample'].unique().tolist())
# Check if both WT and KO are present
has_wt = adata.obs['sample'].str.contains('WT', case=False).any()
has_ko = adata.obs['sample'].str.contains('KO', case=False).any()

Sample distribution:
WT_DM    4981
KO_DM    4888
Name: sample, dtype: int64

Unique sample values: ['WT_DM', 'KO_DM']


In [16]:
print("Metadata columns:")
print(adata.obs.columns.tolist())
print("\nDimensional reduction: ")
print(list(adata.obsm.keys()))
print("\nSample distribution:")
print(adata.obs['sample'].value_counts())
print("\nCell type distribution:")
print(adata.obs['cluster_annot'].value_counts())

Metadata columns:
['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.0.1', 'RNA_snn_res.1', 'RNA_snn_res.0.2', 'cluster_annot']

Dimensional reduction: 
[]

Sample distribution:
WT_DM    4981
KO_DM    4888
Name: sample, dtype: int64

Cell type distribution:
Epi_Kit+Elf5+           1811
Adipocyte               1802
Tcells                  1333
BasalEpi_Acta2+Trp63    1066
Epi_Ctr9+                943
Fibroblasts              908
Bcells                   519
Endothelials             442
Myeloid_cells            404
Epi_proliferating        221
DCs                      162
Pericytes/SMC            142
SMC?                      79
Schwann?                  37
Name: cluster_annot, dtype: int64


In [17]:
# Check if a gene exists in the WT subset
gene_of_interest = "Ctr9"  # <-- change this

wt = adata[adata.obs['sample'] == 'WT_DM']

if gene_of_interest in wt.var_names:
    print(f"✅ '{gene_of_interest}' found in WT subset ({wt.shape[0]} cells, {wt.shape[1]} genes)")
    # Check expression
    import numpy as np
    expr = wt[:, gene_of_interest].X
    if hasattr(expr, 'toarray'):
        expr = expr.toarray().flatten()
    else:
        expr = np.array(expr).flatten()
    
    pct_expressing = (expr > 0).sum() / len(expr) * 100
    print(f"   Mean expression: {expr.mean():.4f}")
    print(f"   Cells expressing (>0): {(expr > 0).sum()} / {len(expr)} ({pct_expressing:.1f}%)")
else:
    print(f"❌ '{gene_of_interest}' NOT found in WT subset")
    # Fuzzy search for similar gene names
    matches = [g for g in wt.var_names if gene_of_interest.lower() in g.lower()]
    if matches:
        print(f"   Similar genes found: {matches}")

✅ 'Ctr9' found in WT subset (4981 cells, 33696 genes)
   Mean expression: 0.3551
   Cells expressing (>0): 1079 / 4981 (21.7%)


In [11]:
adata.var.index

Index(['Xkr4', 'Gm1992', 'Rp1', 'Rgs20', 'St18', 'Sntg1', 'Mybl1', 'Cpa6',
       'Prex2', 'A830018L16Rik',
       ...
       'mt-Co2', 'mt-Atp8', 'mt-Atp6', 'mt-Co3', 'mt-Nd3', 'mt-Nd4l', 'mt-Nd4',
       'mt-Nd5', 'mt-Nd6', 'mt-Cytb'],
      dtype='object', length=2000)

In [13]:
import scanpy as sc
import scipy.io
import pandas as pd

# Load the 2000-gene h5ad to grab the cell annotations
adata_small = sc.read_h5ad("CTR9_snRNASeq/CTR9_snRNASeq.h5ad")

# Load full raw counts (36K genes)
counts = scipy.io.mmread("CTR9_snRNASeq/CTR9_counts.mtx").T.tocsr()

with open("CTR9_snRNASeq/CTR9_genes.txt") as f:
    genes = [line.strip() for line in f]
with open("CTR9_snRNASeq/CTR9_cells.txt") as f:
    cells = [line.strip() for line in f]

# Build new AnnData with all 36K genes
import anndata as ad
adata_full = ad.AnnData(X=counts)
adata_full.obs_names = cells
adata_full.var_names = genes

# Transfer annotations from the old h5ad
adata_full.obs = adata_full.obs.join(adata_small.obs)

# Reorder cells if needed
adata_full = adata_full[adata_small.obs_names, :]

print(adata_full)  # should show all 36K genes with annotations
adata_full.write_h5ad("CTR9_snRNASeq/CTR9_snRNASeq_full.h5ad")


This is where adjacency matrices should go now.
  warn(


View of AnnData object with n_obs × n_vars = 9869 × 33696
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.0.1', 'RNA_snn_res.1', 'RNA_snn_res.0.2', 'cluster_annot'


  df[key] = c
  df[key] = c
  df[key] = c


In [18]:
adata

AnnData object with n_obs × n_vars = 9869 × 33696
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample', 'RNA_snn_res.0.5', 'seurat_clusters', 'RNA_snn_res.0.1', 'RNA_snn_res.1', 'RNA_snn_res.0.2', 'cluster_annot'

In [21]:
print(adata.X)

  (0, 18)	1
  (0, 61)	1
  (0, 114)	1
  (0, 120)	1
  (0, 125)	1
  (0, 154)	2
  (0, 213)	1
  (0, 225)	1
  (0, 240)	1
  (0, 254)	1
  (0, 261)	1
  (0, 366)	2
  (0, 370)	2
  (0, 400)	2
  (0, 406)	1
  (0, 432)	2
  (0, 437)	1
  (0, 440)	1
  (0, 500)	3
  (0, 508)	1
  (0, 514)	6
  (0, 548)	3
  (0, 558)	1
  (0, 578)	1
  (0, 580)	1
  :	:
  (9868, 32521)	1
  (9868, 32597)	7
  (9868, 32642)	1
  (9868, 32688)	1
  (9868, 32696)	1
  (9868, 32703)	2
  (9868, 32728)	1
  (9868, 32763)	2
  (9868, 32768)	1
  (9868, 32835)	1
  (9868, 32836)	2
  (9868, 32848)	3
  (9868, 32884)	1
  (9868, 32897)	1
  (9868, 33631)	2
  (9868, 33632)	1
  (9868, 33634)	3
  (9868, 33635)	5
  (9868, 33637)	4
  (9868, 33638)	5
  (9868, 33639)	1
  (9868, 33640)	1
  (9868, 33641)	1
  (9868, 33644)	3
  (9868, 33695)	1
