In [2]:
!pip install scanpy
import os
import gzip
import pandas as pd
import scipy.io
import scanpy as sc
import urllib.request

# -------------------------------
# 1️⃣ Create data folder
# -------------------------------
# Every downloaded file and the final .h5ad live under this directory.
data_dir = "./GSE306885_data"
os.makedirs(data_dir, exist_ok=True)

# -------------------------------
# 2️⃣ GEO supplementary files URLs
# -------------------------------
# Maps the local file name to the full URL of the supplementary file under
# `base_url`. Insertion order is the download order.
base_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE306nnn/GSE306885/suppl/"
files = {
    local_name: f"{base_url}{remote_name}"
    for local_name, remote_name in (
        ("matrix.mtx.gz", "GSE306885_matrix.mtx.gz"),
        ("barcodes.tsv.gz", "GSE306885_barcodes.tsv.gz"),
        ("features.tsv.gz", "GSE306885_features.tsv.gz"),
        ("annotation.txt.gz", "GSE306885_Wt-vs-DM.all.annot.txt.gz"),
    )
}

# -------------------------------
# 3️⃣ Download files if missing
# -------------------------------
# Each file is first written to "<name>.part" and only renamed to its final
# name once the transfer has finished. Without this, an interrupted download
# would leave a truncated file that the existence check below would treat as
# complete on every later run.
for fname, url in files.items():
    out_path = os.path.join(data_dir, fname)

    # A zero-byte file is a leftover from a failed run, not a valid download.
    if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
        print(f"{fname} already exists, skipping download.")
        continue

    print(f"Downloading {fname}...")
    tmp_path = out_path + ".part"
    try:
        urllib.request.urlretrieve(url, tmp_path)
        os.replace(tmp_path, out_path)  # publish only after a complete transfer
    finally:
        # Still present only if the download or rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# -------------------------------
# 4️⃣ Load matrix
# -------------------------------
# The matrix is transposed after reading so that rows are cells and columns
# are genes, then converted to CSR.
with gzip.open(os.path.join(data_dir, "matrix.mtx.gz"), "rb") as f:
    X = scipy.io.mmread(f).T.tocsr()  # transpose to cells x genes

# -------------------------------
# 5️⃣ Load barcodes and features
# -------------------------------
with gzip.open(os.path.join(data_dir, "barcodes.tsv.gz"), "rt") as f:
    # Skip blank lines so a stray empty line cannot become an empty barcode.
    barcodes = [line.strip() for line in f if line.strip()]

gene_ids = []  # first column of features.tsv (kept for the annotation join)
features = []  # gene names used as var_names
with gzip.open(os.path.join(data_dir, "features.tsv.gz"), "rt") as f:
    for line in f:
        fields = line.strip().split("\t")
        if not fields[0]:
            continue  # blank line
        gene_ids.append(fields[0])
        # Second column usually contains gene names; fall back to the ID when
        # the file has a single column or the name is empty, instead of
        # raising IndexError.
        features.append(fields[1] if len(fields) > 1 and fields[1] else fields[0])

# Fail early with a readable message if the three files do not agree.
n_cells, n_genes = X.shape
if (len(barcodes), len(features)) != (n_cells, n_genes):
    raise ValueError(
        f"Matrix is {n_cells} cells x {n_genes} genes, but found "
        f"{len(barcodes)} barcodes and {len(features)} features."
    )

# -------------------------------
# 6️⃣ Build AnnData
# -------------------------------
adata = sc.AnnData(X=X)
adata.obs_names = barcodes
adata.var_names = features
adata.var["gene_ids"] = gene_ids
# Gene names can repeat; duplicated var_names make label-based lookups ambiguous.
adata.var_names_make_unique()

# -------------------------------
# 7️⃣ Load annotation and join
# -------------------------------
annot_file = os.path.join(data_dir, "annotation.txt.gz")
# low_memory=False makes pandas infer each column's dtype from the whole
# column rather than per chunk, avoiding mixed-type object columns.
annot_df = pd.read_csv(
    annot_file, sep="\t", compression="gzip", index_col=0, low_memory=False
)

# Inspect barcodes, gene IDs and annotation
print("Example barcodes from matrix:", adata.obs_names[:5])
print("Example gene IDs from matrix:", gene_ids[:5])
print("Example annotation rows:", annot_df.index[:5])

# Join annotations
# The annotation table is indexed by gene ID, not by cell barcode, so it is
# attached to adata.var (per-gene metadata) rather than adata.obs.
# NOTE(review): assumes the first column of features.tsv uses the same ID
# scheme as the annotation index -- check the match count printed below.
annot_unique = annot_df.loc[~annot_df.index.duplicated(keep="first")]  # keep the join 1:1
n_annotated = int(adata.var["gene_ids"].isin(annot_unique.index).sum())
print(f"{n_annotated} of {adata.n_vars} genes have an annotation row.")
if n_annotated:
    adata.var = adata.var.join(
        annot_unique, on="gene_ids", how="left", rsuffix="_annot"
    )

# Confirm join worked
print("Joined var example:")
print(adata.var.head())

# -------------------------------
# 8️⃣ Ensure all obs and var names are strings
# -------------------------------
# Only the index labels (cell barcodes and gene names) are cast to str.
# Casting the whole obs table with `adata.obs.astype(str)` would turn every
# metadata column into strings while leaving obs_names themselves untouched.
adata.obs_names = [str(name) for name in adata.obs_names]
adata.var_names = [str(name) for name in adata.var_names]

# -------------------------------
# 9️⃣ Save h5ad
# -------------------------------
output_file = os.path.join(data_dir, "GSE306885.h5ad")
adata.write(output_file)
print(f"✅ Done! Dataset saved as {output_file}")


Collecting scanpy
  Downloading scanpy-1.12-py3-none-any.whl.metadata (8.4 kB)
Collecting anndata>=0.10.8 (from scanpy)
  Downloading anndata-0.12.9-py3-none-any.whl.metadata (9.9 kB)
Collecting fast-array-utils>=1.2.1 (from fast-array-utils[accel,sparse]>=1.2.1->scanpy)
  Downloading fast_array_utils-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting legacy-api-wrap>=1.5 (from scanpy)
  Downloading legacy_api_wrap-1.5-py3-none-any.whl.metadata (2.2 kB)
Collecting session-info2 (from scanpy)
  Downloading session_info2-0.3-py3-none-any.whl.metadata (3.5 kB)
Collecting array-api-compat>=1.7.1 (from anndata>=0.10.8->scanpy)
  Downloading array_api_compat-1.13.0-py3-none-any.whl.metadata (2.5 kB)
Collecting zarr!=3.0.*,>=2.18.7 (from anndata>=0.10.8->scanpy)
  Downloading zarr-3.1.5-py3-none-any.whl.metadata (10 kB)
Collecting donfig>=0.8 (from zarr!=3.0.*,>=2.18.7->anndata>=0.10.8->scanpy)
  Downloading donfig-0.8.1.post1-py3-none-any.whl.metadata (5.0 kB)
Collecting numcodecs>=0.14 (fr

In [3]:
# Display the AnnData summary to check the n_obs x n_vars dimensions.
adata

AnnData object with n_obs × n_vars = 12650 × 25108

In [4]:
# Display the expression matrix repr (storage format, dtype, shape, stored entries).
adata.X

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 22271199 stored elements and shape (12650, 25108)>

In [6]:
# Rows are keyed by gene identifier (index name "id") and the columns hold
# per-sample counts/FPKM plus gene annotations, so this table describes genes
# rather than individual cells.
annot_df # Gene info, not cell info!

Unnamed: 0_level_0,Ctrl-1_count,Ctrl-2_count,Ctrl-3_count,IK-myb-1_count,IK-myb-2_count,IK-myb-3_count,Ctrl-1_fpkm,Ctrl-2_fpkm,Ctrl-3_fpkm,IK-myb-1_fpkm,...,FDR,Symbol,Description,KEGG_A_class,KEGG_B_class,Pathway,K_ID,GO Component,GO Function,GO Process
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSDARG00000041394,1221,1202,1446,27389,36580,38149,51.86,42.21,65.23,871.73,...,1.056266e-17,dnajb1b,DnaJ heat shock protein family (Hsp40) member ...,Genetic Information Processing;Human Diseases,"Folding, sorting and degradation;Infectious di...",ko04141//Protein processing in endoplasmic ret...,K09507;K09507,GO:0005829//cytosol,GO:0051082//unfolded protein binding;GO:005108...,GO:0002088//lens development in camera-type ey...
ENSDARG00000075855,100,131,171,5479,7759,6919,2.29,2.47,4.13,93.42,...,1.056266e-17,adgre10,adhesion G protein-coupled receptor E10 [Sourc...,-,-,-,-,GO:0005887//integral component of plasma membr...,GO:0004888//transmembrane signaling receptor a...,GO:0007166//cell surface receptor signaling pa...
ENSDARG00000103777,442,351,359,6,5,9,15.82,10.39,13.62,0.16,...,3.156598e-17,znf1001,zinc finger protein 1001 [Source:ZFIN;Acc:ZDB-...,-,-,-,-,GO:0005634//nucleus,GO:0003676//nucleic acid binding,GO:0010468//regulation of gene expression
ENSDARG00000095939,45,41,28,1016,992,1803,4.01,3.02,2.65,67.82,...,1.797043e-15,si:ch73-226l13.2,si:ch73-226l13.2 [Source:ZFIN;Acc:ZDB-GENE-110...,-,-,-,-,GO:0005576//extracellular region;GO:0005615//e...,GO:0005125//cytokine activity,GO:0006954//inflammatory response;GO:0006955//...
ENSDARG00000053124,1267,1245,1815,0,0,0,82.48,66.88,126.06,0.00,...,2.976751e-14,si:dkey-148a17.6,si:dkey-148a17.6 [Source:ZFIN;Acc:ZDB-GENE-160...,Environmental Information Processing,Signaling molecules and interaction,ko04080//Neuroactive ligand-receptor interaction,K04296,GO:0016020//membrane;GO:0016021//integral comp...,GO:0004930//G protein-coupled receptor activity,GO:0007165//signal transduction;GO:0007186//G ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MSTRG.9669,0,0,0,0,0,0,0.00,0.00,0.00,0.00,...,1.000000e+00,--,putative pheromone receptor CPpr2 [Cyprinidae ...,-,-,-,-,-,-,-
MSTRG.9671,0,0,0,0,0,0,0.00,0.00,0.00,0.00,...,1.000000e+00,--,-,-,-,-,-,-,-,-
MSTRG.9807,0,0,0,0,0,0,0.00,0.00,0.00,0.00,...,1.000000e+00,NRG3,"PREDICTED: pro-neuregulin-3, membrane-bound is...",Environmental Information Processing,Signal transduction,ko04012//ErbB signaling pathway,K05457,-,-,-
MSTRG.99,0,0,0,464,0,0,0.00,0.00,0.00,15.09,...,1.000000e+00,--,-,-,-,-,-,-,-,-
