In [6]:
import os
import GEOparse
import urllib.request
import gzip
import shutil
from pathlib import Path

# Sample accessions from GSE116222
# You'll need to get the actual supplementary file URLs from GEO
# This is a template - we'll fill in real URLs




RAW_DIR = Path("data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)


geo_id = "GSE116222"

gse = GEOparse.get_GEO(geo=geo_id, destdir=RAW_DIR, annotate_gpl=True)


# GEO provides supplementary files - check the actual GSE page
# The typical structure is:
# ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE116nnn/GSE116222/suppl/

print(list(gse.gsms.keys()))

print("Navigate to: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116222")
print("Download the supplementary files manually or use wget/curl")
print("Expected files: GSE116222_RAW.tar (contains all sample matrices)")

29-Nov-2025 15:44:29 DEBUG utils - Directory data/raw already exists. Skipping.
29-Nov-2025 15:44:29 INFO GEOparse - File already exist: using local version.
29-Nov-2025 15:44:29 INFO GEOparse - Parsing data/raw/GSE116222_family.soft.gz: 
29-Nov-2025 15:44:29 DEBUG GEOparse - DATABASE: GeoMiame
29-Nov-2025 15:44:29 DEBUG GEOparse - SERIES: GSE116222
29-Nov-2025 15:44:30 DEBUG GEOparse - PLATFORM: GPL24676
29-Nov-2025 15:44:30 DEBUG GEOparse - SAMPLE: GSM3214201
29-Nov-2025 15:44:30 DEBUG GEOparse - SAMPLE: GSM3214202
29-Nov-2025 15:44:30 DEBUG GEOparse - SAMPLE: GSM3214203
29-Nov-2025 15:44:30 DEBUG GEOparse - SAMPLE: GSM3214204
29-Nov-2025 15:44:30 DEBUG GEOparse - SAMPLE: GSM3214205
29-Nov-2025 15:44:30 DEBUG GEOparse - SAMPLE: GSM3214206
29-Nov-2025 15:44:30 DEBUG GEOparse - SAMPLE: GSM3214207
29-Nov-2025 15:44:30 DEBUG GEOparse - SAMPLE: GSM3214208
29-Nov-2025 15:44:30 DEBUG GEOparse - SAMPLE: GSM3214209


['GSM3214201', 'GSM3214202', 'GSM3214203', 'GSM3214204', 'GSM3214205', 'GSM3214206', 'GSM3214207', 'GSM3214208', 'GSM3214209']
Navigate to: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116222
Download the supplementary files manually or use wget/curl
Expected files: GSE116222_RAW.tar (contains all sample matrices)


In [7]:
import pandas as pd

records = []

for gsm_id, gsm in gse.gsms.items():
    meta = gsm.metadata
    record = {
        'sample_id': gsm_id,
        'title': meta.get('title', [''])[0],
        'condition': meta.get('characteristics_ch1', [''])[0],  # often contains condition info
        'patient_id': meta.get('characteristics_ch1', [''])[1] if len(meta.get('characteristics_ch1', [])) > 1 else ''
    }
    records.append(record)

metadata_df = pd.DataFrame(records)
print(metadata_df)


    sample_id            title                           condition  \
0  GSM3214201      A3_inflamed  subject status: Ulcerative colitis   
1  GSM3214202  A2_non_inflamed  subject status: Ulcerative colitis   
2  GSM3214203       A1_healthy     subject status: healthy control   
3  GSM3214204      B3_inflamed  subject status: Ulcerative colitis   
4  GSM3214205  B2_non_inflamed  subject status: Ulcerative colitis   
5  GSM3214206       B1_healthy     subject status: healthy control   
6  GSM3214207      C3_inflamed  subject status: Ulcerative colitis   
7  GSM3214208  C2_non_inflamed  subject status: Ulcerative colitis   
8  GSM3214209       C1_healthy     subject status: healthy control   

                           patient_id  
0      tissue: inflamed area of colon  
1  tissue: adjacent non-inflamed area  
2              tissue: colonic biopsy  
3      tissue: inflamed area of colon  
4  tissue: adjacent non-inflamed area  
5              tissue: colonic biopsy  
6      tissue: infl

In [None]:
#!/usr/bin/env python3
"""
Load GSE116222 Expression Matrix
================================

GSE116222 provides data as a single combined expression matrix (txt.gz format)
rather than the typical 10x format (matrix.mtx + barcodes + genes).

This script loads the text matrix and converts it to AnnData format.
"""

import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
from pathlib import Path
from scipy import sparse
import gzip

# ==============================================================================
# CONFIGURATION
# ==============================================================================

# Adjust this path to where you downloaded the file
RAW_DIR = Path("data/raw")
PROCESSED_DIR = Path("data/processed")
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# The expression matrix file
MATRIX_FILE = RAW_DIR /"GSE116222_Expression_matrix.txt.gz"

# Sample metadata from GEO (extracted from your screenshot)
SAMPLE_METADATA = {
    'GSM3214201': {'name': 'A3_inflamed', 'condition': 'UC_inflamed', 'batch': 'A', 'patient': 'UC_A'},
    'GSM3214202': {'name': 'A2_non_inflamed', 'condition': 'UC_noninflamed', 'batch': 'A', 'patient': 'UC_A'},
    'GSM3214203': {'name': 'A1_healthy', 'condition': 'Healthy', 'batch': 'A', 'patient': 'Healthy_A'},
    'GSM3214204': {'name': 'B3_inflamed', 'condition': 'UC_inflamed', 'batch': 'B', 'patient': 'UC_B'},
    'GSM3214205': {'name': 'B2_non_inflamed', 'condition': 'UC_noninflamed', 'batch': 'B', 'patient': 'UC_B'},
    'GSM3214206': {'name': 'B1_healthy', 'condition': 'Healthy', 'batch': 'B', 'patient': 'Healthy_B'},
    'GSM3214207': {'name': 'C3_inflamed', 'condition': 'UC_inflamed', 'batch': 'C', 'patient': 'UC_C'},
    'GSM3214208': {'name': 'C2_non_inflamed', 'condition': 'UC_noninflamed', 'batch': 'C', 'patient': 'UC_C'},
    'GSM3214209': {'name': 'C1_healthy', 'condition': 'Healthy', 'batch': 'C', 'patient': 'Healthy_C'},
}

# ==============================================================================
# STEP 1: Inspect the file format
# ==============================================================================

def inspect_matrix_file(filepath):
    """
    Peek at the first few lines to understand the file format.
    
    Common formats:
    - Genes as rows, cells as columns (genes x cells)
    - Cells as rows, genes as columns (cells x genes)
    """
    print(f"\nInspecting: {filepath}")
    print("-" * 50)
    
    # Read first few lines
    with gzip.open(filepath, 'rt') as f:
        for i, line in enumerate(f):
            if i < 5:  # First 5 lines
                # Show truncated line
                preview = line.strip()[:200]
                print(f"Line {i}: {preview}...")
            else:
                break
    
    # Count total lines (might take a moment for large files)
    print("\nCounting lines (this may take a moment)...")
    with gzip.open(filepath, 'rt') as f:
        line_count = sum(1 for _ in f)
    print(f"Total lines: {line_count:,}")
    
    return line_count


# ==============================================================================
# STEP 2: Load the expression matrix
# ==============================================================================

def load_expression_matrix(filepath):
    """
    Load the expression matrix from text file.
    
    Expected format: genes as rows, cells as columns
    First row: header with cell barcodes
    First column: gene names
    
    Returns: pandas DataFrame (genes x cells)
    """
    print(f"\nLoading expression matrix from: {filepath}")
    print("This may take a few minutes for large files...")
    
    # Read the matrix
    # - First column is gene names (index_col=0)
    # - First row is cell barcodes (header=0)
    df = pd.read_csv(filepath, sep='\t', index_col=0, compression='gzip')
    
    print(f"Loaded matrix shape: {df.shape}")
    print(f"  Rows (genes): {df.shape[0]:,}")
    print(f"  Columns (cells): {df.shape[1]:,}")
    
    # Preview
    print(f"\nFirst 5 genes: {list(df.index[:5])}")
    print(f"First 5 cell barcodes: {list(df.columns[:5])}")
    
    # Check for any non-numeric values
    print(f"\nData type: {df.values.dtype}")
    print(f"Value range: {df.values.min():.2f} to {df.values.max():.2f}")
    
    return df


# ==============================================================================
# STEP 3: Convert to AnnData
# ==============================================================================

def matrix_to_anndata(df):
    """
    Convert pandas DataFrame to AnnData object.
    
    AnnData expects cells as rows (observations) and genes as columns (variables).
    If the matrix is genes x cells, we transpose it.
    """
    print("\nConverting to AnnData format...")
    
    # Determine orientation
    # Heuristic: gene names typically contain letters, cell barcodes have specific patterns
    # For 10x data, ~20,000 genes and variable number of cells
    
    n_rows, n_cols = df.shape
    
    # If rows >> cols, likely genes as rows
    # If cols >> rows, likely genes as columns
    # For scRNA-seq, typically 20k genes and 1k-100k cells
    
    if n_rows > 15000 and n_rows < 30000:
        print(f"  Detected: genes as rows ({n_rows} genes), cells as columns ({n_cols} cells)")
        print("  Transposing to cells x genes...")
        df_t = df.T  # Transpose so cells are rows
    else:
        print(f"  Assuming: cells as rows ({n_rows}), genes as columns ({n_cols})")
        df_t = df
    
    # Create AnnData
    # Convert to sparse matrix for memory efficiency
    X_sparse = sparse.csr_matrix(df_t.values.astype(np.float32))
    
    adata = ad.AnnData(
        X=X_sparse,
        obs=pd.DataFrame(index=df_t.index),  # Cell metadata (empty for now)
        var=pd.DataFrame(index=df_t.columns)  # Gene metadata
    )
    
    print(f"  Created AnnData: {adata.n_obs} cells Ã— {adata.n_vars} genes")
    
    return adata


# ==============================================================================
# STEP 4: Parse cell barcodes and assign metadata
# ==============================================================================

def assign_sample_metadata(adata, sample_metadata):
    """
    Parse cell barcodes to extract sample information and assign metadata.
    
    Cell barcode format varies - common patterns:
    - SAMPLE_BARCODE (e.g., A1_healthy_ACGTACGT)
    - BARCODE-SAMPLE (e.g., ACGTACGT-1)
    """
    print("\nAssigning sample metadata...")
    
    # Examine cell barcode format
    sample_barcodes = adata.obs_names[:10].tolist()
    print(f"  Example cell barcodes: {sample_barcodes[:5]}")
    
    # Try to extract sample information from barcodes
    # The format depends on how the authors named cells
    
    # Strategy 1: Look for sample name patterns in barcode
    samples = []
    conditions = []
    batches = []
    patients = []
    
    for barcode in adata.obs_names:
        barcode_str = str(barcode)
        
        # Try to match sample patterns
        matched = False
        for gsm_id, meta in sample_metadata.items():
            sample_name = meta['name']
            
            # Check if sample name is in the barcode
            # Common patterns: "A1_healthy_BARCODE" or "A1_healthy.BARCODE"
            if sample_name.lower() in barcode_str.lower():
                samples.append(sample_name)
                conditions.append(meta['condition'])
                batches.append(meta['batch'])
                patients.append(meta['patient'])
                matched = True
                break
            
            # Also try partial matches (e.g., "A1" or "inflamed")
            parts = sample_name.split('_')
            if any(part.lower() in barcode_str.lower() for part in parts if len(part) > 1):
                # Additional verification needed
                pass
        
        if not matched:
            # If no match, try to infer from barcode structure
            samples.append('Unknown')
            conditions.append('Unknown')
            batches.append('Unknown')
            patients.append('Unknown')
    
    # Check how many were matched
    n_unknown = samples.count('Unknown')
    n_matched = len(samples) - n_unknown
    
    print(f"  Matched: {n_matched}/{len(samples)} cells ({100*n_matched/len(samples):.1f}%)")
    
    if n_unknown > 0:
        print(f"  Warning: {n_unknown} cells could not be matched to samples")
        print("  You may need to adjust the barcode parsing logic")
    
    # Assign to adata
    adata.obs['sample'] = samples
    adata.obs['condition'] = conditions
    adata.obs['batch'] = batches
    adata.obs['patient'] = patients
    
    # Show distribution
    print("\nCondition distribution:")
    print(adata.obs['condition'].value_counts())
    
    return adata


# ==============================================================================
# STEP 5: Calculate QC metrics
# ==============================================================================

def calculate_qc_metrics(adata):
    """Calculate standard single-cell QC metrics."""
    print("\nCalculating QC metrics...")
    
    # Identify mitochondrial genes (human: MT-)
    adata.var['mt'] = adata.var_names.str.startswith('MT-')
    
    # Identify ribosomal genes
    adata.var['ribo'] = adata.var_names.str.startswith(('RPS', 'RPL'))
    
    # Identify hemoglobin genes (blood contamination)
    adata.var['hb'] = adata.var_names.str.contains('^HB[^(P)]', regex=True)
    
    # Calculate metrics
    sc.pp.calculate_qc_metrics(
        adata,
        qc_vars=['mt', 'ribo', 'hb'],
        percent_top=None,
        log1p=False,
        inplace=True
    )
    
    # Summary
    print("\nQC Summary:")
    print(f"  Genes per cell: {adata.obs['n_genes_by_counts'].median():.0f} (median)")
    print(f"  UMIs per cell: {adata.obs['total_counts'].median():.0f} (median)")
    print(f"  % Mitochondrial: {adata.obs['pct_counts_mt'].median():.1f}% (median)")
    print(f"  Mitochondrial genes found: {adata.var['mt'].sum()}")
    
    return adata


# ==============================================================================
# STEP 6: Save processed data
# ==============================================================================

def save_data(adata, output_path):
    """Save AnnData object to h5ad format."""
    print(f"\nSaving to: {output_path}")
    adata.write(output_path)
    
    file_size = output_path.stat().st_size / 1e6
    print(f"  File size: {file_size:.1f} MB")


# ==============================================================================
# MAIN
# ==============================================================================

def main():
    print("=" * 60)
    print("GSE116222 Data Loading Script")
    print("=" * 60)
    
    # Check if file exists
    if not MATRIX_FILE.exists():
        print(f"\nERROR: File not found: {MATRIX_FILE}")
        print("\nPlease download GSE116222_Expression_matrix.txt.gz from:")
        print("https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116222")
        print(f"\nAnd save it to: {RAW_DIR}")
        return None
    
    # Step 1: Inspect file
    inspect_matrix_file(MATRIX_FILE)
    
    # Step 2: Load matrix
    df = load_expression_matrix(MATRIX_FILE)
    
    # Step 3: Convert to AnnData
    adata = matrix_to_anndata(df)
    
    # Free memory
    del df
    
    # Step 4: Assign metadata
    adata = assign_sample_metadata(adata, SAMPLE_METADATA)
    
    # Step 5: QC metrics
    adata = calculate_qc_metrics(adata)
    
    # Step 6: Save
    output_path = PROCESSED_DIR / "GSE116222_raw.h5ad"
    save_data(adata, output_path)
    
    print("\n" + "=" * 60)
    print("LOADING COMPLETE")
    print("=" * 60)
    print(f"\nDataset summary:")
    print(f"  Cells: {adata.n_obs:,}")
    print(f"  Genes: {adata.n_vars:,}")
    print(f"  File: {output_path}")
    
    return adata


if __name__ == "__main__":
    adata = main()

  from pkg_resources import get_distribution, DistributionNotFound


GSE116222 Data Loading Script

Inspecting: data/raw/GSE116222_Expression_matrix.txt.gz
--------------------------------------------------
Line 0: AAACCTGGTAATCGTC-A1	AAACGGGAGCTTTGGT-A1	AAACGGGTCTGGTATG-A1	AAAGTAGAGAACTGTA-A1	AAAGTAGCAGGGAGAG-A1	AAAGTAGGTTCGTTGA-A1	AAAGTAGTCGGGAGTA-A1	AAATGCCCAAGACGTG-A1	AAATGCCTCCTTTCTC-A1	AACACGTAGAGCAATT-A1	...
Line 1: DDX11L1	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	...
Line 2: WASH7P	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0...
Line 3: MIR6859-2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	...
Line 4: MIR130