# Notebook cell In [5]:
"""
Memory-Efficient GSE116222 Loader - FIXED
=========================================

Fixed version that properly handles header row when using chunked reading.

File format:
- Row 0: Cell barcodes (no gene column header - starts with first cell barcode)
- Rows 1+: Gene_name TAB value1 TAB value2 ...
"""

import scanpy as sc
import anndata as ad
import pandas as pd
import numpy as np
from scipy import sparse
from pathlib import Path
import gzip
import gc

# ==============================================================================
# CONFIGURATION
# ==============================================================================

# Input and output locations, given relative to the current working directory.
RAW_DIR = Path("../data/raw")
PROCESSED_DIR = Path("../data/processed")
# NOTE: this runs at import time, so merely importing the module creates the
# output directory (and any missing parents).
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Gzipped, tab-separated expression matrix that main() loads.
MATRIX_FILE = RAW_DIR / "GSE116222_Expression_matrix.txt.gz"

# Sample mapping from cell barcode suffix
# Keys are the text after the last hyphen of a barcode; each value supplies the
# 'sample', 'condition' and 'batch' labels that assign_metadata() copies into
# adata.obs. Suffixes missing from this table are labelled 'Unknown' there.
SAMPLE_MAPPING = {
    'A1': {'sample': 'A1_healthy', 'condition': 'Healthy', 'batch': 'A'},
    'A2': {'sample': 'A2_non_inflamed', 'condition': 'UC_noninflamed', 'batch': 'A'},
    'A3': {'sample': 'A3_inflamed', 'condition': 'UC_inflamed', 'batch': 'A'},
    'B1': {'sample': 'B1_healthy', 'condition': 'Healthy', 'batch': 'B'},
    'B2': {'sample': 'B2_non_inflamed', 'condition': 'UC_noninflamed', 'batch': 'B'},
    'B3': {'sample': 'B3_inflamed', 'condition': 'UC_inflamed', 'batch': 'B'},
    'C1': {'sample': 'C1_healthy', 'condition': 'Healthy', 'batch': 'C'},
    'C2': {'sample': 'C2_non_inflamed', 'condition': 'UC_noninflamed', 'batch': 'C'},
    'C3': {'sample': 'C3_inflamed', 'condition': 'UC_inflamed', 'batch': 'C'},
}


def _csr_block(data_parts, index_parts, row_counts, n_cols):
    """
    Assemble one CSR block (genes × cells) from per-gene non-zero arrays.

    data_parts / index_parts hold the values and column indices of every gene
    that has at least one non-zero entry, in file order. row_counts holds the
    number of non-zeros of every gene in the block (zeros included), which is
    exactly what is needed to derive the CSR row pointer.
    """
    indptr = np.concatenate(
        ([0], np.cumsum(np.asarray(row_counts, dtype=np.int64)))
    )
    if data_parts:
        data = np.concatenate(data_parts)
        indices = np.concatenate(index_parts)
    else:
        # Block made only of all-zero genes.
        data = np.empty(0, dtype=np.float32)
        indices = np.empty(0, dtype=np.int32)
    return sparse.csr_matrix(
        (data, indices, indptr), shape=(len(row_counts), n_cols)
    )


def load_matrix_memory_efficient(filepath, chunk_size=2000):
    """
    Load a gzipped, tab-separated genes × cells matrix into a sparse AnnData.

    The file is streamed line by line. Each gene row is converted to a NumPy
    array, only its non-zero entries are kept, and every ``chunk_size`` genes
    are packed into a compact CSR block. This avoids holding one boxed Python
    object per non-zero value, which is what dominates memory on large
    matrices.

    Parameters
    ----------
    filepath : str or Path
        Path to the ``.txt.gz`` matrix. Row 0 lists the cell barcodes; every
        following row is ``gene_name<TAB>value<TAB>value...``.
    chunk_size : int
        Number of genes accumulated before they are packed into a CSR block.

    Returns
    -------
    AnnData
        cells × genes object with a float32 CSR matrix in ``X``, barcodes as
        ``obs_names`` and gene names as ``var_names``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1, the file is empty, a value is not
        numeric, or a gene row does not contain one value per cell.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    print(f"\n{'='*60}")
    print("MEMORY-EFFICIENT MATRIX LOADING")
    print(f"{'='*60}")
    print(f"File: {filepath}")
    print(f"Chunk size: {chunk_size} genes per chunk")

    # -------------------------------------------------------------------------
    # STEP 1: Read header to get cell barcodes
    # -------------------------------------------------------------------------
    print("\n[1/5] Reading cell barcodes from header...")

    with gzip.open(filepath, 'rt') as f:
        header_line = f.readline()
        if not header_line:
            raise ValueError(f"Expression matrix is empty: {filepath}")
        # Peek at the first data row so the header can be checked against it.
        first_row = next((ln for ln in f if ln.strip()), None)

    cell_barcodes = header_line.strip().split('\t')
    if first_row is not None:
        n_values = len(first_row.strip().split('\t')) - 1
        if n_values >= 1 and len(cell_barcodes) == n_values + 1:
            # The header has exactly one field more than a data row has
            # values, so its first field labels the gene column and is not a
            # cell barcode.
            cell_barcodes = cell_barcodes[1:]
    n_cells = len(cell_barcodes)
    print(f"  Found {n_cells:,} cells")

    # -------------------------------------------------------------------------
    # STEP 2: Count genes
    # -------------------------------------------------------------------------
    print("\n[2/5] Counting genes...")

    with gzip.open(filepath, 'rt') as f:
        next(f)  # Skip header
        # Blank lines (e.g. a trailing newline) are not genes.
        n_genes = sum(1 for ln in f if ln.strip())

    print(f"  Found {n_genes:,} genes")

    # -------------------------------------------------------------------------
    # STEP 3: Read data line by line, build sparse matrix in chunks
    # -------------------------------------------------------------------------
    print("\n[3/5] Reading expression data (this takes several minutes)...")

    gene_names = []
    blocks = []                      # finished CSR blocks, in file order
    data_parts, index_parts, row_counts = [], [], []  # current chunk
    nnz_total = 0

    with gzip.open(filepath, 'rt') as f:
        # Skip header
        next(f)

        # start=2: the header is line 1 of the file.
        for line_no, line in enumerate(f, start=2):
            stripped = line.strip()
            if not stripped:
                continue

            fields = stripped.split('\t')
            if len(fields) - 1 != n_cells:
                raise ValueError(
                    f"Line {line_no} (gene {fields[0]!r}) has "
                    f"{len(fields) - 1} values, expected {n_cells}"
                )

            # Parse as float64 first (same parsing as float()), then keep only
            # the non-zero entries as float32.
            values = np.fromiter(
                map(float, fields[1:]), dtype=np.float64, count=n_cells
            )
            nonzero_cols = np.flatnonzero(values)

            gene_names.append(fields[0])
            row_counts.append(nonzero_cols.size)
            if nonzero_cols.size:
                index_parts.append(nonzero_cols.astype(np.int32))
                data_parts.append(values[nonzero_cols].astype(np.float32))
                nnz_total += nonzero_cols.size

            # Pack the chunk so the many small per-gene arrays are released.
            if len(row_counts) >= chunk_size:
                blocks.append(
                    _csr_block(data_parts, index_parts, row_counts, n_cells)
                )
                data_parts, index_parts, row_counts = [], [], []

            # Progress update every 5000 genes
            n_loaded = len(gene_names)
            if n_loaded % 5000 == 0:
                progress = (n_loaded / n_genes) * 100
                print(f"  Progress: {progress:.1f}% ({n_loaded:,}/{n_genes:,} genes)")

    # Genes left over after the last full chunk.
    if row_counts:
        blocks.append(_csr_block(data_parts, index_parts, row_counts, n_cells))

    print(f"  Loaded {len(gene_names):,} genes")
    print(f"  Non-zero values: {nnz_total:,}")

    # -------------------------------------------------------------------------
    # STEP 4: Build sparse matrix
    # -------------------------------------------------------------------------
    print("\n[4/5] Building sparse matrix...")

    # Create sparse matrix (genes × cells)
    if blocks:
        X_genes_by_cells = sparse.vstack(blocks, format='csr')
    else:
        # Header-only file: vstack cannot stack an empty list.
        X_genes_by_cells = sparse.csr_matrix((0, n_cells), dtype=np.float32)

    print(f"  Shape: {X_genes_by_cells.shape}")

    # Free memory
    del blocks, data_parts, index_parts, row_counts
    gc.collect()

    # Transpose to cells × genes
    print("  Transposing to cells × genes...")
    X_cells_by_genes = X_genes_by_cells.T.tocsr()

    del X_genes_by_cells
    gc.collect()

    # -------------------------------------------------------------------------
    # STEP 5: Create AnnData
    # -------------------------------------------------------------------------
    print("\n[5/5] Creating AnnData object...")

    adata = ad.AnnData(
        X=X_cells_by_genes,
        obs=pd.DataFrame(index=cell_barcodes),
        var=pd.DataFrame(index=gene_names)
    )

    # Sparsity check (guarded so an empty matrix does not divide by zero)
    n_entries = adata.X.shape[0] * adata.X.shape[1]
    sparsity = 1 - (adata.X.nnz / n_entries) if n_entries else 0.0
    print(f"  Cells: {adata.n_obs:,}")
    print(f"  Genes: {adata.n_vars:,}")
    print(f"  Sparsity: {sparsity:.1%}")

    return adata


def assign_metadata(adata):
    """
    Annotate every cell with sample, condition and batch labels.

    The label set is chosen by the text after the last hyphen of the cell
    barcode, looked up in SAMPLE_MAPPING. Barcodes without a hyphen, or with
    a suffix that is not in the table, get 'Unknown' labels. The three
    columns are written into ``adata.obs`` and the same object is returned.
    """
    print("\n" + "="*60)
    print("ASSIGNING SAMPLE METADATA")
    print("="*60)

    def describe(barcode):
        # rpartition yields an empty separator when the barcode has no hyphen.
        _, hyphen, tail = barcode.rpartition('-')
        suffix = tail if hyphen else 'Unknown'
        entry = SAMPLE_MAPPING.get(suffix)
        if entry is None:
            return f'Unknown_{suffix}', 'Unknown', 'Unknown'
        return entry['sample'], entry['condition'], entry['batch']

    annotations = [describe(barcode) for barcode in adata.obs_names]

    # One obs column per tuple position, in the order describe() returns them.
    for position, column in enumerate(('sample', 'condition', 'batch')):
        adata.obs[column] = [record[position] for record in annotations]

    print("\nSample distribution:")
    print(adata.obs['sample'].value_counts())

    print("\nCondition distribution:")
    print(adata.obs['condition'].value_counts())

    return adata


def calculate_qc(adata):
    """
    Flag mitochondrial and ribosomal genes and compute scanpy QC metrics.

    Adds boolean ``mt`` ('MT-' prefix) and ``ribo`` ('RPS'/'RPL' prefix)
    columns to ``adata.var``, runs ``sc.pp.calculate_qc_metrics`` in place
    with both as QC variables, prints median summaries and returns the same
    object.
    """
    print("\n" + "="*60)
    print("CALCULATING QC METRICS")
    print("="*60)

    gene_ids = adata.var_names

    # Mitochondrial genes
    adata.var['mt'] = gene_ids.str.startswith('MT-')
    n_mito = adata.var['mt'].sum()
    print(f"Mitochondrial genes: {n_mito}")

    # Ribosomal genes (small and large subunit prefixes)
    adata.var['ribo'] = gene_ids.str.startswith(('RPS', 'RPL'))
    n_ribo = adata.var['ribo'].sum()
    print(f"Ribosomal genes: {n_ribo}")

    # Per-cell and per-gene metrics, written directly into adata
    qc_options = {
        'qc_vars': ['mt', 'ribo'],
        'percent_top': None,
        'log1p': False,
        'inplace': True,
    }
    sc.pp.calculate_qc_metrics(adata, **qc_options)

    print("\nQC Summary:")
    median_genes = adata.obs['n_genes_by_counts'].median()
    print(f"  Genes/cell: {median_genes:.0f} (median)")
    median_umis = adata.obs['total_counts'].median()
    print(f"  UMIs/cell: {median_umis:.0f} (median)")
    median_pct_mt = adata.obs['pct_counts_mt'].median()
    print(f"  %MT: {median_pct_mt:.1f}% (median)")

    return adata


def save_adata(adata, output_path):
    """
    Write ``adata`` to ``output_path`` and report the resulting file size.

    Parameters
    ----------
    adata
        Object exposing ``write(path)`` (an AnnData in this script).
    output_path : str or Path
        Destination file. Plain strings are accepted; missing parent
        directories are created.
    """
    # Coerce first: a plain string has no .stat(), which previously raised
    # AttributeError only after the file had already been written.
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\nSaving to: {output_path}")
    adata.write(output_path)

    size_mb = output_path.stat().st_size / 1e6
    print(f"File size: {size_mb:.1f} MB")


def main():
    """
    Run the full pipeline: load the matrix, annotate, compute QC, save.

    Returns the processed AnnData, or None (after printing an error) when
    MATRIX_FILE does not exist.
    """
    rule = "="*60

    print("\n" + rule)
    print("GSE116222 DATA LOADING")
    print(rule)

    # Guard clause: nothing to do without the input matrix.
    if not MATRIX_FILE.exists():
        print(f"\nERROR: File not found: {MATRIX_FILE}")
        return None

    # Load -> metadata -> QC, each stage receiving the previous stage's result
    adata = calculate_qc(
        assign_metadata(
            load_matrix_memory_efficient(MATRIX_FILE)
        )
    )

    # Save
    output_path = PROCESSED_DIR / "GSE116222_raw.h5ad"
    save_adata(adata, output_path)

    print("\n" + rule)
    print("COMPLETE")
    print(rule)
    print(f"\nDataset: {adata.n_obs:,} cells × {adata.n_vars:,} genes")
    print(f"Saved to: {output_path}")

    return adata


# Run the pipeline only when this file is executed directly. The result is
# bound to the module-level name `adata`; it is None when the input matrix
# file is missing.
if __name__ == "__main__":
    adata = main()

# Notebook cell output (captured error):
# FileNotFoundError: [Errno 2] No such file or directory: 'data'