In [1]:

import os
import zipfile
import pandas as pd
from sklearn.preprocessing import StandardScaler
from IPython.display import display

# Registry of the datasets handled by this cell, keyed by dataset name.
# Keys read by the code in this cell:
#   "zip"           - archive opened by unzip_all()
#   "expr"          - CSV passed to preprocess_dataset() as expr_path
#   "meta"          - CSV passed to preprocess_dataset() as meta_path
#   "label_column"  - optional; looked up with .get(), so it may be absent
#   "sample_column" - optional; looked up with .get(), so it may be absent
# All paths are relative; presumably they resolve against the notebook's
# working directory once the archives have been extracted there.
DATASETS = {
    "Darmanis": {
        "zip": "Darmanis.zip",
        "expr": "Darmanis/darmanis_expression_matrix.csv",
        "meta": "Darmanis/darmanis_cell_metadata.csv",
    },
    "Zeisel": {
        "zip": "Zeisel.zip",
        "expr": "Zeisel/expression_matrix.csv",
        "meta": "Zeisel/cell_metadata.csv",
    },
    "Zhengmix4eq": {
        "zip": "Zhengmix4eq.zip",
        "expr": "Zhengmix4eq/original_expression_matrix.csv",
        "meta": "Zhengmix4eq/cell_types.csv",
        # Only this dataset names its label and sample-ID columns explicitly;
        # the other two rely on preprocess_dataset()'s defaults.
        "label_column": "cell_type",
        "sample_column": "cell"
    }
}


def unzip_all(datasets=None, dest_dir="."):
    """Extract every dataset archive listed in a dataset registry.

    Args:
        datasets: Mapping of dataset name -> dict holding at least a "zip"
            key with the archive path. When None, the module-level
            ``DATASETS`` registry is used.
        dest_dir: Directory the archives are extracted into. Defaults to the
            current working directory.

    Raises:
        FileNotFoundError: If an archive does not exist; the message names
            the dataset so the missing download is easy to identify.
    """
    if datasets is None:
        datasets = DATASETS

    for name, paths in datasets.items():
        archive = paths["zip"]
        # Fail early with the dataset name instead of a bare path error.
        if not os.path.isfile(archive):
            raise FileNotFoundError(
                f"Archive for dataset '{name}' not found: {archive}"
            )
        with zipfile.ZipFile(archive, "r") as zip_ref:
            zip_ref.extractall(dest_dir)


def preprocess_dataset(expr_path, meta_path, label_column=None, sample_column=None, scaler=None):
    """Load an expression matrix and its metadata, align them, and scale.

    The expression CSV may store samples either as rows or as columns. The
    orientation is detected by intersecting the metadata sample IDs with both
    axes of the expression matrix and choosing the axis with the larger
    overlap (ties prefer rows). Choosing the larger overlap prevents a single
    gene name that happens to equal a sample ID from flipping the orientation.

    Args:
        expr_path: Path to the expression CSV; its first column is used as
            the index.
        meta_path: Path to the metadata CSV; its first column is used as the
            index.
        label_column: Metadata column holding the labels. When falsy, the
            first metadata column is used.
        sample_column: Metadata column holding the sample IDs. When given,
            its values (cast to str) replace the metadata index.
        scaler: Optional object exposing ``fit_transform(data)``. It is
            (re)fitted on the aligned matrix. Defaults to a fresh
            ``StandardScaler()``.

    Returns:
        (X, y): ``X`` is a DataFrame with one row per matched sample holding
        the scaled expression values; ``y`` is the label Series indexed by
        the same sample IDs in the same order.

    Raises:
        KeyError: If ``sample_column`` or ``label_column`` is not present in
            the metadata.
        ValueError: If no sample IDs overlap, or if the matched sample IDs
            are duplicated (which would misalign ``X`` and ``y``).
    """
    expr = pd.read_csv(expr_path, index_col=0)
    meta = pd.read_csv(meta_path, index_col=0)

    if sample_column:
        if sample_column in meta.columns:
            meta.index = meta[sample_column].astype(str)
        elif meta.index.name == sample_column:
            # The column was already consumed by index_col=0; just normalise
            # its dtype instead of failing with a KeyError.
            meta.index = meta.index.astype(str)
        else:
            raise KeyError(
                f"sample_column '{sample_column}' not found in metadata columns: "
                f"{meta.columns.tolist()}"
            )
        print(f"Metadata index replaced with column: {sample_column}")

    print(f"Expression shape: {expr.shape}")
    print(f"Metadata shape: {meta.shape}")
    print("Expression index sample:", expr.index[:3].tolist())
    print("Expression columns sample:", expr.columns[:3].tolist())
    print("Metadata index sample:", meta.index[:3].tolist())

    # Compare the metadata IDs against both axes and keep the better match.
    row_hits = expr.index.intersection(meta.index)
    col_hits = expr.columns.intersection(meta.index)
    if len(row_hits) == 0 and len(col_hits) == 0:
        raise ValueError("No overlapping samples found between expression matrix and metadata.")

    if len(row_hits) >= len(col_hits):
        common = row_hits
        print(f"Matched {len(common)} samples on index.")
        expr = expr.loc[common]
    else:
        # Samples are stored as columns: select them and transpose so that
        # rows are samples.
        common = col_hits
        print(f"Matched {len(common)} samples on columns.")
        expr = expr[common].T
    meta = meta.loc[common]

    # Duplicate IDs make .loc return repeated rows on one side only, which
    # would silently break the row-for-row correspondence of X and y.
    if not (expr.index.is_unique and meta.index.is_unique):
        raise ValueError(
            "Duplicate sample IDs found among matched samples; "
            "cannot align expression matrix and metadata."
        )

    y = meta[label_column] if label_column else meta.iloc[:, 0]

    n_unlabeled = int(y.isna().sum())
    if n_unlabeled:
        # Rows are kept so the returned shape is unchanged; the caller
        # decides whether to drop them.
        print(f"Warning: {n_unlabeled} of {len(y)} matched samples have a missing (NaN) label.")

    if scaler is None:
        scaler = StandardScaler()
    X = pd.DataFrame(
        scaler.fit_transform(expr),
        index=expr.index,
        columns=expr.columns
    )

    return X, y


# Extract all archives first so the CSV paths in DATASETS exist on disk.
unzip_all()

# Preprocess each registered dataset and preview the first rows of the
# scaled matrix and of the labels.
# NOTE(review): X and y are rebound on every iteration, so after the loop
# they hold only the last dataset that was processed successfully.
for name, paths in DATASETS.items():
    print(f"\nProcessing dataset: {name}")
    try:
        X, y = preprocess_dataset(
            expr_path=paths["expr"],
            meta_path=paths["meta"],
            # Optional keys: .get() yields None when a dataset omits them.
            label_column=paths.get("label_column"),
            sample_column=paths.get("sample_column")
        )
        display(X.head())
        display(y.head())
    except Exception as e:
        # Best effort: report the failure and continue with the next dataset.
        # Only the exception message is printed; the traceback is not shown.
        print(f"Error processing {name}: {e}")



Processing dataset: Darmanis
Expression shape: (22085, 466)
Metadata shape: (466, 1)
Expression index sample: ['1/2-SBSRNA4', 'A1BG', 'A1BG-AS1']
Expression columns sample: ['GSM1657871', 'GSM1657872', 'GSM1657873']
Metadata index sample: ['GSM1657871', 'GSM1657872', 'GSM1657873']
Matched 466 samples on columns.


Unnamed: 0,1/2-SBSRNA4,A1BG,A1BG-AS1,A1CF,A2LD1,A2M,A2ML1,A2MP1,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,tAKR
GSM1657871,-0.185626,-0.159644,-0.100418,-0.061168,-0.073449,-0.239698,-0.142417,-0.121807,-0.090302,-0.072242,...,-0.097472,-0.129235,-0.158152,-0.438442,-0.07825,-0.456127,-0.318678,0.115564,-0.256199,-0.107773
GSM1657872,-0.185626,-0.159644,-0.100418,-0.061168,-0.073449,-0.194295,-0.142417,-0.121807,-0.090302,-0.072242,...,-0.097472,-0.129235,-0.158152,-0.424274,-0.07825,0.504822,-0.318678,-0.339878,0.085093,-0.107773
GSM1657873,-0.185626,-0.159644,-0.100418,-0.061168,-0.073449,-0.237176,-0.142417,-0.121807,-0.090302,-0.072242,...,-0.097472,-0.129235,-0.158152,-0.207029,-0.07825,-0.456127,-0.318678,-0.339878,-0.375254,-0.107773
GSM1657874,-0.185626,-0.159644,-0.100418,-0.061168,-0.073449,-0.239698,-0.142417,-0.121807,-0.090302,-0.072242,...,-0.097472,-0.129235,-0.158152,-0.320374,-0.07825,-0.456127,-0.318678,-0.339878,0.593063,-0.107773
GSM1657875,-0.185626,-0.159644,-0.100418,-0.061168,-0.073449,-0.237176,-0.142417,-0.121807,-0.090302,-0.072242,...,-0.097472,-0.129235,-0.158152,-0.443165,-0.07825,-0.426859,-0.318678,-0.241404,-0.375254,-0.107773


GSM1657871    oligodendrocytes
GSM1657872              hybrid
GSM1657873    oligodendrocytes
GSM1657874              hybrid
GSM1657875              hybrid
Name: cell.type, dtype: object


Processing dataset: Zeisel
Expression shape: (19972, 3005)
Metadata shape: (3005, 1)
Expression index sample: ['Tspan12', 'Tshz1', 'Fnbp1l']
Expression columns sample: ['1772071015_C02', '1772071017_G12', '1772071017_A05']
Metadata index sample: ['1772071015_C02', '1772071017_G12', '1772071017_A05']
Matched 3005 samples on columns.


Unnamed: 0,Tspan12,Tshz1,Fnbp1l,Adamts15,Cldn12,Rxfp1,2310042E22Rik,Sema3c,Jam2,Apbb1ip,...,Gm20826_loc1,Gm20826_loc2,Gm20877_loc2,Gm20877_loc1,Gm20865_loc4,Gm20738_loc4,Gm20738_loc6,Gm21943_loc1,Gm21943_loc3,Gm20738_loc3
1772071015_C02,-0.325159,2.119118,0.981584,-0.134558,0.678872,-0.158767,-0.154022,2.015803,0.690184,-0.142636,...,-0.048321,-0.056966,-0.048321,-0.036509,-0.031612,-0.054809,-0.027512,-0.025807,-0.025807,-0.025807
1772071017_G12,-0.325159,0.474078,-0.021531,-0.134558,0.678872,-0.158767,-0.154022,-0.215839,-0.259886,-0.142636,...,-0.048321,-0.056966,-0.048321,-0.036509,-0.031612,-0.054809,-0.027512,-0.025807,-0.025807,-0.025807
1772071017_A05,-0.325159,-0.348442,2.486257,-0.134558,0.678872,-0.158767,2.775312,4.856075,0.690184,-0.142636,...,-0.048321,-0.056966,-0.048321,-0.036509,-0.031612,-0.054809,-0.027512,-0.025807,-0.025807,-0.025807
1772071014_B06,3.172819,1.296598,1.483142,-0.134558,-0.414968,-0.158767,4.239979,-0.012962,-0.259886,-0.142636,...,-0.048321,-0.056966,-0.048321,-0.036509,-0.031612,-0.054809,-0.027512,-0.025807,-0.025807,-0.025807
1772067065_H06,-0.325159,1.296598,-0.021531,-0.134558,-0.414968,-0.158767,-0.154022,1.812926,-0.259886,-0.142636,...,-0.048321,-0.056966,-0.048321,-0.036509,-0.031612,-0.054809,-0.027512,-0.025807,-0.025807,-0.025807


1772071015_C02    interneurons
1772071017_G12    interneurons
1772071017_A05    interneurons
1772071014_B06    interneurons
1772067065_H06    interneurons
Name: cell_type, dtype: object


Processing dataset: Zhengmix4eq
Metadata index replaced with column: cell
Expression shape: (23341, 1924)
Metadata shape: (1924, 2)
Expression index sample: ['0610005C13Rik', '0610007C21Rik', '0610007L01Rik']
Expression columns sample: ['10X_P4_2_AAACCTGAGCTACCTA', '10X_P4_2_AAACCTGCAAGACACG', '10X_P4_2_AAACCTGCATGCCTTC']
Metadata index sample: ['10X_P4_2_AAACCTGAGCTACCTA', '10X_P4_2_AAACCTGCAAGACACG', '10X_P4_2_AAACCTGCATGCCTTC']
Matched 1924 samples on columns.


Unnamed: 0,0610005C13Rik,0610007C21Rik,0610007L01Rik,0610007N19Rik,0610007P08Rik,0610007P14Rik,0610007P22Rik,0610008F07Rik,0610009B14Rik,0610009B22Rik,...,Zwilch,Zwint,Zxda,Zxdb,Zxdc,Zyg11a,Zyg11b,Zyx,Zzef1,Zzz3
10X_P4_2_AAACCTGAGCTACCTA,1.800485,-0.530719,-0.439452,-0.10886,-0.159984,-0.507404,-0.257209,-0.21527,0.0,0.952191,...,-0.032258,-0.216247,-0.119184,-0.121523,-0.176682,-0.068555,-0.328423,-0.30063,-0.263596,-0.258315
10X_P4_2_AAACCTGCAAGACACG,-0.575056,-0.530719,-0.439452,-0.10886,-0.159984,-0.507404,-0.257209,-0.21527,0.0,0.952191,...,-0.032258,-0.216247,-0.119184,-0.121523,-0.176682,-0.068555,-0.328423,-0.30063,-0.263596,-0.258315
10X_P4_2_AAACCTGCATGCCTTC,0.018829,-0.530719,-0.439452,-0.10886,-0.159984,0.488764,-0.257209,-0.21527,0.0,-0.404858,...,-0.032258,-0.216247,-0.119184,-0.121523,-0.176682,-0.068555,-0.328423,-0.30063,-0.263596,-0.258315
10X_P4_2_AAACCTGGTATCTGCA,-0.575056,-0.530719,-0.439452,-0.10886,-0.159984,-0.507404,-0.257209,-0.21527,0.0,-0.404858,...,-0.032258,-0.216247,-0.119184,-0.121523,-0.176682,-0.068555,-0.328423,-0.30063,-0.263596,-0.258315
10X_P4_2_AAACGGGAGATATGGT,-0.575056,-0.530719,-0.439452,-0.10886,-0.159984,-0.507404,-0.257209,-0.21527,0.0,-0.404858,...,-0.032258,-0.216247,-0.119184,-0.121523,-0.176682,-0.068555,-0.328423,-0.30063,-0.263596,-0.258315


10X_P4_2_AAACCTGAGCTACCTA    hepatocyte
10X_P4_2_AAACCTGCAAGACACG    hepatocyte
10X_P4_2_AAACCTGCATGCCTTC    hepatocyte
10X_P4_2_AAACCTGGTATCTGCA    hepatocyte
10X_P4_2_AAACGGGAGATATGGT           NaN
Name: cell_type, dtype: object