### Preprocessing workflow
* This notebook cannot be re-run because the input files are not included; it is provided for display only.

In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
# Project root holding the GSE138852 input CSVs. Set the GSE138852_BASE_DIR
# environment variable to point somewhere else; when it is unset, the original
# machine-specific location is used as the default.
BASE_DIR = Path(os.environ.get("GSE138852_BASE_DIR", "c:/Users/Iriss/Desktop/try_python"))

# Raw inputs
COV_PATH = BASE_DIR / "GSE138852_covariates.csv"
COUNTS_PATH = BASE_DIR / "GSE138852_counts.csv"

# Cleaned outputs are written to a "preprocessing" subfolder.
# parents=True: without it, mkdir raises FileNotFoundError when BASE_DIR itself
# does not exist yet.
OUTPUT_DIR = BASE_DIR / "preprocessing"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

COV_OUT = OUTPUT_DIR / "preprocessed_GSE138852_covariates.csv"
COUNTS_OUT = OUTPUT_DIR / "preprocessed_GSE138852_counts.csv"

In [8]:
# Load the counts table and the covariates table; in both CSVs the first
# column is used as the index.
counts_raw = pd.read_csv(COUNTS_PATH, index_col=0)
meta_raw = pd.read_csv(COV_PATH, index_col=0)

# Work on copies so the *_raw frames stay exactly as read from disk.
counts, meta = counts_raw.copy(), meta_raw.copy()

print("Covariates shape:", meta.shape)
print("Covariate columns:", meta.columns.tolist())

Covariates shape: (13214, 5)
Covariate columns: ['oupSample.batchCond', 'oupSample.cellType', 'oupSample.cellType_batchCond', 'oupSample.subclustID', 'oupSample.subclustCond']


In [9]:
# Inspect how the two condition labels relate to each other
batch = meta["oupSample.batchCond"]
subclust = meta["oupSample.subclustCond"]

print("\n===== Unique values in batchCond =====")
print(batch.unique())

print("\n===== Unique values in subclustCond =====")
print(subclust.unique())

# Contingency table: rows are batchCond values, columns are subclustCond values
print("\n===== Crosstab: batchCond vs subclustCond =====")
cross = pd.crosstab(batch, subclust)
print(cross)

# Rows of meta whose two condition labels differ
mismatch = meta.loc[batch.ne(subclust)]
print("\n===== Rows where batchCond != subclustCond =====")
print(mismatch.head())
print(f"Total mismatched cells: {len(mismatch)}")


===== Unique values in batchCond =====
['AD' 'ct']

===== Unique values in subclustCond =====
['AD' 'ct' 'undetermined']

===== Crosstab: batchCond vs subclustCond =====
oupSample.subclustCond    AD    ct  undetermined
oupSample.batchCond                             
AD                      6220   184           269
ct                       220  5866           455

===== Rows where batchCond != subclustCond =====
                         oupSample.batchCond oupSample.cellType  \
AACCGCGAGTACGTTC_AD5_AD6                  AD              oligo   
AACTGGTGTCTAGTGT_AD5_AD6                  AD             neuron   
AACTGGTGTTGGTTTG_AD5_AD6                  AD              oligo   
AAGACCTTCAGGATCT_AD5_AD6                  AD              oligo   
AAGGCAGAGTGACATA_AD5_AD6                  AD              astro   

                         oupSample.cellType_batchCond oupSample.subclustID  \
AACCGCGAGTACGTTC_AD5_AD6                     oligo_AD                   o5   
AACTGGTGTCTAGTGT_AD5_AD6

In [10]:
# Drop cells whose cellType label is "doublet" or "unID"
is_doublet_or_unid = meta["oupSample.cellType"].isin(["doublet", "unID"])
n_doublet_unid = is_doublet_or_unid.sum()

print("\nCells labeled as doublet or unID:", n_doublet_unid)

# Keep every row that is not flagged by the mask
meta = meta[~is_doublet_or_unid]
print("Cells remaining after removal:", len(meta))


Cells labeled as doublet or unID: 1330
Cells remaining after removal: 11884


In [11]:
# Keep only the first occurrence of each cell ID: rows of meta, columns of counts
meta_is_dup = meta.index.duplicated(keep="first")
dup_meta = meta.index[meta_is_dup]
print("\nDuplicated cells in meta:", len(dup_meta))
meta = meta[~meta_is_dup]

counts_is_dup = counts.columns.duplicated(keep="first")
dup_counts = counts.columns[counts_is_dup]
print("Duplicated cells in counts:", len(dup_counts))
counts = counts.loc[:, ~counts_is_dup]


Duplicated cells in meta: 0
Duplicated cells in counts: 0


In [12]:
# Restrict both tables to the cell IDs they share, following meta's row order
in_counts = meta.index.isin(counts.columns)
cell_ids = meta.index[in_counts].tolist()

counts = counts[cell_ids]
meta = meta.loc[cell_ids]

In [13]:
# Final alignment check: meta rows and counts columns must hold the same cell
# IDs in the same order. Index.equals is used instead of an element-wise ==
# because == raises ValueError (not a clean assertion failure) when the two
# indexes differ in length.
assert meta.index.equals(counts.columns), "meta rows and counts columns are not aligned"
# The message below also claims deduplication, so check it explicitly.
assert meta.index.is_unique and counts.columns.is_unique, "duplicated cell IDs remain"
print("\n✓ Cleaning finished: counts and meta are aligned and deduplicated")


✓ Cleaning finished: counts and meta are aligned and deduplicated


In [14]:
# Write the cleaned covariates and counts tables to their output CSVs
for frame, destination in ((meta, COV_OUT), (counts, COUNTS_OUT)):
    frame.to_csv(destination)

print(f"\nAfter cleaning, there are {len(meta)} cells")
print("Saved files:")
for saved_path in (COV_OUT, COUNTS_OUT):
    print(saved_path)


After cleaning, there are 11884 cells
Saved files:
c:\Users\Iriss\Desktop\try_python\preprocessing\preprocessed_GSE138852_covariates.csv
c:\Users\Iriss\Desktop\try_python\preprocessing\preprocessed_GSE138852_counts.csv


In [15]:
# Final summary: number of cells per cell-type/condition label.
# The column is selected by name rather than by position (index 2), so a
# change in the covariate CSV's column order cannot silently switch which
# column is summarised.
cell_type_col = "oupSample.cellType_batchCond"
cell_type_counts = meta[cell_type_col].value_counts()

print("\nDifferent cell type counts:")
print(cell_type_counts)


Different cell type counts:
oupSample.cellType_batchCond
oligo_AD     4655
oligo_ct     2777
astro_ct     1699
OPC_ct        899
astro_AD      472
neuron_ct     407
mg_ct         277
neuron_AD     249
OPC_AD        179
mg_AD         172
endo_ct        61
endo_AD        37
Name: count, dtype: int64
