# Protein Panel Expansion

This notebook shows how to transfer customize protein data into correct shape for scLinguist.

## 0. Imports

In [17]:
import sys
sys.path.append('../../')
import numpy as np
import pandas as pd
import anndata as anndata
import scanpy as sc
from pathlib import Path
from scipy.sparse import csr_matrix
from scLinguist.data_loaders.data_loader import expand_protein_to_panel

## 1. Parameters

In [18]:
PANEL_PATH = Path('../../docs/tutorials/protein_index_map.csv')
PROTEIN_PATH = Path('../../data/test_sample_adt.h5ad')
OUTPUT_DIR = Path('../../docs/tutorials/expanded_output')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_H5AD = OUTPUT_DIR / 'pro_expanded_6427.h5ad'
COMPRESSION = 'gzip'

## 2. Load data & target panel

In [21]:
pro = sc.read_h5ad(PROTEIN_PATH)[:, :10] # take 10 proteins as example
panel = pd.read_csv(PANEL_PATH, index_col=None)
panel = panel.name.tolist()
print('Protein AnnData:', pro.shape)
print('Panel length:', len(panel))

Protein AnnData: (10546, 10)
Panel length: 6427


In [22]:
panel

['SP110',
 'GTPBA',
 'SNX2',
 'FRG1',
 'TT21A',
 'RHG18',
 'AR',
 'DOCK1',
 'RAB1A',
 'MUC1.HMFG2',
 'H2B1L',
 'RFC1',
 'TXTP',
 'MER34',
 'IL.3',
 'FXR2',
 'ARMD3',
 'ZDHC9',
 'KAPCA',
 'HH3',
 'ZBT21',
 'F149B',
 'PDE10',
 'PCD18',
 'PEBB',
 'PRI1',
 'NU214',
 'TNAP',
 'JHD2C',
 'MIC19',
 'CD213A2',
 'HYEP',
 'ZFR',
 'PUF60',
 'GPAT3',
 'DIDO1',
 'CD185',
 'AT12A',
 'TATD1',
 'ZN341',
 'RPB3',
 'APOC3',
 'TCR.VY9',
 'HS90B',
 'EMAL6',
 'WFS1',
 'RS9',
 'PGK2',
 'TYW2',
 'EF1B',
 'IMB1',
 'ACADM',
 'PAR6A',
 'CAP1',
 'CD357',
 'RGS7',
 'TCTP',
 'EXOS9',
 'RT34',
 'SQSTM',
 'TBA1A',
 'ENOPH',
 'BAP18',
 'AQR',
 'SMRC1',
 'FRIH',
 'HS105',
 'MORN3',
 'TIM23',
 'K1143',
 'CXB3',
 'CD72',
 'UBP25',
 'ECP',
 'EF2',
 'NSF1C',
 'CD196',
 'SPDE3',
 'CCR10',
 'PON2',
 'PURB',
 'PLCG2',
 'TCPD',
 'CPNE3',
 'TCRG',
 'MAOX',
 'PEPL',
 'NUF2',
 'RYR2',
 'LIMC1',
 'CHM2B',
 'LNX1',
 'MARCS',
 'ATP5E',
 'ZBED6',
 'FA47C',
 'UBE4B',
 'AAMDC',
 'CGAS',
 'AL1L1',
 'PDE1A',
 'VIR',
 'AKP8L',
 'APC',
 'P

## 3. Expand and save

In [23]:
pro_expanded = expand_protein_to_panel(pro, panel, id_col=None)
print('Expanded shape:', pro_expanded.shape)
pro_expanded.write_h5ad(OUT_H5AD, compression=COMPRESSION)
print('Saved to:', OUT_H5AD)

Expanded shape: (10546, 6427)
Saved to: ../../docs/tutorials/expanded_output/pro_expanded_6427.h5ad


  pro.var["feature_id"] = pro.var_names


## 4. Quick check

In [24]:
print('First 10 names:', list(pro_expanded.var_names[:10]))
print('Total vars:', pro_expanded.n_vars)
missing = [p for p in panel if p not in set(pro.var_names)]
print('Missing count from source (filled with zeros):', len(missing))

First 10 names: ['SP110', 'GTPBA', 'SNX2', 'FRG1', 'TT21A', 'RHG18', 'AR', 'DOCK1', 'RAB1A', 'MUC1.HMFG2']
Total vars: 6427
Missing count from source (filled with zeros): 6417
