In [18]:
import pandas as pd
import numpy as np
from anndata import AnnData
from os.path import join
from vitessce.data_utils import optimize_adata

## Load data from CSV files using pandas

In [19]:
# Cell-by-gene expression matrix; the first CSV column becomes the row index.
matrix_csv_path = join("raw_data", "habib17.cell_by_gene_matrix.csv")
matrix_df = pd.read_csv(matrix_csv_path, index_col=0)
matrix_df.head()

Unnamed: 0_level_0,LINC00115,RP11-54O7.1,LINC02593,SAMD11,ISG15,RP11-54O7.11,MXRA8,MRPL20,RP4-758J18.13,ANKRD65,...,RP11-539G18.2,RP11-592B15.3,RP11-698N11.4,SIK3-IT1,AC011526.1,CTA-357J21.1,RP11-28F1.2,RP11-638I8.1,RNVU1-20,RP3-511B24.6
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hHP1_ACTCAATAGCAA-habib17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hHP1_TTCCCGTTAAAG-habib17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hHP1_GTCATTGAATCA-habib17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hHP1_CACCTTCAATAC-habib17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
hHP1_ATACATGTTGTC-habib17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Per-cell type annotations; the first CSV column becomes the row index.
cell_type_csv_path = join("raw_data", "habib17.cell_type_annotations.csv")
cell_type_df = pd.read_csv(cell_type_csv_path, index_col=0)
cell_type_df.head()

Unnamed: 0_level_0,CellType
index,Unnamed: 1_level_1
hHP1_ACTCAATAGCAA-habib17,exCA1
hHP1_TTCCCGTTAAAG-habib17,exCA3
hHP1_GTCATTGAATCA-habib17,ASC1
hHP1_CACCTTCAATAC-habib17,exCA1
hHP1_ATACATGTTGTC-habib17,exCA3


In [21]:
def _to_coarse_cell_type(fine_cell_type):
    """Map a fine-grained label to its coarse family.

    Labels starting with "GABA" or "ASC" collapse to that prefix;
    every other label is returned unchanged.
    """
    for family_prefix in ("GABA", "ASC"):
        if fine_cell_type.startswith(family_prefix):
            return family_prefix
    return fine_cell_type


# Add the coarse label as a second annotation column alongside the fine one.
cell_type_df["CoarseCellType"] = cell_type_df["CellType"].apply(_to_coarse_cell_type)
cell_type_df.head()

Unnamed: 0_level_0,CellType,CoarseCellType
index,Unnamed: 1_level_1,Unnamed: 2_level_1
hHP1_ACTCAATAGCAA-habib17,exCA1,exCA1
hHP1_TTCCCGTTAAAG-habib17,exCA3,exCA3
hHP1_GTCATTGAATCA-habib17,ASC1,ASC
hHP1_CACCTTCAATAC-habib17,exCA1,exCA1
hHP1_ATACATGTTGTC-habib17,exCA3,exCA3


In [22]:
# Precomputed 2D UMAP coordinates; the first CSV column becomes the row index.
umap_csv_path = join("raw_data", "habib17.umap.csv")
umap_df = pd.read_csv(umap_csv_path, index_col=0)
umap_df.head()

Unnamed: 0_level_0,UMAP_1,UMAP_2
index,Unnamed: 1_level_1,Unnamed: 2_level_1
hHP1_ACTCAATAGCAA-habib17,3.140266,-7.16688
hHP1_TTCCCGTTAAAG-habib17,-3.105793,-3.203529
hHP1_GTCATTGAATCA-habib17,6.181531,3.414144
hHP1_CACCTTCAATAC-habib17,2.862645,-7.548567
hHP1_ATACATGTTGTC-habib17,-4.022884,-4.216279


## Instantiate a new AnnData object

In [23]:
# The expression matrix defines the canonical cell (row) order. X and obsm are
# handed to AnnData as bare NumPy arrays, which carry no index, so the other
# tables must be put into this order explicitly before their values are taken.
# Otherwise a different row order in any CSV would silently mislabel cells.
cell_ids = matrix_df.index
if not cell_ids.is_unique:
    # Label-based alignment is only well-defined when each cell ID occurs once.
    raise ValueError("Cell IDs in the expression matrix are not unique")
for table_name, table_df in (
    ("cell type annotations", cell_type_df),
    ("UMAP coordinates", umap_df),
):
    # Require exactly the same cell IDs (order-insensitive) in every table.
    if not table_df.index.sort_values().equals(cell_ids.sort_values()):
        raise ValueError(
            f"Cell IDs in the {table_name} do not match the expression matrix"
        )

# Per-cell annotations in matrix row order. `.loc` returns a new frame, so
# `obs` no longer aliases `cell_type_df`.
obs = cell_type_df.loc[cell_ids]
# Per-gene table with no annotation columns; only the gene names as index.
var = pd.DataFrame(data=[], index=matrix_df.columns.tolist(), columns=[])
X = matrix_df.values
# UMAP coordinates reordered to the matrix row order before dropping the index.
obsm = {"X_umap": umap_df.loc[cell_ids].values}

In [24]:
# Combine the expression matrix, per-cell annotations, per-gene index and
# the UMAP embedding into a single AnnData object.
adata = AnnData(
    X=X,
    obs=obs,
    var=var,
    obsm=obsm,
)
adata

  adata = AnnData(X=X, obs=obs, var=var, obsm=obsm)


AnnData object with n_obs × n_vars = 13067 × 5782
    obs: 'CellType', 'CoarseCellType'
    obsm: 'X_umap'

In [25]:
# Fields passed to vitessce's optimize_adata as the obs columns / obsm keys
# to process (presumably the ones retained for visualization; see vitessce docs).
selected_obs_cols = ["CoarseCellType", "CellType"]
selected_obsm_keys = ["X_umap"]
adata = optimize_adata(adata, obs_cols=selected_obs_cols, obsm_keys=selected_obsm_keys, optimize_X=True)

## Save the AnnData object to a Zarr store

In [26]:
# Persist the optimized AnnData object as a Zarr store on disk.
zarr_store_path = join("processed_data", "habib17.zarr")
adata.write_zarr(zarr_store_path)