In [1]:
import scanpy as sc 
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

In [2]:
adata_folder =  "adata_from_SoupX/"
adata_files = os.listdir(adata_folder)
num_files = len(adata_files)

In [3]:
sample_names = [f.replace(".h5ad", "") for f in adata_files]

In [4]:
%%time 
adata_list = list()
for i in np.arange(num_files):
    adata_file_path = adata_folder + adata_files[i]
    sample_name = sample_names[i]
    adata = sc.read_h5ad(adata_file_path)
    adata.obs['sample'] = sample_name
    adata.obs_names = sample_name + ":" + adata.obs_names
    adata_list.append(adata)

CPU times: user 1.74 s, sys: 1.82 s, total: 3.56 s
Wall time: 3.56 s


In [5]:
# concatenate all of the adata
adata = sc.concat(adata_list)

In [6]:
adata

AnnData object with n_obs × n_vars = 313849 × 38606
    obs: 'sample'

Add more metadata

In [7]:
donor_level_metadata = pd.read_csv("../01_metadata.txt", delim_whitespace=True)
donor_level_metadata = donor_level_metadata[["sample", "age_status", "disease_status", "age", "sex", "donor_id", "region"]]

In [8]:
donor_metadata_set = set(donor_level_metadata['sample'])

In [9]:
adata.obs = adata.obs.reset_index().merge(donor_level_metadata)

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [11]:
adata.obs_names = adata.obs['index']
adata.obs = adata.obs.drop(columns = "index")

In [14]:
adata.obs 

Unnamed: 0_level_0,sample,age_status,disease_status,age,sex,donor_id,region
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ND15755-RV-1st:AAAAAGCTTGTC,ND15755-RV-1st,Postnatal,Healthy,65,M,ND15755,RV
ND15755-RV-1st:AAAAAGTACCTG,ND15755-RV-1st,Postnatal,Healthy,65,M,ND15755,RV
ND15755-RV-1st:AAAAATCGACAT,ND15755-RV-1st,Postnatal,Healthy,65,M,ND15755,RV
ND15755-RV-1st:AAAAATTGTCCA,ND15755-RV-1st,Postnatal,Healthy,65,M,ND15755,RV
ND15755-RV-1st:AAAACATATGGC,ND15755-RV-1st,Postnatal,Healthy,65,M,ND15755,RV
...,...,...,...,...,...,...,...
K1647-LV-nonFACS:TTTTGCTTCTTG,K1647-LV-nonFACS,Postnatal,Healthy,34,M,K1647,LV
K1647-LV-nonFACS:TTTTGTTCACTC,K1647-LV-nonFACS,Postnatal,Healthy,34,M,K1647,LV
K1647-LV-nonFACS:TTTTTCCTTGTC,K1647-LV-nonFACS,Postnatal,Healthy,34,M,K1647,LV
K1647-LV-nonFACS:TTTTTCTAACAG,K1647-LV-nonFACS,Postnatal,Healthy,34,M,K1647,LV


In [13]:
adata.write("02_before_preprocessing.h5ad")