In [1]:
import scanpy as sc 
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

In [2]:
# Folder with one AnnData (.h5ad) file per sample (produced by SoupX, per the folder name).
adata_folder =  "adata_from_SoupX/"
# os.listdir returns entries in arbitrary order and also lists anything else that
# lives in the folder (e.g. .ipynb_checkpoints). Keep only the .h5ad files and sort
# them so the sample order -- and therefore the cell order after concatenation --
# is the same on every run and every machine.
adata_files = sorted(
    file_name for file_name in os.listdir(adata_folder) if file_name.endswith(".h5ad")
)
num_files = len(adata_files)

In [3]:
# Build the sample names from the file names by removing the ".h5ad" text.
sample_names = []
for file_name in adata_files:
    sample_names.append(file_name.replace(".h5ad", ""))

In [4]:
%%time 
adata_list = list()
for i in np.arange(num_files):
    adata_file_path = adata_folder + adata_files[i]
    sample_name = sample_names[i]
    adata = sc.read_h5ad(adata_file_path)
    adata.obs['sample'] = sample_name
    adata.obs_names = sample_name + ":" + adata.obs_names
    adata_list.append(adata)

CPU times: user 344 ms, sys: 510 ms, total: 854 ms
Wall time: 853 ms


In [5]:
# Stack all per-sample objects into a single AnnData along the cell (obs) axis.
# join="inner" is the library default, written out here for the reader.
adata = sc.concat(adata_list, join="inner")

In [6]:
# Show the per-cell annotation table of the concatenated object.
adata.obs

Unnamed: 0,sample
Fetal-1st-LV-0315-2-run1n2:AAAAAAACTGGC,Fetal-1st-LV-0315-2-run1n2
Fetal-1st-LV-0315-2-run1n2:AAAAAAGCCCCT,Fetal-1st-LV-0315-2-run1n2
Fetal-1st-LV-0315-2-run1n2:AAAAAAGGGCGG,Fetal-1st-LV-0315-2-run1n2
Fetal-1st-LV-0315-2-run1n2:AAAAACTCCCTA,Fetal-1st-LV-0315-2-run1n2
Fetal-1st-LV-0315-2-run1n2:AAAAACTGGCGG,Fetal-1st-LV-0315-2-run1n2
...,...
Fetal-2nd-OFT-2:TTTTTATGGGTG,Fetal-2nd-OFT-2
Fetal-2nd-OFT-2:TTTTTGCTTCTG,Fetal-2nd-OFT-2
Fetal-2nd-OFT-2:TTTTTGGTTCAG,Fetal-2nd-OFT-2
Fetal-2nd-OFT-2:TTTTTTGACTTG,Fetal-2nd-OFT-2


### Add back the donor metadata

The donor-level metadata file was updated after SNP calling.

In [7]:
# Load the donor-level metadata (the first CSV column becomes the row index)
# and keep only the columns that are attached to the cells below.
donor_metadata_columns = ["sample", "age_status", "age", "sex", "donor_id", "region"]
donor_level_metadata = pd.read_csv(
    "../01_fetal_updated_metadata.csv", index_col=0
).loc[:, donor_metadata_columns]

In [8]:
# Show the donor-level metadata table (one row per sample).
donor_level_metadata

Unnamed: 0,sample,age_status,age,sex,donor_id,region
0,Fetal-1st-LV-0315-1-run1n2,Fetal,18,female,Penn_F1,LV
1,Fetal-1st-LV-0315-2-run1n2,Fetal,18,female,Penn_F1,LV
2,Fetal-2nd-Atria-1,Fetal,18,male,Penn_F2,Atria
3,Fetal-2nd-Atria-2,Fetal,18,male,Penn_F2,Atria
4,Fetal-2nd-OFT-1,Fetal,18,male,Penn_F2,OFT
5,Fetal-2nd-OFT-2,Fetal,18,male,Penn_F2,OFT
6,Fetal-Atria-18wk,Fetal,18,female,Penn_F1,Atria
7,Fetal-LRV-18wk-male1-run1n2,Fetal,18,male,Penn_F2,LRV
8,Fetal-LRV-18wk-male2-run1n2,Fetal,18,male,Penn_F2,LRV
9,Fetal-LRV-18wk-male3-run1n2,Fetal,18,male,Penn_F2,LRV


In [9]:
# Distinct sample names that have a row in the donor-level metadata.
donor_metadata_set = {sample for sample in donor_level_metadata['sample']}

In [10]:
# Attach the donor-level metadata to every cell by joining on the "sample" column.

# Fail early, with a readable message, if any sample has no metadata row.
samples_without_metadata = sorted(
    set(adata.obs["sample"]) - set(donor_level_metadata["sample"])
)
if samples_without_metadata:
    raise ValueError(f"No donor metadata for samples: {samples_without_metadata}")

# reset_index() moves the cell names into a regular "index" column so they survive
# the merge; the next cell restores them as obs_names.
obs_with_donor = adata.obs.reset_index().merge(
    donor_level_metadata,
    on="sample",             # explicit key instead of "all shared column names"
    how="left",              # exactly one output row per cell, in the original cell order
    validate="many_to_one",  # raise if a sample occurs more than once in the metadata
)

# The rows of obs must stay aligned with the rows of the expression matrix. Because
# obs_names are rebuilt from the merged "index" column, a reordered merge result
# would otherwise go unnoticed and mislabel the cells.
if not (obs_with_donor["index"].to_numpy() == adata.obs_names.to_numpy()).all():
    raise ValueError("Merging donor metadata changed the order of the cells")

adata.obs = obs_with_donor

In [11]:
# Restore the cell names that reset_index() stored in the "index" column,
# then remove that helper column from obs.
cell_names = adata.obs['index']
adata.obs_names = cell_names
adata.obs = adata.obs.drop(columns=["index"])

In [12]:
# Store the per-sample identifier under the column name "sample_id".
obs_renamed = adata.obs.rename({"sample": "sample_id"}, axis="columns")
adata.obs = obs_renamed

In [13]:
# Show the per-cell annotation table after the donor metadata was added.
adata.obs

Unnamed: 0_level_0,sample_id,age_status,age,sex,donor_id,region
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fetal-1st-LV-0315-2-run1n2:AAAAAAACTGGC,Fetal-1st-LV-0315-2-run1n2,Fetal,18,female,Penn_F1,LV
Fetal-1st-LV-0315-2-run1n2:AAAAAAGCCCCT,Fetal-1st-LV-0315-2-run1n2,Fetal,18,female,Penn_F1,LV
Fetal-1st-LV-0315-2-run1n2:AAAAAAGGGCGG,Fetal-1st-LV-0315-2-run1n2,Fetal,18,female,Penn_F1,LV
Fetal-1st-LV-0315-2-run1n2:AAAAACTCCCTA,Fetal-1st-LV-0315-2-run1n2,Fetal,18,female,Penn_F1,LV
Fetal-1st-LV-0315-2-run1n2:AAAAACTGGCGG,Fetal-1st-LV-0315-2-run1n2,Fetal,18,female,Penn_F1,LV
...,...,...,...,...,...,...
Fetal-2nd-OFT-2:TTTTTATGGGTG,Fetal-2nd-OFT-2,Fetal,18,male,Penn_F2,OFT
Fetal-2nd-OFT-2:TTTTTGCTTCTG,Fetal-2nd-OFT-2,Fetal,18,male,Penn_F2,OFT
Fetal-2nd-OFT-2:TTTTTGGTTCAG,Fetal-2nd-OFT-2,Fetal,18,male,Penn_F2,OFT
Fetal-2nd-OFT-2:TTTTTTGACTTG,Fetal-2nd-OFT-2,Fetal,18,male,Penn_F2,OFT


In [14]:
# Save the combined, annotated object to disk in .h5ad format.
adata.write_h5ad("02_before_preprocessing.h5ad")