In [1]:
import os,sys
import datetime
import scanpy as sc
import numpy as np
import pandas as pd

# Load data

In [2]:
adata = sc.read_h5ad('backups_JZ_2022/clean_kidney_304_15tr_25c_batch_corrected_50236x2000_220315_12h14.h5ad')

In [3]:
filename = 'backups_JZ_2022/annotated_obs_info_50236x27_220503_10h45.npz'
encoding = 'latin1'

with np.load(filename,encoding=encoding, allow_pickle = True) as f:
    obs = pd.DataFrame(**f)
adata.obs = obs
adata.obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample,patient,pT stage,seq_date,beads,operation,...,top3pct_dbtl_score,top5pct_dbtl_score,top10pct_dbtl_score,closest_JZ_kidney,closest_JZ_kidney_hvg,n_counts,no_dblt_no_rbc,sp_cl_43,cell_type,broad_cell_type
2,N14,449,0.668151,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,False,False,TAM 2,TAM 2,449,True,40,TAM 4,Immune
19,N14,449,4.23163,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,False,False,TAM 2,TAM 2,449,True,37,TAM 3,Immune
363,N14,1229,10.6591,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,False,False,Tumor cells 2,Tumor cells 2,1229,True,22,Tumor cells 2,Tumor
433,N14,432,6.94444,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,False,False,Tumor cells 1,Tumor cells 2,432,True,27,Tumor cells 3,Tumor
444,N14,502,4.98008,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,False,False,False,Tumor vasculature 2,Tumor vasculature 2,502,True,28,Tumor vasculature 4,Endothelial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4865536,Tumor0228,683,9.37042,T4_old,T4,P4,pT3a,old,old,Open,...,False,False,False,TAM 1,TAM 4,683,True,30,TAM 2,Immune
4865584,Tumor0228,1473,6.51731,T4_old,T4,P4,pT3a,old,old,Open,...,False,False,True,TAM 1,TAM 4,1473,True,30,TAM 2,Immune
4865642,Tumor0228,498,9.43775,T4_old,T4,P4,pT3a,old,old,Open,...,False,False,False,TAM 4,TAM 4,498,True,37,TAM 3,Immune
4865726,Tumor0228,421,19.2399,T4_old,T4,P4,pT3a,old,old,Open,...,False,False,False,Mito high TAM/tumor cells,Mito high TAM/tumor cells,421,True,27,Tumor cells 3,Tumor


In [5]:
adata =adata.raw.to_adata()

In [None]:
adata.X.sum(axis=1) #raw_counts

In [7]:
adata.obs['broad_cell_type'].unique()

array(['Immune', 'Tumor', 'Endothelial', 'Stromal', 'Cycling',
       'Epithelial'], dtype=object)

In [8]:
label = 'exp_cyc_epi'
mask = ~adata.obs['broad_cell_type'].isin(['Cycling', 'Epithelial']).values
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [9]:
adata = adata[mask]
adata

View of AnnData object with n_obs × n_vars = 42608 × 33538
    obs: 'library', 'total_counts', 'pct_counts_mito', 'library2', 'sample', 'patient', 'pT stage', 'seq_date', 'beads', 'operation', 'sex', 'tumor size, mm', 'age', 'tissue', 'necrosis', 'doublet_score', 'potential_doublet', 'top3pct_dbtl_score', 'top5pct_dbtl_score', 'top10pct_dbtl_score', 'closest_JZ_kidney', 'closest_JZ_kidney_hvg', 'n_counts', 'no_dblt_no_rbc', 'sp_cl_43', 'cell_type', 'broad_cell_type'
    uns: 'X_lin_cptt', 'X_log_z', 'beads_colors', 'draw_graph', 'neighbors', 'pca', 'sample_colors', 'seq_date_colors', 'tissue_colors', 'umap'
    obsm: 'X_draw_graph_fa', 'X_pca', 'X_pca_harmony', 'X_umap'

In [10]:

sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
adata.X.sum(axis = 1)

Trying to set attribute `.obs` of view, copying.


matrix([[10000.001],
        [10000.   ],
        [10000.001],
        ...,
        [ 9999.999],
        [ 9999.999],
        [ 9999.999]], dtype=float32)

In [13]:
#also performing log-transformation as in Bi et al.2021 and Braun et al.2021
sc.pp.log1p(adata)
adata.X.sum(axis=1)

matrix([[ 934.6439],
        [ 934.066 ],
        [1480.039 ],
        ...,
        [ 857.8656],
        [ 800.2327],
        [ 930.7578]], dtype=float32)

In [14]:
fname = 'outputs_JZ_2022/cellphone_counts_norm_log_%s.h5ad'%label
adata.write(fname)

... storing 'library' as categorical
... storing 'library2' as categorical
... storing 'sample' as categorical
... storing 'patient' as categorical
... storing 'pT stage' as categorical
... storing 'seq_date' as categorical
... storing 'beads' as categorical
... storing 'operation' as categorical
... storing 'sex' as categorical
... storing 'tissue' as categorical
... storing 'necrosis' as categorical
... storing 'closest_JZ_kidney' as categorical
... storing 'closest_JZ_kidney_hvg' as categorical
... storing 'sp_cl_43' as categorical
... storing 'cell_type' as categorical
... storing 'broad_cell_type' as categorical


In [15]:
df_meta = pd.DataFrame(data={'Cell': list(adata.obs.index), 
                             'cell_type': list(adata.obs['cell_type'])})



In [17]:
df_meta.set_index('Cell',inplace=True)

In [18]:
df_meta

Unnamed: 0_level_0,cell_type
Cell,Unnamed: 1_level_1
2,TAM 4
19,TAM 3
363,Tumor cells 2
433,Tumor cells 3
444,Tumor vasculature 4
...,...
4865536,TAM 2
4865584,TAM 2
4865642,TAM 3
4865726,Tumor cells 3


In [19]:
df_meta.to_csv('outputs_JZ_2022/cellphone_meta_%s.txt'%label, sep='\t')