In [1]:
import scanpy as sc
import os,sys
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import matplotlib.ticker as mticker

In [2]:
# add the utility function folder to PATH
# (os.path.abspath resolves the folder relative to the current working
# directory, so the notebook has to be started next to that folder)
sys.path.append(os.path.abspath("utility_functions_190403_12h24/"))

# NOTE(review): the star import hides which names this notebook relies on;
# presumably it is also what prints the "python version" line in the saved
# output of this cell -- confirm.
from rz_import_statements import *
import rz_functions as rz
import rz_utility_spring as srz

python version: 3.8.8


# Load data

In [3]:
# Read the AnnData object from the .h5ad backup.
# NOTE(review): the file name mentions 147456x33538, while the obs table
# assigned in the next cell has 51196 rows (see its saved output); presumably
# the dimensions in the name do not describe this object -- confirm.
adata = sc.read_h5ad('backups_JZ_2022/no_T4_lib_mito20_umi400_filt_raw_147456x33538_220120_14h39_kidney.h5ad')


In [4]:
# overwrite obs with the most recent version
# The .npz archive is unpacked straight into the pd.DataFrame constructor, so
# its entries must be named after DataFrame keyword arguments.
# NOTE(review): allow_pickle=True lets numpy unpickle arbitrary objects while
# loading; only use this with backup files from a trusted source.
filename = 'backups_JZ_2022/no_T4_1_obs_info_51196x5_220120_14h47_kidney.npz'
encoding = 'latin1'

with np.load(filename,encoding=encoding, allow_pickle = True) as f:
    obs = pd.DataFrame(**f)
# Replaces the per-cell annotation table of adata with the loaded one.
adata.obs = obs

In [5]:
# Show the obs table that was just loaded (the saved output is truncated).
adata.obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample
2,N14,449.0,0.668151,T2_1,T2
19,N14,449.0,4.231626,T2_1,T2
363,N14,1229.0,10.659073,T2_1,T2
413,N14,732.0,4.781421,T2_1,T2
433,N14,432.0,6.944445,T2_1,T2
...,...,...,...,...,...
4865536,Tumor0228,683.0,9.370424,T4_old,T4
4865584,Tumor0228,1473.0,6.517312,T4_old,T4
4865642,Tumor0228,498.0,9.437751,T4_old,T4
4865726,Tumor0228,421.0,19.239906,T4_old,T4


## Adding clinical information to data

All clinical variables that are added here (and more) can be found in Supplementary table S1. 

In [6]:
#adding patient ID
# Print one '"<sample>":"",' line per unique sample name, in sorted order.
# The printed lines have the shape of dict entries; presumably they are
# copied into the next cell and filled in by hand -- confirm.
for i in sorted(adata.obs['sample'].unique()):
    print('"%s":"",'%i)

"N1":"",
"N2":"",
"N3":"",
"N4":"",
"N5":"",
"N6":"",
"N7":"",
"N8":"",
"N9":"",
"T2":"",
"T3":"",
"T4":"",
"T5":"",
"T6":"",
"T7":"",
"T8":"",
"T9":"",


In [7]:
# Sample ID -> patient ID. A healthy (N*) and a tumor (T*) sample that share
# a number belong to the same patient; there is no T1 entry, so P1 is only
# reachable through N1.
renamer = {"N%d" % n: "P%d" % n for n in range(1, 10)}
renamer.update(("T%d" % n, "P%d" % n) for n in range(2, 10))

# Look up every cell's sample; an unknown sample name raises KeyError.
adata.obs['patient'] = list(map(renamer.__getitem__, adata.obs['sample']))
adata.obs['patient']

2          P2
19         P2
363        P2
413        P2
433        P2
           ..
4865536    P4
4865584    P4
4865642    P4
4865726    P4
4866011    P4
Name: patient, Length: 51196, dtype: object

In [8]:
# Disease stage per sample.
# The "H_" prefix stands for "healthy" tissue (adjacent healthy). For N2-N9
# the label is "H_" plus the stage of the tumor sample with the same number.
renamer = dict(
    # adjacent healthy samples
    N1="H_pT3a", N2="H_pT3a", N3="H_pT1a",
    N4="H_pT3a", N5="H_pT1a", N6="H_pT3a",
    N7="H_pT3a", N8="H_pT1a", N9="H_pT3a",
    # tumor samples
    T2="pT3a", T3="pT1a", T4="pT3a", T5="pT1a",
    T6="pT3a", T7="pT3a", T8="pT1a", T9="pT3a",
)

# One lookup per cell; a sample missing from the table raises KeyError.
adata.obs['pT stage'] = [renamer[sample_id] for sample_id in adata.obs['sample']]
adata.obs['pT stage']

2          pT3a
19         pT3a
363        pT3a
413        pT3a
433        pT3a
           ... 
4865536    pT3a
4865584    pT3a
4865642    pT3a
4865726    pT3a
4866011    pT3a
Name: pT stage, Length: 51196, dtype: object

In [13]:
# Operation type per patient: P6, P7 and P9 are "Laparoscopic",
# every other patient (P1-P5, P8) is "Open".
renamer = {
    "P%d" % n: ("Laparoscopic" if n in (6, 7, 9) else "Open")
    for n in range(1, 10)
}

# Relies on the 'patient' column created earlier in the notebook.
adata.obs['operation'] = list(map(renamer.__getitem__, adata.obs['patient']))
adata.obs['operation']

2          Open
19         Open
363        Open
413        Open
433        Open
           ... 
4865536    Open
4865584    Open
4865642    Open
4865726    Open
4866011    Open
Name: operation, Length: 51196, dtype: object

In [14]:
# Sex per patient: P1, P5, P8 and P9 are "Female", the remaining patients
# (P2, P3, P4, P6, P7) are "Male".
renamer = {
    "P%d" % n: ("Female" if n in (1, 5, 8, 9) else "Male")
    for n in range(1, 10)
}

# Relies on the 'patient' column created earlier in the notebook.
adata.obs['sex'] = [renamer[patient_id] for patient_id in adata.obs['patient']]
adata.obs['sex']

2          Male
19         Male
363        Male
413        Male
433        Male
           ... 
4865536    Male
4865584    Male
4865642    Male
4865726    Male
4866011    Male
Name: sex, Length: 51196, dtype: object

In [15]:
# Intermediate look at the obs table.
# NOTE(review): the saved output already contains 'seq_date' and 'beads',
# which are added in the "Adding technical information" section further down
# (cells In [9]-In [12] were executed before In [13]). On a fresh
# top-to-bottom run the column order here, and in the saved file, will differ.
adata.obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample,patient,pT stage,seq_date,beads,operation,sex
2,N14,449.0,0.668151,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male
19,N14,449.0,4.231626,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male
363,N14,1229.0,10.659073,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male
413,N14,732.0,4.781421,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male
433,N14,432.0,6.944445,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male
...,...,...,...,...,...,...,...,...,...,...,...
4865536,Tumor0228,683.0,9.370424,T4_old,T4,P4,pT3a,old,old,Open,Male
4865584,Tumor0228,1473.0,6.517312,T4_old,T4,P4,pT3a,old,old,Open,Male
4865642,Tumor0228,498.0,9.437751,T4_old,T4,P4,pT3a,old,old,Open,Male
4865726,Tumor0228,421.0,19.239906,T4_old,T4,P4,pT3a,old,old,Open,Male


In [16]:
# Tumor size in mm per sample. Healthy samples get the string "Healthy"
# instead of a number, so the resulting column mixes strings and integers
# (the saved output reports dtype object).
renamer = dict.fromkeys(("N%d" % n for n in range(1, 10)), "Healthy")
renamer.update(T2=75, T3=37, T4=120, T5=35, T6=30, T7=47, T8=33, T9=60)

# One lookup per cell; a sample missing from the table raises KeyError.
adata.obs['tumor size, mm'] = list(map(renamer.__getitem__, adata.obs['sample']))
adata.obs['tumor size, mm']

2           75
19          75
363         75
413         75
433         75
          ... 
4865536    120
4865584    120
4865642    120
4865726    120
4866011    120
Name: tumor size, mm, Length: 51196, dtype: object

In [17]:
# Patient age at sample collection, listed in patient order P1..P9.
renamer = dict(zip(
    ("P%d" % n for n in range(1, 10)),
    (52, 60, 63, 62, 65, 68, 43, 65, 61),
))

# Relies on the 'patient' column created earlier in the notebook.
adata.obs['age'] = [renamer[patient_id] for patient_id in adata.obs['patient']]
adata.obs['age']

2          60
19         60
363        60
413        60
433        60
           ..
4865536    62
4865584    62
4865642    62
4865726    62
4866011    62
Name: age, Length: 51196, dtype: int64

In [18]:
# Broad category for sample origin: N1-N9 are "Healthy", T2-T9 are "Tumor".
renamer = dict.fromkeys(("N%d" % n for n in range(1, 10)), "Healthy")
renamer.update(dict.fromkeys(("T%d" % n for n in range(2, 10)), "Tumor"))

# One lookup per cell; a sample missing from the table raises KeyError.
adata.obs['tissue'] = list(map(renamer.__getitem__, adata.obs['sample']))
adata.obs['tissue']

2          Tumor
19         Tumor
363        Tumor
413        Tumor
433        Tumor
           ...  
4865536    Tumor
4865584    Tumor
4865642    Tumor
4865726    Tumor
4866011    Tumor
Name: tissue, Length: 51196, dtype: object

In [19]:
# Clinical evaluation of necrosis in the tissue.
# Healthy samples (N1-N9) are labelled "Healthy"; among the tumor samples
# only T4 and T7 are "Positive", the others are "Negative".
renamer = dict.fromkeys(("N%d" % n for n in range(1, 10)), "Healthy")
renamer.update(
    ("T%d" % n, "Positive" if n in (4, 7) else "Negative")
    for n in range(2, 10)
)

# One lookup per cell; a sample missing from the table raises KeyError.
adata.obs['necrosis'] = [renamer[sample_id] for sample_id in adata.obs['sample']]
adata.obs['necrosis']

2          Negative
19         Negative
363        Negative
413        Negative
433        Negative
             ...   
4865536    Positive
4865584    Positive
4865642    Positive
4865726    Positive
4866011    Positive
Name: necrosis, Length: 51196, dtype: object

## Adding technical information

In [9]:
#adding sequencing run date
# Print one '"<library>":"",' line per unique library name, in sorted order.
# The printed lines have the shape of dict entries; presumably they are
# copied into the following cells and filled in by hand -- confirm.
for i in sorted(adata.obs['library'].unique()):
    print('"%s":"",'%i)

"0621Normal":"",
"0621Tumor":"",
"0704Normal":"",
"0818T_S1":"",
"0903T_2_S3":"",
"0903T_S2":"",
"0914T_2_S5":"",
"0914T_S4":"",
"0923T_2_S7":"",
"0923T_S6":"",
"1116N_S11":"",
"1116T1_S8":"",
"1116T2_S9":"",
"1116T3_1_S10":"",
"Healthy0228":"",
"Healthy0314":"",
"N093_S4":"",
"N14":"",
"N21":"",
"N28":"",
"N818_S2":"",
"N914_S6":"",
"N923_S8":"",
"T14":"",
"T21":"",
"T818_S1":"",
"T903_S3":"",
"T914_S5":"",
"T923_S7":"",
"Tumor0228":"",
"Tumor0314":"",


In [10]:
# Sequencing run date per library (format as written in the table, e.g.
# "21_04_14"). The 0621*/0704*/Healthy*/Tumor* libraries get the label "old"
# instead of a date.
# NOTE(review): "T28" does not appear among the library names printed by the
# previous cell, so that entry is currently unused -- confirm that no library
# is missing from the data.
renamer = {
"0621Normal":"old",
"0621Tumor":"old",
"0704Normal":"old",
    
"0818T_S1":"21_04_14",
"0903T_2_S3":"21_04_14",
"0903T_S2":"21_04_14",
"0914T_2_S5":"21_04_14",
"0914T_S4":"21_04_14",
"0923T_2_S7":"21_04_14",
"0923T_S6":"21_04_14",
"1116N_S11":"21_04_14",
"1116T1_S8":"21_04_14",
"1116T2_S9":"21_04_14",
"1116T3_1_S10":"21_04_14",
    
"Healthy0228":"old",
"Healthy0314":"old",
    
"N093_S4":"20_11_17",
    
"N14":"20_11_12",
"N21":"20_11_12",
"N28":"20_11_12",
"N818_S2":"20_11_17",
"N914_S6":"20_11_17",
"N923_S8":"20_11_17",
"T14":"20_11_12",
"T21":"20_11_12",
"T28":"20_11_12",
"T818_S1":"20_11_17",
"T903_S3":"20_11_17",
"T914_S5":"20_11_17",
"T923_S7":"20_11_17",
    
"Tumor0228":"old",
"Tumor0314":"old"
}

# One lookup per cell; a library missing from the table raises KeyError.
adata.obs['seq_date'] = [renamer[i] for i in adata.obs['library']]
adata.obs['seq_date']

2          20_11_12
19         20_11_12
363        20_11_12
413        20_11_12
433        20_11_12
             ...   
4865536         old
4865584         old
4865642         old
4865726         old
4866011         old
Name: seq_date, Length: 51196, dtype: object

In [11]:
# Adding the barcoding bead version used in the experiment.
# "old" broadly refers to bead version v1, and "v2020" to version v2,
# purchased from Atrandi Biosciences, cat. no. DG-BHB-C.
# NOTE(review): "T28" does not appear among the library names printed earlier
# in this section, so that entry is currently unused -- confirm.

renamer = {
"0621Normal":"old",
"0621Tumor":"old",
"0704Normal":"old",
    
"0818T_S1":"v2020",
"0903T_2_S3":"v2020",
"0903T_S2":"v2020",
"0914T_2_S5":"v2020",
"0914T_S4":"v2020",
"0923T_2_S7":"v2020",
"0923T_S6":"v2020",
"1116N_S11":"v2020",
"1116T1_S8":"v2020",
"1116T2_S9":"v2020",
"1116T3_1_S10":"v2020",
    
"Healthy0228":"old",
"Healthy0314":"old",
    
"N093_S4":"v2020",
    
"N14":"old",
"N21":"old",
"N28":"old",
"N818_S2":"v2020",
"N914_S6":"v2020",
"N923_S8":"v2020",
"T14":"old",
"T21":"old",
"T28":"old",
"T818_S1":"v2020",
"T903_S3":"v2020",
"T914_S5":"v2020",
"T923_S7":"v2020",
    
"Tumor0228":"old",
"Tumor0314":"old"
}

# One lookup per cell; a library missing from the table raises KeyError.
adata.obs['beads'] = [renamer[i] for i in adata.obs['library']]
adata.obs['beads']

2          old
19         old
363        old
413        old
433        old
          ... 
4865536    old
4865584    old
4865642    old
4865726    old
4866011    old
Name: beads, Length: 51196, dtype: object

In [12]:
# Sanity check: list the distinct bead labels that ended up in the column.
adata.obs['beads'].unique()

array(['old', 'v2020'], dtype=object)

In [20]:
# Final look at the obs table, which is what the next section writes to disk.
adata.obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample,patient,pT stage,seq_date,beads,operation,sex,"tumor size, mm",age,tissue,necrosis
2,N14,449.0,0.668151,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male,75,60,Tumor,Negative
19,N14,449.0,4.231626,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male,75,60,Tumor,Negative
363,N14,1229.0,10.659073,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male,75,60,Tumor,Negative
413,N14,732.0,4.781421,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male,75,60,Tumor,Negative
433,N14,432.0,6.944445,T2_1,T2,P2,pT3a,20_11_12,old,Open,Male,75,60,Tumor,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4865536,Tumor0228,683.0,9.370424,T4_old,T4,P4,pT3a,old,old,Open,Male,120,62,Tumor,Positive
4865584,Tumor0228,1473.0,6.517312,T4_old,T4,P4,pT3a,old,old,Open,Male,120,62,Tumor,Positive
4865642,Tumor0228,498.0,9.437751,T4_old,T4,P4,pT3a,old,old,Open,Male,120,62,Tumor,Positive
4865726,Tumor0228,421.0,19.239906,T4_old,T4,P4,pT3a,old,old,Open,Male,120,62,Tumor,Positive


## Saving the updated obs file

In [21]:
# Build the backup file name from the obs table's dimensions (rows x columns)
# and the string returned by rz.now() (a timestamp, judging by the saved
# output), then write the table with the project's save helper.
fname = 'backups_JZ_2022/clinical_obs_info_%dx%d_%s.npz' % (*adata.obs.shape, rz.now())
print(fname)
rz.save_df(adata.obs, fname)

backups_JZ_2022/clinical_obs_info_51196x15_220120_14h54.npz


  d['descr'] = dtype_to_descr(array.dtype)
