Prepare an Excel file with metadata from the adatas for an easier overview of the available samples.

In [97]:
import scanpy as sc
import pandas as pd
import os
import pickle

In [2]:
# Root directory of the scRNA-seq datasets. Keep the trailing slash: file
# paths are built from it by plain string concatenation.
path_rna = '/lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/'

In [99]:
# All datasets, one tuple per dataset:
#   (species, dataset name, directory relative to path_rna holding the adata)
# The dataset name is also used as the sheet name in the metadata workbook.
datasets = [
    ('human', 'GSE83139', 'GSE83139/GEO/'),
    ('human', 'GSE154126', 'GSE154126/GEO/'),
    ('human', 'GSE101207', 'GSE101207/GEO/'),
    ('human', 'GSE124742_GSE164875_patch', 'GSE124742_GSE164875/GEO/patch/'),
    ('human', 'GSE124742_FACS', 'GSE124742_GSE164875/GEO/FACS/'),
    ('human', 'GSE86469', 'GSE86469/GEO/'),
    ('human', 'GSE81547', 'GSE81547/GEO/'),
    ('human', 'GSE198623', 'P21000/sophie/human/'),
    ('human', 'GSE81608', 'GSE81608/GEO/'),
    ('human', 'GSE148073', 'GSE148073/GEO/'),
    ('mouse', 'GSE137909', 'GSE137909/GEO/'),
    ('mouse', 'GSE83146', 'GSE83146/GEO/'),
]

In [100]:
# Save info about all datasets
# Use a context manager so the pickle file is flushed and closed right after
# dumping instead of leaving the handle open for the rest of the session.
with open(path_rna+'external_info.pkl','wb') as f:
    pickle.dump(datasets,f)

In [101]:
# Save metadata of all datasets: one Excel sheet per dataset, one row per
# unique combination of the kept (sample-level) obs columns, together with
# the number of beta cells in that group.
def uncategorize(col):
    """Return ``col`` with a categorical dtype replaced by its categories' dtype.

    Non-categorical columns are returned unchanged. Done before filling
    missing values with the string 'NA', which is not an existing category.
    """
    if col.dtype.name == 'category':
        return col.astype(col.cat.categories.dtype)
    return col

# Build all per-dataset tables first and write the workbook only afterwards,
# so that a failed donor check does not leave a partially written file.
tables=[]
for species,name,d in datasets:
    print(species,name)
    # Load adata - filtered if exists
    path_filtered=path_rna+d+'adata_filtered.h5ad'
    path_adata=path_filtered if os.path.exists(path_filtered) else path_rna+d+'adata.h5ad'
    # Only obs is needed: open in backed mode, copy obs, release the file
    adata=sc.read(path_adata,backed='r')
    obs=adata.obs.copy()
    adata.file.close()
    donors_all=set(obs.donor.unique()) if 'donor' in obs.columns else set()
    # All data cols
    print('All cols:',list(obs.columns))
    # Remove columns not useful for saving and grouping:
    # organ/organism/tissue/cell info/geo_accession
    # (not added to all samples now, and may not be sample specific,
    # e.g. could be cell specific)
    cols_keep=[c for c in obs.columns
               if 'organ' not in c and 'tissue' not in c and 'cell_' not in c
               and 'geo_accession' not in c]
    # Groups with missing values must not be dropped when grouping; relying on
    # groupby options alone did not work, thus fill NaN with the string 'NA',
    # for which categorical columns must be converted first.
    obs=obs.apply(uncategorize,axis=0).fillna('NA')
    # N beta cells per group: flag beta cells and sum the flag within groups
    # made from the kept cols (yields kept cols + N_beta_cells, sorted by group)
    obs['N_beta_cells']=(obs['cell_type']=='beta')
    obs=(obs.groupby(cols_keep,observed=True,dropna=False)['N_beta_cells']
         .sum().reset_index())
    # Add species
    obs['organism']=species
    # Display
    print('Saved cols:',list(obs.columns))
    display(obs)
    # Make sure donor (if exists) is unique to a single group - grouping worked as expected
    if 'donor' in obs.columns and obs['donor'].duplicated().any():
        raise ValueError('Duplicated donor')
    # Make sure all donors are kept
    if donors_all and donors_all!=set(obs.donor.unique()):
        raise ValueError('Donors not matching')
    tables.append((name,obs))

# ExcelWriter.save() was removed in pandas 2.0; the context manager saves and
# closes the workbook on exit.
with pd.ExcelWriter(path_rna+'external_metadata.xlsx',
                    engine='xlsxwriter') as writer:
    for name,table in tables:
        table.to_excel(writer,sheet_name=name,index=False)

human GSE83139
All cols: ['tissue', 'disease', 'cell_type', 'geo_accession', 'organ', 'organism', 'donor', 'age', 'sex', 'ethnicity', 'BMI', 'cultured_days', 'cell_type_original']
Saved cols: ['disease', 'donor', 'age', 'sex', 'ethnicity', 'BMI', 'cultured_days', 'N_beta_cells', 'organism']


Unnamed: 0,disease,donor,age,sex,ethnicity,BMI,cultured_days,N_beta_cells,organism
0,T1D,ACGI428,23 y,male,,25.0,10.0,6,human
1,T2D,HP-15041,57 y,male,african_american,23.98,4.0,5,human
2,T2D,HP-15085,37 y,female,white,39.3,4.0,16,human
3,T2D,HP-15085: cultured,37 y,female,white,39.3,12.0,17,human
4,healthy,AAJF122,52 y,male,asian,29.1,6.0,1,human
5,healthy,ABAF490,39 y,female,white,45.2,4.0,30,human
6,healthy,ACAP236,21 y,male,white,39.0,2.0,17,human
7,healthy,ICRH76,2 y,male,white,13.6,2.0,4,human
8,healthy,ICRH80,19 m,female,white,18.0,3.0,15,human


human GSE154126
All cols: ['geo_accession', 'disease', 'cell_type_original', 'donor', 'age', 'sex', 'ethnicity', 'BMI', 'cell_type']
Saved cols: ['disease', 'donor', 'age', 'sex', 'ethnicity', 'BMI', 'N_beta_cells', 'organism']


Unnamed: 0,disease,donor,age,sex,ethnicity,BMI,N_beta_cells,organism
0,T2D,ACIA085_APT2D,64 y,male,european_american,33.16,0,human
1,T2D,ADDU206,41 y,male,hispanic,26.2,1,human
2,T2D,AEBK009,53 y,male,,,2,human
3,T2D,AEGY230A,52 y,female,hispanic,42.8,5,human
4,T2D,AEHL151,59 y,male,hispanic,27.7,1,human
5,T2D,HP-15041,57 y,male,african_american,23.98,4,human
6,T2D,HP-15085-01T2D,37 y,female,european_american,39.3,27,human
7,T2D,HP15269-01T2D,55 y,female,european_american,29.84,5,human
8,T2D,HP16012_1T2D,42 y,male,european_american,43.7,1,human
9,T2D,ICRH95,60 y,female,african_american,28.2,0,human


human GSE101207
All cols: ['donor', 'cell_type', 'cell_type_original', 'sex', 'age', 'BMI', 'HbA1c', 'ethnicity', 'death_cause', 'disease', 'organ', 'tissue', 'organism']
Saved cols: ['donor', 'sex', 'age', 'BMI', 'HbA1c', 'ethnicity', 'death_cause', 'disease', 'N_beta_cells', 'organism']


Unnamed: 0,donor,sex,age,BMI,HbA1c,ethnicity,death_cause,disease,N_beta_cells,organism
0,H1,male,27 y,20.6,5.4,ascian-filipino,stroke,healthy,322,human
1,H2,male,21 y,22.8,5.2,caucasian,trauma,healthy,247,human
2,H3,female,38 y,34.4,5.0,caucasian,anoxic event,healthy,2302,human
3,H4,male,52 y,22.0,5.6,caucasian,stroke,healthy,2357,human
4,H5,male,28 y,30.8,4.9,caucasian,stroke,healthy,1202,human
5,H6,male,44 y,34.6,0.054,caucasian,automobile accident,healthy,1662,human
6,T2D1,male,58 y,39.3,8.9,caucasian,anoxic event,T2D,248,human
7,T2D2,male,61 y,28.1,5.2,caucasian,cerebral vascular accident,T2D,387,human
8,T2D3,male,51 y,35.59,0.071,hispanic,stroke,T2D,1231,human


human GSE124742_GSE164875_patch
All cols: ['donor', 'age', 'sex', 'disease', 'years_diagnosis', 'cell_type_original', 'HbA1c', 'BMI', 'cell_type']
Saved cols: ['donor', 'age', 'sex', 'disease', 'years_diagnosis', 'HbA1c', 'BMI', 'N_beta_cells', 'organism']


Unnamed: 0,donor,age,sex,disease,years_diagnosis,HbA1c,BMI,N_beta_cells,organism
0,AGAH468,43 y,male,T2D,5.5,7.0,37.3,1,human
1,AGAL381,52 y,female,T2D,10.0,7.0,21.9,8,human
2,AGJU173,52 y,female,T2D,0.0,9.9,29.2,10,human
3,H2280,53 y,female,healthy,,5.7,20.8,1,human
4,H2289,62 y,female,healthy,,5.7,19.1,0,human
5,H2294,38 y,male,healthy,,6.0,35.5,0,human
6,H2296,52 y,female,healthy,,5.2,29.2,0,human
7,R079,32 y,male,T1D,17.0,9.3,21.9,1,human
8,R119,27 y,male,T1D,11.0,,18.6,11,human
9,R124,43 y,female,healthy,,5.2,24.5,0,human


human GSE124742_FACS
All cols: ['cell_type', 'donor', 'disease', 'age', 'sex', 'BMI', 'HbA1c']
Saved cols: ['donor', 'disease', 'age', 'sex', 'BMI', 'HbA1c', 'N_beta_cells', 'organism']


Unnamed: 0,donor,disease,age,sex,BMI,HbA1c,N_beta_cells,organism
0,R230,healthy,58.0 y,male,29.411765,6.2,32,human
1,R231,T2D,41.0 y,female,37.128028,6.8,25,human
2,R235,healthy,53.0 y,female,24.464602,5.7,12,human
3,R241,T2D,65.0 y,male,21.8,9.9,7,human
4,R242,healthy,46.0 y,male,20.1,5.9,246,human
5,R244,T2D,48.0 y,female,30.45,7.5,32,human
6,R247,healthy,72.0 y,male,23.9,,30,human
7,R252,healthy,26.0 y,female,25.4,5.0,46,human
8,R253,healthy,57.0 y,male,25.6,5.0,27,human
9,R256,healthy,23.0 y,male,32.5,5.4,61,human


human GSE86469


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


All cols: ['geo_accession', 'cell_type_original', 'sex', 'disease', 'age', 'ethnicity', 'BMI', 'donor', 'cell_type', 'medication', 'HbA1c']
Saved cols: ['sex', 'disease', 'age', 'ethnicity', 'BMI', 'donor', 'medication', 'HbA1c', 'N_beta_cells', 'organism']


Unnamed: 0,sex,disease,age,ethnicity,BMI,donor,medication,HbA1c,N_beta_cells,organism
0,female,T2D,42 y,hispanic,43.0,ACJV399,no,6.5,57,human
1,female,T2D,55 y,white,29.8,ACIW009,no,7.4,27,human
2,female,healthy,53 y,white,22.0,ACEK420A,no,,10,human
3,female,healthy,56 y,white,26.6,ACIB065,no,,25,human
4,male,T2D,51 y,hispanic,35.8,ACCR015A,yes,,12,human
5,male,healthy,22 y,african_american,32.95,ACCG268,no,5.4,50,human
6,male,healthy,29 y,white,23.0,ACEL337,no,5.1,71,human
7,male,healthy,30 y,african_american,55.0,ACHY057,no,5.2,12,human


human GSE81547
All cols: ['geo_accession', 'age', 'sex', 'cell_type_original', 'donor', 'ethnicity', 'BMI', 'cell_type', 'cell_name', 'disease']
Saved cols: ['age', 'sex', 'donor', 'ethnicity', 'BMI', 'disease', 'N_beta_cells', 'organism']


Unnamed: 0,age,sex,donor,ethnicity,BMI,disease,N_beta_cells,organism
0,1 m,male,DID_scRSq01,african_american,13.71,healthy,48,human
1,21 y,male,DID_scRSq04,caucasian,28.4,healthy,4,human
2,22 y,male,DID_scRSq05,asian,24.8,healthy,89,human
3,38 y,female,DID_scRSq06,african_caucasian,29.5,healthy,69,human
4,44 y,female,DID_scRSq07,american,23.8,healthy,24,human
5,5 y,male,DID_scRSq02,caucasian,17.6,healthy,35,human
6,54 y,male,DID_scRSq08,caucasian,27.29,healthy,36,human
7,6 y,male,DID_scRSq03,,,healthy,43,human


human GSE198623
All cols: ['donor', 'age', 'sex', 'BMI', 'HbA1c', 'cell_type_original', 'cell_type', 'cell_subtype_original', 'disease']
Saved cols: ['donor', 'age', 'sex', 'BMI', 'HbA1c', 'disease', 'N_beta_cells', 'organism']


Unnamed: 0,donor,age,sex,BMI,HbA1c,disease,N_beta_cells,organism
0,R229,22 y,female,23.0,5.3,healthy,1975,human
1,R237,61 y,male,19.6,5.9,healthy,2196,human
2,R239,24 y,female,22.0,5.5,healthy,1485,human
3,R245,63 y,male,22.3,5.6,healthy,2440,human
4,R266,74 y,female,29.2,6.0,healthy,3827,human


human GSE81608
All cols: ['geo_accession', 'donor', 'disease', 'age', 'ethnicity', 'sex', 'cell_type_original', 'cell_type', 'BMI', 'HbA1c']
Saved cols: ['donor', 'disease', 'age', 'ethnicity', 'sex', 'BMI', 'HbA1c', 'N_beta_cells', 'organism']


Unnamed: 0,donor,disease,age,ethnicity,sex,BMI,HbA1c,N_beta_cells,organism
0,Non T2D 1,healthy,23 y,african_american,male,21.0,,13,human
1,Non T2D 10,healthy,43 y,caucasian,male,31.7,Normal,26,human
2,Non T2D 11,healthy,31 y,hispanic,female,28.7,5.4,12,human
3,Non T2D 12,healthy,56 y,african_american,male,22.8,5.2,50,human
4,Non T2D 2,healthy,32 y,caucasian,female,19.0,5.1,13,human
5,Non T2D 3,healthy,23 y,caucasian,female,24.5,4.9,10,human
6,Non T2D 4,healthy,56 y,caucasian,female,24.1,,11,human
7,Non T2D 5,healthy,27 y,african_american,male,31.8,,24,human
8,Non T2D 6,healthy,68 y,caucasian,male,26.7,5.3,7,human
9,Non T2D 7,healthy,29 y,caucasian,male,23.4,5.1,17,human


human GSE148073
All cols: ['cell_type_original', 'cell_type', 'disease', 'donor', 'sex', 'age', 'ethnicity', 'BMI', 'years_diagnosis', 'HbA1c', 'organism']
Saved cols: ['disease', 'donor', 'sex', 'age', 'ethnicity', 'BMI', 'years_diagnosis', 'HbA1c', 'N_beta_cells', 'organism']


Unnamed: 0,disease,donor,sex,age,ethnicity,BMI,years_diagnosis,HbA1c,N_beta_cells,organism
0,AAB,HPAP019,male,22 y,caucasian,29.8,,5.2,3193,human
1,AAB,HPAP024,male,18 y,caucasian,24.3,,5.5,89,human
2,AAB,HPAP029,male,23 y,caucasian,28.6,,5.3,30,human
3,AAB,HPAP038,male,13 y,caucasian,18.34,,5.7,1074,human
4,AAB,HPAP043,male,15 y,hispanic,24.07,,5.90,281,human
5,AAB,HPAP045,female,27 y,caucasian,26.2,,5.2,1066,human
6,AAB,HPAP049,male,29 y,caucasian,37.2,,5.4,959,human
7,AAB,HPAP050,female,21 y,hispanic,28.99,,5.1,377,human
8,T1D,HPAP020,male,14 y,caucasian,13.32,,,199,human
9,T1D,HPAP021,female,13 y,caucasian,21.4,7.0,,81,human


mouse GSE137909
All cols: ['strain', 'STZ', 'insulin_implant', 'time_after_STZ', 'age', 'batch', 'cell_type_original', 'cell_subtype_original', 'cell_type', 'donor', 'sex', 'disease']
Saved cols: ['strain', 'STZ', 'insulin_implant', 'time_after_STZ', 'age', 'batch', 'donor', 'sex', 'disease', 'N_beta_cells', 'organism']


Unnamed: 0,strain,STZ,insulin_implant,time_after_STZ,age,batch,donor,sex,disease,N_beta_cells,organism
0,Ngn3-Cre; Rosa-RFP,False,False,,12 d,1,Ngn3Cre_P12_1,male,healthy,61,mouse
1,Ngn3-Cre; Rosa-RFP,False,False,,12 d,2,Ngn3Cre_P12_2,male,healthy,67,mouse
2,Ngn3-Cre; Rosa-RFP,False,False,,2 m,1,Ngn3Cre_Control_1,male,healthy,210,mouse
3,Ngn3-Cre; Rosa-RFP,False,False,,21 d,1,Ngn3Cre_P21_1,male,healthy,105,mouse
4,Ngn3-Cre; Rosa-RFP,False,False,,3 d,1,Ngn3Cre_P3_1,male,healthy,123,mouse
5,Ngn3-Cre; Rosa-RFP,False,False,,4 m,2,Ngn3Cre_Control_2,male,healthy,138,mouse
6,Ngn3-Cre; Rosa-RFP,False,False,,4 m,3,Ngn3Cre_Control_3,male,healthy,21,mouse
7,Ngn3-Cre; Rosa-RFP,True,False,2 m,4 m,1,Ngn3Cre_STZ_M2_NoIns_1,male,T2D,5,mouse
8,Ngn3-Cre; Rosa-RFP,True,False,2 m,4 m,2,Ngn3Cre_STZ_M2_NoIns_2,male,T2D,11,mouse
9,Ngn3-Cre; Rosa-RFP,True,True,12 d,2.4 m,1,Ngn3Cre_STZ_D12_1,male,T2D,16,mouse


mouse GSE83146
All cols: ['geo_accession', 'age', 'sex', 'cell_type', 'disease']
Saved cols: ['age', 'sex', 'disease', 'N_beta_cells', 'organism']


Unnamed: 0,age,sex,disease,N_beta_cells,organism
0,26 m,male,healthy,69,mouse
1,3 m,male,healthy,138,mouse


In [82]:
# Show the full path of the metadata workbook
path_rna+'external_metadata.xlsx'

'/lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/external_metadata.xlsx'