In [1]:
import os
# Move the working directory one level up so later relative paths resolve
# against the parent of the directory the kernel started in.
# Gotcha: the path is relative, so this is not idempotent -- every re-run of
# this cell climbs one more level. Restart the kernel before re-running.
os.chdir('../')
# Set CUDA_VISIBLE_DEVICES to '0' for this process.
# NOTE(review): presumably meant to pin GPU-using libraries to the first
# device; that only works if it is set before such a library initialises
# CUDA -- confirm nothing GPU-related is imported earlier.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import scanpy as sc
import json
# Base directory for every file this notebook reads or writes.
# The trailing '/' is load-bearing: later cells build file names by plain
# concatenation (f'{save_path}conditions_order_...'), so do not strip it.
save_path = os.path.expanduser('~/Documents/')

In [3]:
def takeSecond(elem):
    """Return the item at index 1 of ``elem``; used as a sort key for pairs."""
    second_item = elem[1]
    return second_item

def create_conditions_order(adata, condition_key):
    """Build nested reference/query splits of the conditions, largest first.

    The distinct values of ``adata.obs[condition_key]`` are ranked by the
    number of observations carrying them (descending; ties keep the order in
    which the values first appear). Split number ``k`` (1-based) puts the
    ``k`` largest conditions into ``'reference'`` and the rest into
    ``'query'``.

    Parameters
    ----------
    adata
        Object exposing a pandas-style ``obs`` table (an AnnData object in
        this notebook). Only ``adata.obs`` is read.
    condition_key : str
        Column of ``adata.obs`` holding the condition label of each
        observation.

    Returns
    -------
    list of dict
        ``n_conditions - 1`` dicts with the keys ``'reference'`` and
        ``'query'``, each mapping to a list of condition labels. The list is
        empty when there are fewer than two conditions.
    """
    labels = adata.obs[condition_key]
    # Count rows on the obs column directly. Copying every per-condition
    # subset of the data matrix just to read its row count is needlessly
    # expensive in time and memory.
    sizes = [
        (condition, int(labels.isin([condition]).sum()))
        for condition in labels.unique().tolist()
    ]
    # sorted() stays stable with reverse=True, so equally sized conditions
    # keep their first-appearance order.
    ranked = [
        condition
        for condition, _ in sorted(sizes, key=lambda pair: pair[1], reverse=True)
    ]
    return [
        {'reference': ranked[:n_reference], 'query': ranked[n_reference:]}
        for n_reference in range(1, len(ranked))
    ]

# Create and save the conditions_order_pancreas.txt file

In [4]:
condition_key = 'study'
# save_path already ends with a separator; os.path.join copes with that and
# avoids the doubled '/' that plain string concatenation produced.
full_path = os.path.join(save_path, 'datasets', 'pancreas_normalized.h5ad')
adata = sc.read(full_path)
print(adata)
# Report the number of observations per study. Counting on the obs column
# avoids copying each per-study subset just to read its row count.
study_labels = adata.obs[condition_key]
for condition in study_labels.unique().tolist():
    print(condition, int(study_labels.isin([condition]).sum()))

AnnData object with n_obs × n_vars = 15681 × 1000
    obs: 'batch', 'study', 'cell_type', 'size_factors'
Pancreas inDrop 8391
Pancreas CelSeq2 2426
Pancreas CelSeq 1271
Pancreas Fluidigm C1 632
Pancreas SS2 2961


  if not is_categorical(df_full[k]):


In [5]:
# Build the nested reference/query splits for the currently loaded dataset.
condition_list = create_conditions_order(adata, condition_key)
# Bare last expression: the notebook renders the list as the cell output.
condition_list

[{'reference': ['Pancreas inDrop'],
  'query': ['Pancreas SS2',
   'Pancreas CelSeq2',
   'Pancreas CelSeq',
   'Pancreas Fluidigm C1']},
 {'reference': ['Pancreas inDrop', 'Pancreas SS2'],
  'query': ['Pancreas CelSeq2', 'Pancreas CelSeq', 'Pancreas Fluidigm C1']},
 {'reference': ['Pancreas inDrop', 'Pancreas SS2', 'Pancreas CelSeq2'],
  'query': ['Pancreas CelSeq', 'Pancreas Fluidigm C1']},
 {'reference': ['Pancreas inDrop',
   'Pancreas SS2',
   'Pancreas CelSeq2',
   'Pancreas CelSeq'],
  'query': ['Pancreas Fluidigm C1']}]

In [6]:
# Serialise the reference/query splits as JSON (the file keeps a .txt suffix).
pancreas_order_path = f'{save_path}conditions_order_pancreas.txt'
with open(pancreas_order_path, 'w') as out_file:
    json.dump(condition_list, out_file)

# Load the saved conditions order from the existing .txt file

In [7]:
# Read the JSON-encoded reference/query splits back from disk.
pancreas_order_path = f'{save_path}conditions_order_pancreas.txt'
with open(pancreas_order_path) as in_file:
    condition_list = json.load(in_file)

In [8]:
# Walk through the splits in order and show which conditions act as
# reference and which as query (target) in each numbered test.
for test_number, split in enumerate(condition_list, start=1):
    ref_conditions = split['reference']
    target_conditions = split['query']
    print("\nTEST:", test_number)
    print("Reference:", ref_conditions)
    print("Target:", target_conditions)


TEST: 1
Reference: ['Pancreas inDrop']
Target: ['Pancreas SS2', 'Pancreas CelSeq2', 'Pancreas CelSeq', 'Pancreas Fluidigm C1']

TEST: 2
Reference: ['Pancreas inDrop', 'Pancreas SS2']
Target: ['Pancreas CelSeq2', 'Pancreas CelSeq', 'Pancreas Fluidigm C1']

TEST: 3
Reference: ['Pancreas inDrop', 'Pancreas SS2', 'Pancreas CelSeq2']
Target: ['Pancreas CelSeq', 'Pancreas Fluidigm C1']

TEST: 4
Reference: ['Pancreas inDrop', 'Pancreas SS2', 'Pancreas CelSeq2', 'Pancreas CelSeq']
Target: ['Pancreas Fluidigm C1']


# Same Example for Mouse Brain

In [9]:
condition_key = 'study'
# save_path already ends with a separator; os.path.join copes with that and
# avoids the doubled '/' that plain string concatenation produced.
full_path = os.path.join(save_path, 'datasets', 'mouse_brain_normalized.h5ad')
adata = sc.read(full_path)
print(adata)
# Report the number of observations per study. Counting on the obs column
# avoids copying each per-study subset just to read its row count.
study_labels = adata.obs[condition_key]
for condition in study_labels.unique().tolist():
    print(condition, int(study_labels.isin([condition]).sum()))

AnnData object with n_obs × n_vars = 56399 × 2000
    obs: 'Age', 'Subclass', 'Taxonomy_group', 'Tissue', 'age', 'batch', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'class', 'cluster', 'cluster_id', 'louvain', 'reason', 'refined_class', 'region', 'region_subcluster', 'sample_type', 'study', 'subcluster', 'n_counts'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
Saunders 34502
Zeisel 7394
Rosenberg 6647
Tabula_muris 7856


In [10]:
# Build the nested reference/query splits for the currently loaded dataset.
condition_list = create_conditions_order(adata, condition_key)
# Bare last expression: the notebook renders the list as the cell output.
condition_list

[{'reference': ['Saunders'], 'query': ['Tabula_muris', 'Zeisel', 'Rosenberg']},
 {'reference': ['Saunders', 'Tabula_muris'], 'query': ['Zeisel', 'Rosenberg']},
 {'reference': ['Saunders', 'Tabula_muris', 'Zeisel'], 'query': ['Rosenberg']}]

In [11]:
# Serialise the reference/query splits as JSON (the file keeps a .txt suffix).
brain_order_path = f'{save_path}conditions_order_brain.txt'
with open(brain_order_path, 'w') as out_file:
    json.dump(condition_list, out_file)

In [12]:
# Read the JSON-encoded reference/query splits back from disk.
brain_order_path = f'{save_path}conditions_order_brain.txt'
with open(brain_order_path) as in_file:
    condition_list = json.load(in_file)

In [13]:
# Walk through the splits in order and show which conditions act as
# reference and which as query (target) in each numbered test.
for test_number, split in enumerate(condition_list, start=1):
    ref_conditions = split['reference']
    target_conditions = split['query']
    print("\nTEST:", test_number)
    print("Reference:", ref_conditions)
    print("Target:", target_conditions)


TEST: 1
Reference: ['Saunders']
Target: ['Tabula_muris', 'Zeisel', 'Rosenberg']

TEST: 2
Reference: ['Saunders', 'Tabula_muris']
Target: ['Zeisel', 'Rosenberg']

TEST: 3
Reference: ['Saunders', 'Tabula_muris', 'Zeisel']
Target: ['Rosenberg']
