In [8]:
import scanpy as sc
import pandas as pd
import decoupler as dc
import numpy as np
from pprint import pprint

from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [3]:
adata = sc.read('../../pipeline/data/pp/hlca_healthy_ipf_embed_balanced_final.h5ad')
adata



AnnData object with n_obs × n_vars = 450214 × 30
    obs: "3'_or_5'", 'BMI', 'age_or_mean_of_age_range', 'age_range', 'anatomical_region_ccf_score', 'ancestry', 'ann_coarse_for_GWAS_and_modeling', 'ann_finest_level', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'assay', 'cause_of_death', 'cell_type', 'core_or_extension', 'dataset', 'development_stage', 'disease', 'donor_id', 'fresh_or_frozen', 'log10_total_counts', 'lung_condition', 'mixed_ancestry', 'original_ann_level_1', 'original_ann_level_2', 'original_ann_level_3', 'original_ann_level_4', 'original_ann_level_5', 'original_ann_nonharmonized', 'reannotation_type', 'sample', 'scanvi_label', 'sequencing_platform', 'sex', 'smoking_status', 'study', 'subject_type', 'suspension_type', 'tissue', 'tissue_coarse_unharmonized', 'tissue_detailed_unharmonized', 'tissue_dissociation_protocol', 'tissue_level_2', 'tissue_level_3', 'tissue_sampling_method', 'total_counts', 'transf_ann_level_1_label', 'transf_ann_leve

In [4]:
condition_key = 'lung_condition_coarse'
sample_key = 'sample'
label_key = 'ann_level_3_label_final'

In [5]:
adata.obs[condition_key] = adata.obs[condition_key].astype('category')
adata.obs[sample_key] = adata.obs[sample_key].astype('category')
adata.obs[label_key] = adata.obs[label_key].astype('category')

In [18]:
adata.obs[sample_key] = adata.obs[sample_key].astype(str)

rename_dict = {name: number for number, name in enumerate(np.unique(adata.obs[condition_key]))}
ct_to_keep = list(np.unique(adata.obs[label_key]))
standartize = True

In [22]:
celltype = 'h'
ct = 3
donor = 4
sample = 4

In [25]:
dict([(celltype, ct), (donor, sample), ('count', 0)])

{'h': 3, 4: 4, 'count': 0}

In [62]:
def create_frequency_dataset(adata, celltype, donor, condition, standartize, rename_dict, ct_to_keep):
    missing_ct = list(set(ct_to_keep) - set(adata.obs[celltype]))
    df = adata.obs[[celltype, donor]].groupby([celltype, donor]).size().reset_index(name='count')

    unique_samples = np.unique(adata.obs[donor])
    missing_df = {celltype: [], donor: [], 'count': []}
    for ct in missing_ct:
        for sample in unique_samples:
            missing_df[celltype].append(ct)
            missing_df[donor].append(sample)
            missing_df['count'].append(0)
    missing_df = dict(missing_df)
    missing_df = pd.DataFrame(missing_df)

    df = pd.concat([df, missing_df])
    df = df.reset_index()
    
    X = []
    y = []

    for sample in df[donor].unique():
        df_sample = df[df[donor] == sample]
        df_sample = df_sample.sort_values(celltype)
        X.append(df_sample['count'].values)
        y.append(rename_dict[adata[adata.obs[donor] == sample].obs[condition][0]])

    X = np.array(X)
    y = np.array(y)

    # drop donors with less than 10 cells in total
    idx = np.argwhere(np.sum(X, axis=1) <= 300)
    X = np.delete(X, idx, axis=0)
    y = np.delete(y, idx)
    
    if standartize is True:
        X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
    
    return X, y

In [67]:
i = 4
 
train = adata[adata.obs[f'split{i}'] == 'train'].copy()
val = adata[adata.obs[f'split{i}'] == 'val'].copy()
#train data
x, y = create_frequency_dataset(
    train,
    celltype=label_key,
    donor=sample_key,
    condition=condition_key,
    standartize=standartize,
    rename_dict=rename_dict,
    ct_to_keep=ct_to_keep,
)
print("Train shapes:")
print(f"x.shape = {x.shape}")
print(f"y.shape = {y.shape}")
#val data
x_val, y_val = create_frequency_dataset(
    val,
    celltype=label_key,
    donor=sample_key,
    condition=condition_key,
    standartize=standartize,
    rename_dict=rename_dict,
    ct_to_keep=ct_to_keep,
)
print("Val shapes:")
print(f"x_val.shape = {x_val.shape}")
print(f"y_val.shape = {y_val.shape}")

  df = adata.obs[[celltype, donor]].groupby([celltype, donor]).size().reset_index(name='count')
  df = pd.concat([df, missing_df])
  y.append(rename_dict[adata[adata.obs[donor] == sample].obs[condition][0]])


Train shapes:
x.shape = (110, 25)
y.shape = (110,)


  df = adata.obs[[celltype, donor]].groupby([celltype, donor]).size().reset_index(name='count')
  y.append(rename_dict[adata[adata.obs[donor] == sample].obs[condition][0]])


Val shapes:
x_val.shape = (18, 25)
y_val.shape = (18,)


  X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
