### Subsample the entire adata to get a smaller adata object

In [1]:
import scanpy as sc
from collections import Counter

In [None]:
%%time
# Load the full RNA AnnData object from disk and display its summary
adata = sc.read_h5ad(filename="07_final_RNA_without_scvi.h5ad")
adata

#### Create a function that subsamples the adata object by cell type, either proportionally or inversely proportionally to each cell type's abundance, while sampling roughly equally across donors. This is useful for operations that do not require all of the cells, such as cell type deconvolution of bulk RNA-seq datasets.

In [None]:
def sample_cells_proportionally_by_donor_id_and_cell_type(adata_metadata,
                                                          target_cells,
                                                          cell_type_key,
                                                          donor_key,
                                                          random_state = None,
                                                          inverse_proportional=False):
    """Sample rows of a metadata table, stratified by cell type and donor.

    Each cell type is given a target number of cells that is either
    proportional or inversely proportional to its abundance in
    ``adata_metadata``. Within a cell type the target is split evenly
    across the donors that have that cell type, with a minimum of one cell
    per donor. Because of that minimum and of integer truncation, the
    result contains *at most* ``target_cells`` rows, not exactly that many.

    Parameters
    ----------
    adata_metadata : pandas.DataFrame
        Per-cell metadata (e.g. ``adata.obs``). The index labels are used to
        select the sampled rows, so they should be unique.
    target_cells : int or float
        Desired total number of cells. Floats (e.g. ``n_obs * 0.05``) are
        truncated to an integer.
    cell_type_key : str
        Column holding the cell type label.
    donor_key : str
        Column holding the donor label.
    random_state : optional
        Seed (or random state object) forwarded to ``DataFrame.sample`` for
        every sampling step, including the final down-sampling.
    inverse_proportional : bool
        If True, rarer cell types get a larger share of ``target_cells``.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of ``adata_metadata``.

    Raises
    ------
    ValueError
        If ``target_cells`` is negative.
    """
    # DataFrame.sample only accepts an integer n; truncate the same way the
    # per-cell-type targets are truncated below
    target_cells = int(target_cells)
    if target_cells < 0:
        raise ValueError("target_cells must be non-negative")

    # determine number of cells to sample per cell type
    total_cells = len(adata_metadata)
    cell_type_counts = adata_metadata[cell_type_key].value_counts()
    # a categorical column reports unused categories with a count of 0, which
    # would give 1/0 below and cell types without any donor; drop them
    cell_type_counts = cell_type_counts[cell_type_counts > 0]

    if inverse_proportional:
        # weight each cell type by the inverse of its abundance, normalised to sum to 1
        inverse_counts = 1 / cell_type_counts
        cell_type_weights = inverse_counts / inverse_counts.sum()
    else:
        cell_type_weights = cell_type_counts / total_cells
    cell_type_target_counts = (cell_type_weights * target_cells).astype(int)

    # index labels of all sampled cells
    sampled_indices = []

    for cell_type, count in cell_type_target_counts.items():
        cell_type_df = adata_metadata[adata_metadata[cell_type_key] == cell_type]
        # missing donors can never be matched by ``==`` below, so do not
        # count them when splitting the target
        donors = cell_type_df[donor_key].dropna().unique()
        if len(donors) == 0:
            continue

        # roughly equal sampling per donor, with a minimum of 1 cell per donor
        cells_per_donor = max(int(count) // len(donors), 1)
        for donor in donors:
            donor_df = cell_type_df[cell_type_df[donor_key] == donor]
            # a donor may have fewer cells of this type than its quota
            sample_count = min(len(donor_df), cells_per_donor)

            # randomly sample without replacement
            sampled_indices.extend(
                donor_df.sample(n=sample_count, replace=False, random_state=random_state).index
            )

    sampled_metadata = adata_metadata.loc[sampled_indices]

    # the 1-cell-per-donor minimum can overshoot the target; trim reproducibly
    if len(sampled_metadata) > target_cells:
        sampled_metadata = sampled_metadata.sample(n=target_cells, replace=False,
                                                   random_state=random_state)

    return sampled_metadata

### Subsample to about 5% of the size of the original adata, proportionally to the cell type proportions in the overall adata object (`inverse_proportional=False`)

In [None]:
%%time
# subsample adata.obs to ~5% of the cells, using the random seed 42
# target_cells is a number of cells, so truncate the 5% fraction to an integer
# inverse_proportional=False -> each cell type is sampled proportionally to its abundance
subsampled_metadata = sample_cells_proportionally_by_donor_id_and_cell_type(adata_metadata = adata.obs,
                                                                            donor_key = "donor_id",
                                                                            cell_type_key = "final_cell_type",
                                                                            target_cells = int(adata.obs.shape[0] * 0.05),
                                                                            random_state = 42,
                                                                            inverse_proportional=False).copy()

In [None]:
# (number of sampled cells, number of metadata columns)
subsampled_metadata.shape

In [None]:
%%time
# Select the sampled cells (all genes) by their obs index labels, then
# copy so the result is an independent object rather than a view of adata
sampled_view = adata[subsampled_metadata.index, :]
subsampled_adata = sampled_view.copy()
subsampled_adata

In [None]:
# Number of sampled cells per donor
Counter(subsampled_adata.obs["donor_id"])

In [None]:
# Number of sampled cells per v2_scvi_cell_type label
# NOTE(review): the sampling call and the UMAP use "final_cell_type" — confirm
# that "v2_scvi_cell_type" is the column intended here
Counter(subsampled_adata.obs["v2_scvi_cell_type"])

In [None]:
# UMAP of the subsampled cells, coloured by final cell type with labels drawn on the plot
sc.pl.umap(
    subsampled_adata,
    color="final_cell_type",
    legend_loc="on data",
)

### Save the subsampled adata

In [None]:
# Persist the subsampled AnnData object as an .h5ad file
subsampled_adata.write_h5ad("07_subsampled_adata.h5ad")