# 2024-03-11-Demo: Creating a prediction dataframe

This notebook demonstrates how to build a prediction dataframe and save it to disk as a CSV file for use with the `predict.py` script.

In [None]:
# List the contents of the local PerturBench data directory (IPython shell escape).
# NOTE: this path is hard-coded here and duplicates `data_cache_dir`, which is
# only defined in a later cell -- keep the two in sync if you change either.
!ls ../data/perturbench_data

In [None]:
import scanpy as sc
import pandas as pd

In [2]:
# Directory containing the PerturBench data files.
# Change this to your local data directory.
data_cache_dir = "../data/perturbench_data"

In [4]:
# Open the processed Srivatsan20 dataset in backed read-only mode
# (backed='r') and display its summary as the cell output.
srivatsan20_path = f'{data_cache_dir}/srivatsan20_processed.h5ad'
adata = sc.read_h5ad(srivatsan20_path, backed='r')
adata

AnnData object with n_obs × n_vars = 183856 × 9198 backed at '../neurips2024/perturbench_data/srivatsan20_processed.h5ad'
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID', 'dataset', 'cell_type', 'treatment', 'condition', 'dose', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'ensembl_id', 'ncounts', 'ncells', 'gene_symbol', 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highl

In [5]:
# Show the distinct cell types present in the observation metadata.
adata.obs['cell_type'].unique()

['mcf7', 'k562', 'a549']
Categories (3, object): ['a549', 'k562', 'mcf7']

In [6]:
# Collect every distinct perturbation label except the 'control' label,
# then display how many there are.
all_perturbations = adata.obs['perturbation'].unique()
unique_perturbations = [
    label for label in all_perturbations if label != 'control'
]
len(unique_perturbations)

188

In [7]:
# Build one row per non-control perturbation, all paired with a single
# cell line; the scalar cell type is broadcast to every row by pandas.
#
# NOTE(review): `unique_perturbations` is built from obs['perturbation'] in an
# earlier cell but stored here under the 'condition' column, and adata.obs also
# carries a separate 'condition' column -- TODO confirm both use the same labels.
target_cell_type = 'k562'
prediction_columns = {
    'condition': unique_perturbations,
    'cell_type': target_cell_type,
}
prediction_df = pd.DataFrame(prediction_columns)
prediction_df.head()

Unnamed: 0,condition,cell_type
0,TAK-901,k562
1,Busulfan,k562
2,BMS-536924,k562
3,Enzastaurin (LY317615),k562
4,BMS-911543,k562


In [8]:
# Write the prediction dataframe to the data directory as CSV, without the
# row index column.
prediction_csv_path = f'{data_cache_dir}/prediction_dataframe.csv'
prediction_df.to_csv(prediction_csv_path, index=False)