# Differential gene expression baseline

## Motivation

We want to add an additional view and an additional baseline to our feature attribution selection by determining genes that define clusters. In other words, we want to find marker genes for clusters which ideally correspond to markers of cell types or other conditions.

We follow the marker gene selection recommendation of the best practices book, which is to use the Wilcoxon test.

In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import scanpy as sc
import pandas as pd

In [3]:
import os

# Directory that holds the input .h5ad files. The literal below is the original,
# machine-specific location and remains the default so existing runs are unchanged;
# set the DATASETS_PATH environment variable to run the notebook against another directory.
datasets_path = os.environ.get("DATASETS_PATH", "/home/lukas/phd/year_2/datasets")

## Norman19

1. Load data
2. Find cluster labels
3. Calculate Wilcoxon tests with the references "rest" and "control"
4. Save the output as a genes x perturbations table whose values are the p-values

In [4]:
# Read the Norman19 AnnData file from the datasets directory.
norman_h5ad = f"{datasets_path}/scgen_norman19.h5ad"
norman = sc.read(norman_h5ad)

### Wilcoxon of perturbation vs all other perturbations

In [None]:
# Label of the comparison group for this section; it names the result key and the
# output CSV file in the following cells.
reference = "rest"

In [5]:
# Wilcoxon test of every perturbation against the group named by `reference`
# ("rest" means all remaining cells), computed on the "counts" layer.
# The comparison group is read from `reference` instead of repeating the literal, so
# the test that is run and the key the results are stored under cannot disagree.
sc.tl.rank_genes_groups(
    norman,
    groupby="perturbation_name",
    layer="counts",
    reference=reference,
    method="wilcoxon",
    key_added=f"wilcoxon_vs_{reference}",
)



In [6]:
# Assemble a genes x perturbations table of Wilcoxon p-values.
rest_key = f"wilcoxon_vs_{reference}"
rest_pvals = {
    name: sc.get.rank_genes_groups_df(norman, group=name, key=rest_key).set_index("names")["pvals"]
    for name in norman.obs.perturbation_name.cat.categories
}
# Passing the gene index makes pandas align every per-perturbation Series to var_names.
norman_rest_df = pd.DataFrame(rest_pvals, index=list(norman.var_names))

In [7]:
# Write the p-value table as a comma-separated file into the working directory.
rest_csv = f"norman_DE_{reference}.csv"
norman_rest_df.to_csv(rest_csv, sep=",")

### Wilcoxon of perturbation vs control

In [11]:
# Label of the comparison group for this section; it names the result key and the
# output CSV file in the following cells.
reference = "control"

In [12]:
sc.tl.rank_genes_groups(
    norman,
    groupby="perturbation_name",
    layer="counts",
    reference="rest",
    method="wilcoxon",
    key_added=f"wilcoxon_vs_{reference}",
)



In [13]:
norman_control_df = pd.DataFrame(
    index=list(norman.var_names), columns=list(norman.obs.perturbation_name.cat.categories)
)

for perturbation in norman.obs.perturbation_name.cat.categories:
    perturbation_de_result = sc.get.rank_genes_groups_df(norman, group=perturbation, key=f"wilcoxon_vs_{reference}")

    norman_control_df[perturbation] = perturbation_de_result.set_index("names").pvals

In [14]:
# Write the p-value table as a comma-separated file into the working directory.
control_csv = f"norman_DE_{reference}.csv"
norman_control_df.to_csv(control_csv, sep=",")

## HLCA

In [16]:
# Read the HLCA AnnData file from the datasets directory.
hlca_h5ad = f"{datasets_path}/hlca.h5ad"
hlca = sc.read(hlca_h5ad)

In [22]:
# Display the names of the annotation columns stored in hlca.obs.
hlca.obs.columns

Index(['sample', 'original_celltype_ann', 'study_long', 'study',
       'last_author_PI', 'subject_ID', 'subject_ID_as_published',
       'pre_or_postnatal', 'age_in_years', 'age_range', 'sex', 'ethnicity',
       'mixed_ethnicity', 'smoking_status', 'smoking_history', 'BMI',
       'known_lung_disease', 'condition', 'subject_type', 'cause_of_death',
       'sample_type', 'anatomical_region_coarse', 'anatomical_region_detailed',
       'tissue_dissociation_protocol', 'cells_or_nuclei',
       'single_cell_platform', '3'_or_5'', 'enrichment', 'sequencing_platform',
       'reference_genome_coarse', 'ensembl_release_reference_genome',
       'cell_ranger_version', 'disease_status', 'fresh_or_frozen', 'cultured',
       'cell_viability_%', 'comments', 'Processing_site', 'dataset',
       'anatomical_region_level_1', 'anatomical_region_level_2',
       'anatomical_region_level_3', 'anatomical_region_highest_res', 'age',
       'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4',
   

In [25]:
# Wilcoxon test of each "manual_ann" group against all remaining cells, computed on
# the "counts" layer; results are stored under the "wilcoxon_cell_types" key.
sc.tl.rank_genes_groups(
    hlca,
    groupby="manual_ann",
    layer="counts",
    reference="rest",
    method="wilcoxon",
    key_added="wilcoxon_cell_types",
)



In [27]:
# Assemble a genes x cell types table of Wilcoxon p-values.
hlca_pvals = {
    label: sc.get.rank_genes_groups_df(hlca, group=label, key="wilcoxon_cell_types").set_index("names")["pvals"]
    for label in hlca.obs.manual_ann.cat.categories
}
# Passing the gene index makes pandas align every per-cell-type Series to var_names.
hlca_df = pd.DataFrame(hlca_pvals, index=list(hlca.var_names))

In [28]:
# Write the cell-type p-value table as a comma-separated file into the working directory.
# The file name has no placeholders, so a plain string literal is used instead of an f-string.
hlca_df.to_csv("hlca_DE_cell_types.csv", sep=",")