In [2]:
import diffxpy.api as de
import gseapy as gp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scvi
import seaborn as sns
from scipy import stats

  self.seed = seed
  self.dl_pin_memory_gpu_training = (
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the integrated AnnData object from disk, then list the distinct values
# of the `Sample` column in its obs table.
integrated_h5ad = '../data/processed_data/integrated_data.h5ad'
adata = sc.read_h5ad(integrated_h5ad)
adata.obs['Sample'].unique().tolist()

In [None]:
def map_condition(x):
    """Return the condition label for a sample name.

    Parameters
    ----------
    x : str
        Sample name; the check is a case-sensitive substring test.

    Returns
    -------
    str
        'COVID19' when the substring 'cov' occurs in ``x``, otherwise 'control'.
    """
    return 'COVID19' if 'cov' in x else 'control'

In [None]:
# Derive a per-cell `condition` label from each cell's sample name and store it in obs.
sample_names = adata.obs['Sample']
adata.obs['condition'] = sample_names.map(map_condition)
adata.obs

In [None]:
# Per-sample cell totals: the non-null count of the `doublet` column within each
# sample is used as the number of cells, stored as {sample: count}.
per_sample_counts = adata.obs.groupby(['Sample']).count()
num_tot_cells = dict(zip(per_sample_counts.index, per_sample_counts['doublet']))
num_tot_cells

In [None]:
# Count cells for every (Sample, condition, cell type) combination.
grouping_keys = ['Sample', 'condition', 'cell type']
cell_type_counts = adata.obs.groupby(grouping_keys).count()

# Drop combinations whose counts sum to zero, then turn the grouping keys back into columns.
non_empty = cell_type_counts.sum(axis=1) > 0
cell_type_counts = cell_type_counts[non_empty].reset_index()

# Keep only the first four columns: the three keys plus the first count column
# (the lines below read that count column as `doublet`).
leading_columns = cell_type_counts.columns[:4]
cell_type_counts = cell_type_counts[leading_columns]

# Express each count as a fraction of the total number of cells in its sample.
cell_type_counts['total_cells'] = cell_type_counts['Sample'].map(num_tot_cells).astype(int)
cell_type_counts['frequency'] = cell_type_counts['doublet'] / cell_type_counts['total_cells']
cell_type_counts

In [None]:
# Box plot of per-sample cell-type frequency, split by condition.
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(data=cell_type_counts, x='cell type', y='frequency', hue='condition', ax=ax)
# Tilt the x tick labels and anchor them on their right end.
plt.setp(ax.get_xticklabels(), rotation=35, rotation_mode='anchor', ha='right')
plt.show()

## Differential expression analysis

In [None]:
# Keep only the cells labelled 'AT1' or 'AT2' and work on an independent copy.
# NOTE(review): the frequency cells above group by the obs column 'cell type'
# (with a space) while this cell and the ones below use 'cell_type' — confirm
# which column name actually exists in adata.obs.
is_at1_or_at2 = adata.obs['cell_type'].isin(['AT1', 'AT2'])
subset = adata[is_at1_or_at2].copy()

# Replace X with the array returned by its .toarray() method
# (assumes X is stored as a sparse matrix — TODO confirm).
subset.X = subset.X.toarray()

# Number of rows in subset.var at this point.
len(subset.var)

In [None]:
# Filter `subset` in place with scanpy's gene filter (min_cells threshold of 100),
# then report how many variables remain.
min_cells_per_gene = 100
sc.pp.filter_genes(subset, min_cells=min_cells_per_gene)
subset.n_vars

In [None]:
# Run diffxpy's Wald test on the subset, with `cell_type` as the model term and
# the factor under test.
res = de.test.wald(
    data=subset,
    formula_loc='~ 1 + cell_type',
    factor_loc_totest='cell_type',
)

In [None]:
# Tabulate the test results, ordered from highest to lowest log2 fold change,
# with a fresh 0..n-1 row index.
wald_summary = res.summary()
dedf = wald_summary.sort_values(by='log2fc', ascending=False).reset_index(drop=True)
dedf

In [None]:
# Sanity check on the fold-change direction: take the gene in the first row of
# dedf and compare its mean expression between the two groups.
most_up = dedf.iloc[0]['gene']
gene_col = np.flatnonzero(subset.var_names == most_up)[0]

at1_cells = subset[subset.obs['cell_type'] == 'AT1']
at2_cells = subset[subset.obs['cell_type'] == 'AT2']
at1_expr = at1_cells.X[:, gene_col]
at2_expr = at2_cells.X[:, gene_col]

print(f"{most_up} expression:")
print(f"AT1: {at1_expr.mean()}")
print(f"AT2: {at2_expr.mean()}")

In [None]:
# Flip the sign of log2fc and re-sort from highest to lowest.
# The table is rebuilt from `res.summary()` instead of negating `dedf` in place,
# so running this cell more than once cannot flip the sign back again.
# NOTE(review): which group a positive log2fc refers to after the flip depends on
# the mean-expression check in the previous cell — confirm against its output.
dedf = (
    res.summary()
    .assign(log2fc=lambda summary: -summary['log2fc'])
    .sort_values('log2fc', ascending=False)
    .reset_index(drop=True)
)
dedf

In [None]:
# Keep genes with q-value below 0.05 and an absolute log2 fold change above 0.5.
# The threshold is applied to |log2fc| so that genes changed in either direction
# are kept: the heatmap cell below takes rows from both ends of the table, the
# enrichment cell later selects the `log2fc > 0` part, and the scVI section uses
# the same absolute-value cut-off.
is_significant = dedf['qval'] < 0.05
is_large_effect = dedf['log2fc'].abs() > .5
dedf = dedf[is_significant & is_large_effect]
dedf

In [None]:
# Keep only rows whose `mean` column exceeds 0.15.
min_mean = 0.15
dedf = dedf.loc[dedf['mean'] > min_mean]
dedf

In [None]:
# Heatmap of the last 25 rows followed by the first 25 rows of dedf
# (top 25 up and down), with cells grouped by cell type.
last_25_genes = dedf.tail(25)['gene'].tolist()
first_25_genes = dedf.head(25)['gene'].tolist()
genes_to_show = last_25_genes + first_25_genes
sc.pl.heatmap(subset, genes_to_show, groupby='cell_type', swap_axes=True)

## Differential expression analysis with scVI

In [None]:
# Load the saved scVI model from 'model.model' and attach it to `adata`.
model = scvi.model.SCVI.load('model.model', adata)

# Compare the 'AT1' cells against the 'AT2' cells.
# idx1/idx2 are given as flat boolean masks with one entry per cell; wrapping a
# mask in a list would produce a 2-D (1 x n_cells) selection instead.
# NOTE(review): earlier cells also refer to an obs column named 'cell type'
# (with a space) — confirm that 'cell_type' exists in adata.obs.
at1_mask = (adata.obs['cell_type'] == 'AT1').to_numpy()
at2_mask = (adata.obs['cell_type'] == 'AT2').to_numpy()
scvi_de = model.differential_expression(
    idx1=at1_mask,
    idx2=at2_mask,
)

# Keep rows flagged in `is_de_fdr_0.05` whose absolute lfc_mean exceeds 0.5.
scvi_de = scvi_de[(scvi_de['is_de_fdr_0.05']) & (scvi_de['lfc_mean'].abs() > .5)]
# Order from lowest to highest lfc_mean.
scvi_de = scvi_de.sort_values('lfc_mean')
# Keep rows where raw_normalized_mean1 or raw_normalized_mean2 exceeds 0.5.
scvi_de = scvi_de[(scvi_de['raw_normalized_mean1'] > .5) | (scvi_de['raw_normalized_mean2'] > .5)]

In [None]:
# scvi_de was sorted by ascending lfc_mean, so the last 25 rows hold the highest
# values and the first 25 rows the lowest.
# The scVI result table carries the gene names in its index (it has no `gene`
# column), so the names are read from the index.
highest_lfc_genes = scvi_de[-25:].index.tolist()
lowest_lfc_genes = scvi_de[:25].index.tolist()
genes_to_show = highest_lfc_genes + lowest_lfc_genes

# NOTE(review): this plots from `subset`, which must contain every selected gene
# and a layer named 'scvi_normalized' — neither is established in the visible cells.
sc.pl.heatmap(subset, genes_to_show,
              groupby='cell_type',
              swap_axes=True,
              layer='scvi_normalized',
              log=True)

## Gene Ontology enrichment analysis

In [None]:
# List the gene-set library names known to gseapy (`gp` is imported in the
# notebook's import cell at the top).
gp.get_library_name()

In [None]:
# Enrichment analysis of the genes with positive log2fc in dedf.
# `gp.enrichr` is used because the call passes Enrichr library names and the
# `organism` argument, which belong to the Enrichr interface.
# The genes remaining in `subset` serve as the background set.
positive_lfc_genes = dedf[dedf['log2fc'] > 0]['gene'].tolist()
enr = gp.enrichr(gene_list=positive_lfc_genes,
                 gene_sets=['KEGG_2021_Human', 'GO_Biological_Process_2021'],
                 organism='human',
                 outdir=None,  # no output directory is given
                 background=subset.var_names.tolist(),
)
enr.results

In [None]:
# Violin plot of 'ETV5' in the 'AT2' cells, one violin per condition.
at2_only = subset[subset.obs['cell_type'] == 'AT2']
sc.pl.violin(at2_only, keys='ETV5', groupby='condition')

In [None]:
# Mann-Whitney U test on 'ETV5' values in the 'AT2' cells: COVID19 vs control.
at2_cells = subset[subset.obs['cell_type'] == 'AT2']
etv5_col = np.flatnonzero(at2_cells.var_names == 'ETV5')[0]

covid_values = at2_cells[at2_cells.obs['condition'] == 'COVID19'].X[:, etv5_col]
control_values = at2_cells[at2_cells.obs['condition'] == 'control'].X[:, etv5_col]

stats.mannwhitneyu(covid_values, control_values)

## Score gene signature (to be done)