In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
import anndata as ad

In [None]:
# Binary matrix exported from the MOFA run; the CSV column 'Unnamed: 0' becomes the index
# (presumably gene names on the rows — the next cell uses the row labels as the gene list).
binary_matrix = pd.read_csv('../MOFA/MOFA_15_Factors_6082HVG/binary_matrix_factors.csv', index_col='Unnamed: 0')
# Combined AnnData object that the later cells query (layers, var_names, uns).
adata = sc.read_h5ad('../MOFA/adata_combined_even_more_hvg.h5ad')

In [None]:
# The columns of the transposed matrix are simply its row labels, so read the
# index directly instead of materialising the transpose.
genes_mofa_factors = binary_matrix.index.tolist()

In [None]:
# Persist the MOFA gene list; the "Reload" section reads 'MOFA_genes.csv' back.
pd.DataFrame(genes_mofa_factors).to_csv('MOFA_genes.csv')

In [None]:
# Number of genes taken from the MOFA binary matrix.
len(genes_mofa_factors)

# DE genes across Technologies

In [None]:
# Rank genes between the 'Condition' groups using the 'log_norm' layer.
# The result is read back from adata.uns['rank_genes_groups'] in the cells below.
sc.tl.rank_genes_groups(adata, groupby='Condition', layer='log_norm')

In [None]:
# Dot plot of log fold changes for the top 100 ranked genes, grouped by 'Condition'.
sc.pl.rank_genes_groups_dotplot(adata, groupby='Condition', values_to_plot='logfoldchanges', n_genes=100)

In [None]:
# Ranked gene names as a table. NOTE(review): `df` is not used again before it is
# reassigned in the "DE Markers across Cell Types" section — confirm this cell is still needed.
df = pd.DataFrame(adata.uns['rank_genes_groups']['names'])

In [None]:
# Pull the ranking table of each technology group out of adata in one pass.
df_sc, df_sn = (
    sc.get.rank_genes_groups_df(adata, group=technology)
    for technology in ('scRNA-seq', 'snRNA-seq')
)

In [None]:
# Names of the genes whose log fold change is above 5 in each technology group.
de_sc = df_sc.loc[df_sc['logfoldchanges'] > 5, 'names'].tolist()
de_sn = df_sn.loc[df_sn['logfoldchanges'] > 5, 'names'].tolist()

In [None]:
# Technology-specific DE genes that also appear in the MOFA gene list
# (order of the DE lists is preserved).
sc_remove = [gene for gene in de_sc if gene in genes_mofa_factors]
sn_remove = [gene for gene in de_sn if gene in genes_mofa_factors]

In [None]:
# Single list of MOFA genes flagged in either technology: snRNA-seq hits first,
# then scRNA-seq hits.
genes_to_remove_techincal_variability = [*sn_remove, *sc_remove]

In [None]:
# Number of MOFA genes flagged as technology-driven.
len(genes_to_remove_techincal_variability)

In [None]:
# Dot plot of the flagged genes across the 'Level_1' groups, using the 'log_norm'
# layer with each gene scaled independently (standard_scale='var').
sc.pl.dotplot(adata, groupby='Level_1', var_names=genes_to_remove_techincal_variability,standard_scale='var', layer='log_norm' )

# Markers from Broad

In [None]:
# Marker table from Broad; the CSV column 'Unnamed: 0' is used as the index.
broad_markers = pd.read_csv('broad_markers.csv', index_col='Unnamed: 0')

In [None]:
# Display the marker table for a visual check.
broad_markers

In [None]:
# Flatten the marker table to strings once (the original computed this twice),
# drop the literal 'nan' strings that missing values turn into after astype(str),
# and de-duplicate. Sorting makes the order reproducible: the iteration order of a
# set of strings changes between interpreter sessions because of hash
# randomization, which previously made the CSV written from this list differ
# from run to run.
marker_values = broad_markers.values.flatten().astype(str)
broad_markers_list = sorted(set(marker_values[marker_values != 'nan']))

In [None]:
# Save the de-duplicated marker list to 'broad_markers_list.csv'.
pd.DataFrame(broad_markers_list).to_csv('broad_markers_list.csv')

In [None]:
# Broad markers that are absent from adata.var_names.
[marker for marker in broad_markers_list if marker not in adata.var_names]

# DE Markers across Cell Types

In [None]:
# Rank genes between the 'Level_1' groups using the 'log_norm' layer.
# The next cell reads this result from adata.uns['rank_genes_groups'], the same key
# the earlier 'Condition' ranking was read from.
sc.tl.rank_genes_groups(adata, groupby='Level_1', layer='log_norm')

In [None]:
# First 30 ranked gene names of every group.
df = pd.DataFrame(adata.uns['rank_genes_groups']['names']).iloc[:30]
# Count of distinct gene names across all of those top-30 lists.
print(len(set(df.to_numpy().ravel())))

In [None]:
# Keep only the top-ranked genes that are not already in genes_mofa_factors.
genes_to_check = [
    gene
    for gene in set(df.values.flatten())
    if gene not in genes_mofa_factors
]
print(len(genes_to_check))

In [None]:
# Preview the first rows of the top-ranked gene table.
df.head()

In [None]:
# Map every candidate gene to the columns of df (groups) in which it appears,
# keeping the column order of df.
gene_to_group = {}
for gene in genes_to_check:
    # True for each column that contains the gene at least once.
    column_has_gene = (df == gene).any(axis=0).to_numpy()
    gene_to_group[gene] = df.columns[column_has_gene].tolist()

In [None]:
# Spot check for one group: entries of the 'Adipocyte' column that are in genes_to_check.
df['Adipocyte'][df['Adipocyte'].isin(genes_to_check)].tolist()

In [None]:
# Inverse view of the mapping: for every column of df (group), the entries of that
# column that are in genes_to_check, in their ranked order.
gene_to_group_T = {
    group: df.loc[df[group].isin(genes_to_check), group].tolist()
    for group in df.columns
}

In [None]:
# One row per group, one column per position in that group's gene list.
# NOTE: this rebinds `df`, which until here held the top-ranked gene table; the
# cells above that read `df` give different results if re-run after this cell.
df = pd.DataFrame.from_dict(gene_to_group_T, orient='index')

In [None]:
# Unique genes across all groups. Rows of df have different lengths and are padded
# with missing values; pd.notna filters the padding whether it is None or NaN
# (the original `is not None` test would let NaN through).
# Sorting makes the order reproducible — plain set iteration order for strings
# changes between interpreter sessions, and this list is written to CSV below.
de_genes = sorted({val for val in df.values.flatten() if pd.notna(val)})
print(type(de_genes))

In [None]:
# Single-column table of the DE genes, with the column named 'Gene'.
df_genes = pd.DataFrame({'Gene': de_genes})

In [None]:
# Move the current index into a regular 'index' column and renumber the rows.
df_genes = df_genes.reset_index()

In [None]:
# Show the column dtypes of the table about to be written.
print(df_genes.dtypes)

In [None]:
# Write the DE gene table as CSV; the "Reload" section reads this file back.
# NOTE(review): the frame already carries an 'index' column from reset_index, and
# to_csv is called with its default index setting — confirm both index columns are wanted.
df_genes.to_csv('de_genes_to_be_added.csv')

In [None]:
# Counter is imported here, at first use, rather than in the import cell at the top.
from collections import Counter
# Tally how often each group label occurs across the per-gene group lists; any
# padding values introduced by from_dict for shorter lists would be counted too.
Counter(pd.DataFrame.from_dict(gene_to_group, orient='index').values.flatten())

# Check Genes

In [None]:
# Concatenate the three gene sources (duplicates are kept at this stage).
all_genes = [*genes_mofa_factors, *broad_markers_list, *de_genes]

In [None]:
# Number of distinct selected genes that are absent from adata.var_names.
sum(gene not in adata.var_names for gene in set(all_genes))

In [None]:
# The selected genes that are absent from adata.var_names (duplicates included).
[gene for gene in all_genes if gene not in adata.var_names]

# Reload

In [None]:
# Reload the gene lists that earlier cells of this notebook wrote to disk.
mofa_genes_df = pd.read_csv('MOFA_genes.csv')
broad_markers = pd.read_csv('broad_markers.csv', index_col='Unnamed: 0')
# Flatten to strings once, drop the literal 'nan' strings left by missing values,
# de-duplicate, and sort so the order does not change between sessions.
marker_values = broad_markers.values.flatten().astype(str)
broad_markers_list = sorted(set(marker_values[marker_values != 'nan']))
# 'de_genes_to_be_added.csv' is written with DataFrame.to_csv, so it is a CSV and
# not a pickle: read_pickle fails on it, read_csv is the matching reader.
de_genes_df = pd.read_csv('de_genes_to_be_added.csv')
# Both files carry saved index columns next to the gene names. Flattening the whole
# frame (as before) mixed those integers into the gene lists, so select only the
# gene-name column: the last column of MOFA_genes.csv and 'Gene' for the DE genes.
mofa_genes = mofa_genes_df.iloc[:, -1].tolist()
de_genes = de_genes_df['Gene'].tolist()

In [None]:
# Concatenate the three gene sources (duplicates are kept).
# NOTE(review): this uses the in-memory genes_mofa_factors, not the mofa_genes list
# loaded from 'MOFA_genes.csv' in the cell above — confirm which one is intended.
all_genes = [*genes_mofa_factors, *broad_markers_list, *de_genes]

# Xenium Panel

In [None]:
# Xenium panel table; the cells below read its 'Gene' column.
xenium = pd.read_csv('pdac_xenium_panel.csv')

In [None]:
# Distinct entries of the panel's 'Gene' column.
xenium_genes = list(set(xenium['Gene'].tolist()))

In [None]:
# Number of distinct genes in the Xenium panel.
len(xenium_genes)

In [None]:
# Number of Xenium panel genes that are not in all_genes.
sum(gene not in all_genes for gene in xenium_genes)

In [None]:
# Xenium panel genes that exist in adata.var_names.
valid_genes = [gene for gene in xenium_genes if gene in adata.var_names]

In [None]:
# Of those, the genes that are not in all_genes yet.
genes_to_plot = [gene for gene in valid_genes if gene not in all_genes]

In [None]:
# Number of Xenium genes that are present in adata.var_names but NOT in the
# selected all_genes list (genes_to_plot is built with `not in all_genes`).
len(genes_to_plot)

In [None]:
# Dot plot of those Xenium-only genes across the 'Level_1' groups, using the
# 'log_norm' layer with each gene scaled independently (standard_scale='var').
sc.pl.dotplot(adata, groupby='Level_1', var_names=genes_to_plot, layer='log_norm', standard_scale='var')

# Create final list

In [None]:
# De-duplicated union of the four gene sources: reloaded MOFA genes, Broad
# markers, DE genes and the Xenium panel.
all_genes = list({*mofa_genes, *broad_markers_list, *de_genes, *xenium_genes})

In [None]:
# Size of the union before restricting it to genes present in adata.
len(all_genes)

In [None]:
# Same statement as two cells above; re-running it resets all_genes to the full,
# unfiltered union of the four sources.
all_genes = list(set(mofa_genes + broad_markers_list + de_genes + xenium_genes))

In [None]:
# Final list: keep only the genes that exist in adata.var_names.
all_genes = [gene for gene in all_genes if gene in adata.var_names]