In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import os
import sys
import scipy


def MovePlots(plotpattern, subplotdir, figdir=None):
    """Move saved figures whose name contains ``plotpattern`` into a sub-folder.

    Parameters
    ----------
    plotpattern : str
        Text to look for in the names of the entries of the figure directory.
        It is placed between two ``*`` wildcards, so glob wildcards inside it
        are still interpreted (as they were by the shell).
    subplotdir : str
        Name of the sub-folder, relative to the figure directory, that receives
        the matching entries. It is created when missing.
    figdir : str or path-like, optional
        Figure directory to work in. Defaults to ``sc.settings.figdir``.

    Notes
    -----
    The work is done with ``os``/``glob``/``shutil`` rather than by building
    ``mkdir``/``mv`` shell commands, so directory names containing spaces or
    shell metacharacters are treated literally and failures raise instead of
    being silently ignored.
    """
    import glob
    import shutil

    base = str(sc.settings.figdir if figdir is None else figdir)
    dest = os.path.join(base, subplotdir)
    os.makedirs(dest, exist_ok=True)

    # Only the directory part is escaped; the pattern itself stays a glob.
    pattern = os.path.join(glob.escape(base), '*' + plotpattern + '*')
    dest_abs = os.path.abspath(dest)
    for src in glob.glob(pattern):
        # The destination folder may itself match the pattern; never move it into itself.
        if os.path.abspath(src) == dest_abs:
            continue
        # Explicit target path so an existing file of the same name is replaced, like `mv` does.
        shutil.move(src, os.path.join(dest, os.path.basename(src)))


# Global scanpy configuration for the whole notebook.
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Every figure saved through scanpy's `save=` argument lands in this folder
# (it is also the folder MovePlots works in).
sc.settings.figdir = './genital_tubercle_figures_humans/'
# Record the package versions used for this run in the cell output.
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures

# Last expression of the cell: shows which Python interpreter the kernel runs on.
sys.executable

# 1. Load human data

In [None]:
path_to_data = '/nfs/team292/vl6/FetalReproductiveTract/'

In [None]:
human = sc.read(path_to_data + "human_genital_tubercle_mpw_orthologs_with_embedding.h5ad")
human

In [None]:
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, 
                         frameon=True, vector_friendly=True, fontsize=14, figsize=[7,7], color_map=None, 
                         format='pdf', facecolor=None, transparent=False,)

# 2. Import human neighbourhood assignment

In [None]:
human_nhoods = pd.read_csv(path_to_data + 'human_genital_tubercle_mpw_milo_15knn.csv', index_col = 0)
human_nhoods.shape

In [None]:
human_nhoods.head()

In [None]:
human_nhoods['celltype'] = human_nhoods.index.map(human.obs['human_celltype'].to_dict())

In [None]:
human_nhoods['celltype'].value_counts()

In [None]:
# Long format: one row per (cell, neighbourhood) pair, restricted to the pairs
# where the cell is a member of the neighbourhood (membership == 1).
human_nhoods_long = (
    human_nhoods
    .melt(id_vars=['celltype'], var_name='neighborhood', value_name='membership')
    .loc[lambda long_table: long_table['membership'] == 1]
)

# Number of member cells for every (neighbourhood, cell type) combination.
human_cell_counts = (
    human_nhoods_long
    .groupby(['neighborhood', 'celltype'])
    .size()
    .reset_index(name='count')
)

# Row label of the largest count within each neighbourhood.
human_top_rows = human_cell_counts.groupby('neighborhood')['count'].idxmax()

# Most abundant cell type per neighbourhood, indexed by neighbourhood.
human_most_abundant_celltype = (
    human_cell_counts
    .loc[human_top_rows, ['neighborhood', 'celltype']]
    .set_index('neighborhood')
)

In [None]:
human_most_abundant_celltype.shape

In [None]:
human_most_abundant_celltype.head()

# 3. Import mouse data 

In [None]:
mouse = sc.read(path_to_data + 'mouse_genital_tubercle_mpw_orthologs_with_embedding.h5ad')
mouse

# 4. Import mouse neighbourhood assignment

In [None]:
nhoods_mouse = pd.read_csv(path_to_data + "mouse_genital_tubercle_mpw_milo_15knn.csv", index_col = 0)
nhoods_mouse.shape

In [None]:
nhoods_mouse.head()

In [None]:
nhoods_mouse['celltype'] = nhoods_mouse.index.map(mouse.obs['mouse_celltype'].to_dict())

In [None]:
# Long format: one row per (cell, neighbourhood) pair, restricted to the pairs
# where the cell is a member of the neighbourhood (membership == 1).
nhoods_mouse_long = (
    nhoods_mouse
    .melt(id_vars=['celltype'], var_name='neighborhood', value_name='membership')
    .loc[lambda long_table: long_table['membership'] == 1]
)

# Number of member cells for every (neighbourhood, cell type) combination.
cell_counts_mouse = (
    nhoods_mouse_long
    .groupby(['neighborhood', 'celltype'])
    .size()
    .reset_index(name='count')
)

# Row label of the largest count within each neighbourhood.
mouse_top_rows = cell_counts_mouse.groupby('neighborhood')['count'].idxmax()

# Most abundant cell type per neighbourhood, indexed by neighbourhood.
most_abundant_celltype_mouse = (
    cell_counts_mouse
    .loc[mouse_top_rows, ['neighborhood', 'celltype']]
    .set_index('neighborhood')
)

In [None]:
most_abundant_celltype_mouse.shape

In [None]:
most_abundant_celltype_mouse.head()

# 5. Import neighbourhood matching

In [None]:
nhood_match = pd.read_csv('/nfs/team292/vl6/Mouse_RepTract/Amato2021/MNNMatch__spearman__mouse_genital_tubercle_mpw_milo_15knn__human_genital_tubercle_mpw_milo_15knn.csv')
nhood_match.shape

In [None]:
nhood_match.head()

In [None]:
# Look-up tables: neighbourhood id -> most abundant cell type of that neighbourhood.
mouse_nhood_to_celltype = most_abundant_celltype_mouse['celltype'].to_dict()
human_nhood_to_celltype = human_most_abundant_celltype['celltype'].to_dict()

# The ids are cast to str before the look-up; presumably the dictionary keys
# (taken from the membership tables' column names) are strings - verify.
mouse_nhood_ids = nhood_match['mouse_genital_tubercle_mpw_milo_15knn'].astype(str)
human_nhood_ids = nhood_match['human_genital_tubercle_mpw_milo_15knn'].astype(str)

nhood_match['mouse_celltype'] = mouse_nhood_ids.map(mouse_nhood_to_celltype)
nhood_match['human_celltype'] = human_nhood_ids.map(human_nhood_to_celltype)

In [None]:
nhood_match.head()

In [None]:
import pandas as pd
import plotly.graph_objects as go

In [None]:
from plotly.offline import iplot, init_notebook_mode

# Enable Plotly offline mode to display plots in the notebook
init_notebook_mode(connected=True)

In [None]:

# Interactive Sankey diagram: mouse cell types (sources) -> human cell types (targets),
# one link per row of nhood_match.

# Create a list of unique labels (cell types) from both annotations
# NOTE: mouse and human labels share ONE node list, so a label that occurs in both
# species is represented by a single node that acts as source and as target.
labels = pd.concat([nhood_match['mouse_celltype'], nhood_match['human_celltype']]).unique()

# Map labels to integers for source/target in Sankey
label_to_id = {label: idx for idx, label in enumerate(labels)}

# Create source, target, and value lists for Sankey diagram
sources = nhood_match['mouse_celltype'].map(label_to_id).tolist()
targets = nhood_match['human_celltype'].map(label_to_id).tolist()
values = [1] * len(nhood_match)  # Assuming each row (comparison) contributes a count of 1

# Build the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=30,  # Padding between nodes
        thickness=20,
        line=dict(color="black", width=1),
        label=list(labels),
    ),
    link=dict(
        source=sources,  # indices of source nodes
        target=targets,  # indices of target nodes
        value=values,  # magnitude of flow between nodes
        line=dict(color="lightgrey", width=0.5)
    )
)])

# Fixed canvas size and a title; fig.show() renders the figure inline.
fig.update_layout(title_text="Comparison of Cell Type Annotations", font_size=15,
                 width=800, height=600 )
fig.show()


In [None]:
import kaleido

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.io import write_image

# Same Sankey diagram as in the interactive cell, rebuilt with per-node colours and
# written to a PDF. labels / label_to_id / sources / targets / values are recomputed
# here exactly as before.
labels = pd.concat([nhood_match['mouse_celltype'], nhood_match['human_celltype']]).unique()
label_to_id = {label: idx for idx, label in enumerate(labels)}

# Create source, target, and value lists for Sankey diagram
sources = nhood_match['mouse_celltype'].map(label_to_id).tolist()
targets = nhood_match['human_celltype'].map(label_to_id).tolist()
values = [1] * len(nhood_match)  # Assuming each row (comparison) contributes a count of 1

# Define colors for the nodes
# Keys are cell-type labels. Both 'Corpus Cavernsoum' and 'Corpus Cavernosum' are
# listed with the same colour; NOTE(review): presumably one of the annotations carries
# the misspelt label - confirm before "fixing" the key.
color_map = {
    'Epithelium' : 'mediumorchid', 'Distal Dorsal Glanular Mesenchyme' : 'navy', 
    'Ventral Glanular Mesenchyme' : 'cornflowerblue', 
    'Early Proximal Glanular Mesenchyme' : 'deepskyblue', 'Late Proximal Glanular Mesenchyme' : 'darkturquoise', 
    'Corpus Cavernsoum' : 'forestgreen', 'Preputial Mesenchyme' : 'yellowgreen',
       'Sub-Dermal Prepuce' : 'lightseagreen','Urethral Plate Basal' : 'purple', 'Urethral Plate Uroplakins' : 'palevioletred', 
    'Genital Epidermis' : 'mediumorchid', 'Preputial Lamina' : 'pink',
     'Glans' : 'navy', 'Corpus Spongiosum' : 'deepskyblue', 
   
    'Corpus Cavernosum' : 'forestgreen', 
     'Prepuce' : 'yellowgreen', 'Subdermal Prepuce' : 'lightseagreen', 'Labio-Scrotal Swelling' : 'teal'
}

# Use the color map to assign colors to each label
node_colors = [color_map.get(label, '#cccccc') for label in labels]  # Default color if not specified

# Build the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=30,  # Padding between nodes
        thickness=20,
        line=dict(color="black", width=0.5),
        label=list(labels),
        color=node_colors  # Apply color to nodes
    ),
    link=dict(
        source=sources,  # indices of source nodes
        target=targets,  # indices of target nodes
        value=values,  # magnitude of flow between nodes
        color='gainsboro'  # Color of the links
    )
)])

fig.update_layout(font_size=15, width=800, height=600)

# Save the figure as a PDF
# The relative path means the file is written to the current working directory,
# not to sc.settings.figdir.
fig.write_image("sankey_diagram.pdf")


In [None]:
nhood_match.tail()

In [None]:
nhood_match['nhood_matched'] = range(1, len(nhood_match) + 1)

In [None]:
# Turn the running number of each match into a label of the form 'nhood_<n>'.
nhood_match['nhood_matched'] = 'nhood_' + nhood_match['nhood_matched'].astype(str)

In [None]:
nhood_match.head()

In [None]:
nhood_match['combined_celltype'] = nhood_match['mouse_celltype'] + ' --- ' + nhood_match['human_celltype']

In [None]:
nhood_match.head()

In [None]:
nhood_match['combined_celltype'].value_counts()

In [None]:
# Keep only the matches whose mouse side is 'Early Proximal Glanular Mesenchyme' and
# whose human side is 'Corpus Spongiosum'.
# NOTE(review): this rebinds `nhood_match` itself, so every later cell that reads
# `nhood_match` only sees this single pair; other pairs are no longer available
# unless the matching CSV is loaded again.
nhood_match = nhood_match[nhood_match['combined_celltype'] == 'Early Proximal Glanular Mesenchyme --- Corpus Spongiosum']
nhood_match

In [None]:
mouse_nhood_match = nhood_match['mouse_genital_tubercle_mpw_milo_15knn'].to_list()

In [None]:
len(mouse_nhood_match)

In [None]:
len(np.unique(mouse_nhood_match))

In [None]:
mouse_nhood_match = list(np.unique(mouse_nhood_match))

In [None]:
mouse_nhood_match_str = [str(i) for i in mouse_nhood_match]

In [None]:
nhoods_mouse_filtered = nhoods_mouse[mouse_nhood_match_str]

In [None]:
nhoods_mouse_filtered = nhoods_mouse_filtered[nhoods_mouse_filtered.any(axis=1)]

In [None]:
nhoods_mouse_filtered

In [None]:
# Count the occurrences of each value in 'combined_celltype'
value_counts = nhood_match['combined_celltype'].value_counts()

# Keep the combined cell types that occur at least 100 times
values_to_keep = value_counts[value_counts >= 100].index

# Filter the DataFrame to only include rows whose 'combined_celltype' value appears 100 or more times
nhood_match_filtered = nhood_match[nhood_match['combined_celltype'].isin(values_to_keep)]

In [None]:
nhood_match.shape, nhood_match_filtered.shape

In [None]:
len(np.unique(nhood_match_filtered['combined_celltype']))

In [None]:
nhood_match_filtered

In [None]:
# Cast both neighbourhood-id columns to str so they can be used as keys for the
# column renaming further down.
# `astype` returns a new frame, so nothing is written into `nhood_match_filtered`
# while it is still a filtered slice of `nhood_match`; assigning columns on such a
# slice can raise pandas' SettingWithCopyWarning.
nhood_match_filtered = nhood_match_filtered.astype({
    'human_genital_tubercle_mpw_milo_15knn': str,
    'mouse_genital_tubercle_mpw_milo_15knn': str,
})

In [None]:
# Original neighbourhood id -> matched-neighbourhood label, one dictionary per species.
# When an id occurs in several matches the last label wins.
mapping_human = dict(zip(
    nhood_match_filtered['human_genital_tubercle_mpw_milo_15knn'],
    nhood_match_filtered['nhood_matched'],
))
mapping_mouse = dict(zip(
    nhood_match_filtered['mouse_genital_tubercle_mpw_milo_15knn'],
    nhood_match_filtered['nhood_matched'],
))

In [None]:
# Rename columns based on mapping
nhoods_mouse_renamed = nhoods_mouse.rename(columns=mapping_mouse)
human_nhoods_renamed = human_nhoods.rename(columns=mapping_human)


In [None]:
nhoods_mouse.shape

In [None]:
# Keep only the columns that were renamed to a matched-neighbourhood label
# ('nhood...'); every other column is dropped.
# A single boolean selection replaces the per-column `del` inside a loop over the
# columns, which rebuilt the frame once per removed column and failed on duplicate labels.
mouse_is_matched = [c.startswith('nhood') for c in nhoods_mouse_renamed.columns]

# Same output as before: one line per retained column.
for c in nhoods_mouse_renamed.columns[mouse_is_matched]:
    print(c)

nhoods_mouse_renamed = nhoods_mouse_renamed.loc[:, mouse_is_matched]

In [None]:
nhoods_mouse_renamed.shape

In [None]:
# Keep only the columns that were renamed to a matched-neighbourhood label
# ('nhood...'); every other column is dropped.
# A single boolean selection replaces the per-column `del` inside a loop over the
# columns, which rebuilt the frame once per removed column and failed on duplicate labels.
human_is_matched = [c.startswith('nhood') for c in human_nhoods_renamed.columns]

# Same output as before: one line per retained column.
for c in human_nhoods_renamed.columns[human_is_matched]:
    print(c)

human_nhoods_renamed = human_nhoods_renamed.loc[:, human_is_matched]

In [None]:
human_nhoods_renamed.shape

In [None]:
# Combine DataFrames
combined_df = pd.concat([human_nhoods_renamed, nhoods_mouse_renamed], ignore_index=False)

In [None]:
combined_df['species'] = ['human' if i.startswith('HD_F') else 'mouse' for i in combined_df.index]

In [None]:
combined_df['species'].value_counts()

In [None]:
combined_df.head()

In [None]:
# One row per (cell, matched neighbourhood) pair, carrying the cell's species.
long_df = pd.melt(combined_df, id_vars='species', var_name='nhood', value_name='assignment')

# Rows where the cell actually belongs to the neighbourhood.
assigned_df = long_df.loc[long_df['assignment'].eq(1)]

# Number of member cells per neighbourhood, split by species.
counts = (
    assigned_df
    .groupby(['nhood', 'species'])
    .size()
    .rename('count')
    .reset_index()
)

print(counts)

In [None]:
# Number of matched neighbourhoods each cell belongs to.
# 'nhoods_counts' itself is excluded from the sum (errors='ignore' covers the first
# run, when the column does not exist yet); otherwise re-running this cell would add
# the previous result into the new total.
combined_df['nhoods_counts'] = (
    combined_df
    .drop(columns=['species', 'nhoods_counts'], errors='ignore')
    .sum(axis=1)
)

print(combined_df[['species', 'nhoods_counts']])

### Convert cell-to-neighbourhood membership into cell-to-cell-type membership 

In [None]:
combined_df_save = combined_df.drop(columns=['species', 'nhoods_counts'])
combined_df_save.shape

In [None]:
#combined_df_save.to_csv('/nfs/team292/vl6/FetalReproductiveTract/genital_tubercle_human_mouse_nhoods_50knn.csv', index=True)

In [None]:
neighborhood_to_cell_type = dict(zip(nhood_match['nhood_matched'], nhood_match['combined_celltype']))

In [None]:
neighborhood_to_cell_type

In [None]:
# Vectorized mapping from neighborhood to cell type
mapped_cell_types = combined_df_save.columns.map(neighborhood_to_cell_type.get)

In [None]:
# Create an all-zero DataFrame with the same index as combined_df_save and one column per unique mapped cell type
df_cell_type_membership = pd.DataFrame(0, index=combined_df_save.index, columns=pd.unique(mapped_cell_types))

In [None]:
df_cell_type_membership

In [None]:
# For each cell type, aggregate neighborhood memberships into cell type memberships.
# neighborhood_to_cell_type is built from every row of nhood_match, whereas the
# columns of combined_df_save only contain the labels that survived the filtering
# and the dictionary-based renaming. Labels without a column are skipped so that the
# column selection below cannot raise a KeyError.
available_nhoods = set(combined_df_save.columns)

for cell_type in df_cell_type_membership.columns:
    # Identify neighborhoods that map to the current cell type and exist as columns
    neighborhoods = [
        nhood
        for nhood, label in neighborhood_to_cell_type.items()
        if label == cell_type and nhood in available_nhoods
    ]
    # Sum the columns for these neighborhoods and check if >0 (indicating membership)
    df_cell_type_membership[cell_type] = combined_df_save[neighborhoods].sum(axis=1) > 0

# Convert boolean True/False to integers 1/0
df_cell_type_membership = df_cell_type_membership.astype(int)

In [None]:
df_cell_type_membership['species'] = ['human' if i.startswith('HD_F') else 'mouse' for i in df_cell_type_membership.index]
df_cell_type_membership

In [None]:
# Species label per cell: barcodes starting with 'HD_F' are treated as human, all others as mouse.
df_cell_type_membership['species'] = np.where(
    df_cell_type_membership.index.str.startswith('HD_F'), 'human', 'mouse'
)

# One row per (cell, cell type) pair, carrying the cell's species.
long_df_cells = pd.melt(
    df_cell_type_membership,
    id_vars='species',
    var_name='celltype',
    value_name='assignment',
)

# Rows where the cell is a member of the cell type.
assigned_df_cells = long_df_cells.loc[long_df_cells['assignment'].eq(1)]

# Number of member cells per cell type, split by species.
counts_cells = (
    assigned_df_cells
    .groupby(['celltype', 'species'])
    .size()
    .rename('count')
    .reset_index()
)


In [None]:
# Number of matched cell types each cell belongs to.
# 'celltype_counts' itself is excluded from the sum (errors='ignore' covers the first
# run, when the column does not exist yet); otherwise re-running this cell would add
# the previous result into the new total.
df_cell_type_membership['celltype_counts'] = (
    df_cell_type_membership
    .drop(columns=['species', 'celltype_counts'], errors='ignore')
    .sum(axis=1)
)


In [None]:
df_cell_type_membership['celltype_counts'].value_counts()

In [None]:
df_cell_type_membership.head()

In [None]:
# Add info to each dataset 
human.obs['belongs_to_matched_nhoods'] = human.obs_names.map(combined_df['nhoods_counts'].to_dict())
human.obs['belongs_to_n_celltypes'] = human.obs_names.map(df_cell_type_membership['celltype_counts'].to_dict())

In [None]:
human.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'] = human.obs_names.map(df_cell_type_membership['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'].to_dict())


In [None]:
# 1 for cells whose matched-neighbourhood count is exactly 0, otherwise 0
# (a missing count is not equal to 0 and therefore also yields 0).
human.obs['belongs_to_0_matched_nhoods'] = (human.obs['belongs_to_matched_nhoods'] == 0).astype(int)


In [None]:
sc.pl.umap(human, color = ['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'], color_map = 'OrRd')

In [None]:
# Copy the per-cell membership flags of two matched cell-type pairs onto the mouse
# object, looked up by cell name.
# NOTE(review): the 'Preputial Mesenchyme --- Corpus Spongiosum' column only exists if
# that pair was still present in `nhood_match` when df_cell_type_membership was built;
# an earlier cell restricts `nhood_match` to the 'Early Proximal ...' pair - confirm
# this line works on a fresh kernel.
mouse.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'] = mouse.obs_names.map(df_cell_type_membership['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'].to_dict())
mouse.obs['Preputial Mesenchyme --- Corpus Spongiosum'] = mouse.obs_names.map(df_cell_type_membership['Preputial Mesenchyme --- Corpus Spongiosum'].to_dict())



In [None]:
# Per-cell look-up tables built from the combined human + mouse frames.
nhood_count_by_cell = combined_df['nhoods_counts'].to_dict()
celltype_count_by_cell = df_cell_type_membership['celltype_counts'].to_dict()

# Number of matched neighbourhoods / matched cell types every mouse cell belongs to.
mouse.obs['belongs_to_matched_nhoods'] = mouse.obs_names.map(nhood_count_by_cell)
mouse.obs['belongs_to_n_celltypes'] = mouse.obs_names.map(celltype_count_by_cell)

# 1 for cells whose matched-neighbourhood count is exactly 0, otherwise 0
# (a missing count is not equal to 0 and therefore also yields 0).
mouse.obs['belongs_to_0_matched_nhoods'] = (mouse.obs['belongs_to_matched_nhoods'] == 0).astype(int)


In [None]:
# Colour per mouse cell-type label, passed as `palette` to the mouse UMAP below.
# NOTE(review): 'Sub-Dermal Prepuce' is 'mediumseagreen' here but 'lightseagreen' in
# the Sankey `color_map` - confirm whether the difference is intended.
color_dict = {
    'Epithelium' : 'mediumorchid', 'Distal Dorsal Glanular Mesenchyme' : 'navy', 
    'Ventral Glanular Mesenchyme' : 'cornflowerblue', 
    'Early Proximal Glanular Mesenchyme' : 'deepskyblue', 'Late Proximal Glanular Mesenchyme' : 'darkturquoise', 
    'Corpus Cavernsoum' : 'forestgreen', 'Preputial Mesenchyme' : 'yellowgreen',
       'Sub-Dermal Prepuce' : 'mediumseagreen',
}

In [None]:
# Figure defaults for the plots saved below: 7x7 inch, framed, PDF output at 150 dpi.
# (This cell only sets plotting parameters; the arguments are the same as in the
# earlier set_figure_params call.)
sc.set_figure_params(scanpy=True, dpi=80, dpi_save=150, 
                         frameon=True, vector_friendly=True, fontsize=14, figsize=[7,7], color_map=None, 
                         format='pdf', facecolor=None, transparent=False,)

In [None]:
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42

In [None]:
mouse.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'] = mouse.obs_names.isin(nhoods_mouse_filtered.index.to_list()).astype(int)

In [None]:
mouse.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'].value_counts()

In [None]:
mouse.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'] = mouse.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'].astype(str)

In [None]:
# Stray reference to `filter2` removed: the name is never defined in this notebook,
# so evaluating it raised NameError and stopped a "Restart & Run All".

In [None]:
mouse.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'] = mouse.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'].astype(str)
mouse.obs['Preputial Mesenchyme --- Corpus Spongiosum'] = mouse.obs['Preputial Mesenchyme --- Corpus Spongiosum'].astype(str)


In [None]:
sc.pl.umap(mouse, color = ['mouse_celltype',
                          ], palette = color_dict, save = '_mouse_orthologs.pdf')

In [None]:
sc.pl.umap(mouse, color = ['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum',
                          ], palette = {'0' : 'gainsboro', '1' : 'darkred'} , save = '_mouse_csearly.pdf')

In [None]:
sc.pl.umap(mouse, color = ['Preputial Mesenchyme --- Corpus Spongiosum',
                          ], palette = {'0' : 'lightgrey', '1' : 'maroon'} , save = '_mouse_csearly_prep.pdf')

In [None]:
sc.pl.umap(mouse, color = ['Foxf1','Sall1','Grid2', 'Foxl2' 
                          ], color_map = 'YlOrRd', use_raw = False, save = '_mouse_markers.pdf')

In [None]:
mouse

In [None]:
mouse.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'].value_counts()

## Differential expression between males and females in corpus spongiosum

In [None]:
# Cells flagged '1' (string, the column was cast to str above) for the
# 'Early Proximal Glanular Mesenchyme --- Corpus Spongiosum' match are relabelled
# 'Corpus Spongiosum'; every other cell keeps its value from mouse.obs['celltype'].
# NOTE(review): the other cells of this notebook read mouse.obs['mouse_celltype'] -
# confirm that a 'celltype' column exists and is the intended annotation.
mouse.obs['matched_celltype'] = np.where(mouse.obs['Early Proximal Glanular Mesenchyme --- Corpus Spongiosum'] == '1',
                                        'Corpus Spongiosum', mouse.obs['celltype'])

In [None]:
mouse.obs['matched_celltype'].value_counts(dropna = False)

In [None]:
# Load original object with full transcriptome and transfer matched cell type annotations 
mouse_full = sc.read('/nfs/team292/vl6/Mouse_RepTract/Amato2021/Amato2021_mese_epi_annotated.h5ad')
mouse_full

In [None]:
mouse_full.obs['matched_celltype'] = mouse_full.obs_names.map(mouse.obs['matched_celltype'].to_dict())

In [None]:
import anndata

In [None]:
import rpy2.rinterface_lib.callbacks
import logging
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)
import anndata2ri
anndata2ri.activate()
%load_ext rpy2.ipython

In [None]:
%%R
library(SingleCellExperiment)
library(biomaRt)
library(data.table)
library(scater)
library(BiocParallel)

In [None]:
mpw_raw = anndata.AnnData(X = mouse_full.raw.X, var = mouse_full.raw.var, obs = mouse_full.obs)
mpw_raw.shape

In [None]:
# remove mito genes
non_mito_genes = [name for name in mpw_raw.var_names if not name.startswith('mt-')]
mpw_raw = mpw_raw[:, non_mito_genes]

# remove ribo genes
# Only the ribosomal-protein prefixes Rps/Rpl are matched. The bare prefix 'Rp' also
# removed unrelated genes whose symbol merely starts with those letters
# (e.g. Rpa1, Rptor, Rpe65).
non_ribo_genes = [name for name in mpw_raw.var_names if not name.startswith(('Rps', 'Rpl'))]
mpw_raw = mpw_raw[:, non_ribo_genes]

# remove heat shock protein genes
non_hps_genes = [name for name in mpw_raw.var_names if not name.startswith('Hsp')]
# .copy() turns the chain of views into a real object before the in-place gene filter.
mpw_raw = mpw_raw[:, non_hps_genes].copy()

# Drop genes with fewer than 10 counts in total.
sc.pp.filter_genes(mpw_raw, min_counts = 10)

In [None]:
mpw_raw.shape

In [None]:
import decoupler as dc

In [None]:
%%R -o results
library(biomaRt)
# Connect to the Ensembl database, selecting the Mus musculus dataset
mart <- useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")

# Retrieve data for genes on the Y chromosome with the generic gene name attribute
results <- getBM(attributes = c("chromosome_name", "external_gene_name"),
                 filters = "chromosome_name", values = "Y", mart = mart, verbose = TRUE)

# Display the first few rows of the results
#head(results)

In [None]:
results = results.set_index('external_gene_name')

In [None]:
results.head()

In [None]:
mpw_raw.var['Y_chrom'] = mpw_raw.var_names.map(results['chromosome_name'].to_dict())

In [None]:
mpw_raw.var['Y_chrom'].value_counts(dropna = False)

In [None]:
ychrom = mpw_raw.var[mpw_raw.var['Y_chrom'] == 'Y'].index.to_list()

In [None]:
# Gene names that are not annotated to the Y chromosome, in their original order.
keep = mpw_raw.var_names[~mpw_raw.var_names.isin(ychrom)].to_list()

In [None]:
len(keep)

In [None]:
mpw_raw = mpw_raw[:, keep]

In [None]:
mouse_full = mouse_full[:, keep]

In [None]:
mouse_full.shape

In [None]:
mouse_full.layers["counts"] = mpw_raw.X.copy()

In [None]:
mouse_full.layers["counts"].toarray()[20:30, 20:30]

In [None]:
# Get pseudo-bulk profile
pdata = dc.get_pseudobulk(
    mouse_full,
    sample_col='donor',
    groups_col='matched_celltype',
    layer='counts',
    mode='sum',
    min_cells=0,
    min_counts=0
)

In [None]:
dc.plot_psbulk_samples(pdata, groupby=['donor', 'matched_celltype'], figsize=(12, 5))

In [None]:
# Get filtered pseudo-bulk profile
pdata = dc.get_pseudobulk(
    mouse_full,
    sample_col='donor',
    groups_col='matched_celltype',
    layer='counts',
    mode='sum',
    min_cells=10,
    min_counts=1000
)
pdata

In [None]:
# Exploratory PCA of the pseudo-bulk profiles. The raw counts are parked in a layer,
# X is transformed for the PCA, and the raw counts are put back into X afterwards so
# that later steps work on counts again.

# Store raw counts in layers
pdata.layers['counts'] = pdata.X.copy()

# Normalize, scale and compute pca
sc.pp.normalize_total(pdata, target_sum=1e4)
sc.pp.log1p(pdata)
sc.pp.scale(pdata, max_value=10)
sc.tl.pca(pdata)

# Return raw counts to X
dc.swap_layer(pdata, 'counts', X_layer_key=None, inplace=True)

In [None]:
sc.pl.pca(pdata, color=['sex', 'matched_celltype'], ncols=1, size=300)
sc.pl.pca_variance_ratio(pdata)

In [None]:
dc.get_metadata_associations(
    pdata,
    obs_keys = ['sex', 'matched_celltype', 'psbulk_n_cells', 'psbulk_counts'],  # Metadata columns to associate to PCs
    obsm_key='X_pca',  # Where the PCs are stored
    uns_key='pca_anova',  # Where the results are stored
    inplace=True,
)

In [None]:
dc.plot_associations(
    pdata,
    uns_key='pca_anova',  # Summary statistics from the anova tests
    obsm_key='X_pca',  # where the PCs are stored
    stat_col='p_adj',  # Which summary statistic to plot
    obs_annotation_cols = ['sex', 'matched_celltype'], # which sample annotations to plot
    titles=['Principle component scores', 'Adjusted p-values from ANOVA'],
    figsize=(7, 7),
    n_factors=10,
)

In [None]:
# Select CorpusSpongiosum
cs = pdata[pdata.obs['matched_celltype'] == 'Corpus Spongiosum'].copy()

In [None]:
# Diagnostic plot for the expression filter applied in the next cell. The thresholds
# are the ones passed to dc.filter_by_expr there (min_count=50, min_total_count=15);
# the plot previously used min_total_count=60 and so showed a different cut-off from
# the one actually applied.
dc.plot_filter_by_expr(cs, group='sex', min_count=50, min_total_count=15)

In [None]:
# Obtain genes that pass the thresholds
genes = dc.filter_by_expr(cs, group='sex', min_count=50, min_total_count=15)

# Filter by these genes
cs = cs[:, genes].copy()
cs

In [None]:
# Import DESeq2
from pydeseq2.dds import DeseqDataSet, DefaultInference
from pydeseq2.ds import DeseqStats

In [None]:
# Build DESeq2 object
# Single-factor design on 'sex', with 'female' passed as the reference level for that
# factor. The same inference object (8 CPUs) is reused for the statistics step below.
inference = DefaultInference(n_cpus=8)
dds = DeseqDataSet(
    adata=cs,
    design_factors='sex',
    ref_level=['sex', 'female'],
    refit_cooks=True,
    inference=inference,
)

In [None]:
# Compute LFCs
dds.deseq2()

In [None]:
# Extract contrast between males vs females
stat_res = DeseqStats(
    dds,
    contrast=["sex", 'male', 'female'],
    inference=inference,
)

In [None]:
# Compute Wald test
stat_res.summary()

In [None]:
# Extract results
results_df = stat_res.results_df
results_df

In [None]:
results_df.loc['Mafb']

In [None]:
results_df.loc['Csrp2']

In [None]:
dc.plot_volcano_df(
    results_df,
    x='log2FoldChange',
    y='padj',
    lFCs_thr = 1.25,
    sign_thr = 0.05,
    top = 41,
    color_pos = 'deepskyblue',
    color_neg = 'pink',
    color_null='lightgray',
    lFCs_limit = 5,
    figsize=(7, 5), 
    save = '_mouse_corpusspongiosumall_DE.pdf'
)

In [None]:
# Keep genes that are both strongly changed (|log2 fold change| > 1.25) and
# significant after multiple-testing correction (adjusted p-value < 0.05).
has_large_effect = results_df['log2FoldChange'].abs() > 1.25
is_significant = results_df['padj'] < 0.05
results_df = results_df[has_large_effect & is_significant]

In [None]:
pd.set_option('display.max_rows', 200)

In [None]:
results_df.sort_values('log2FoldChange')

In [None]:
results_df.to_csv('/nfs/team292/vl6/Mouse_RepTract/Amato2021/corpusspongiosum_allDEGs.csv')