# Allen Brain Atlas 10x Analysis

Differential expression analysis between spatially defined anterior olfactory nucleus (AON)
neuron populations using the Allen Brain Cell Atlas 10x Whole Mouse Brain dataset.

**Goal**: Identify genes differentially expressed between Target (dorsolateral) and 
Non-target (ventromedial) AON glutamatergic neurons.

In [None]:
from pathlib import Path
import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc
from scipy.stats import ttest_ind

# Root data directory, resolved once to an absolute path; the dataset
# locations below are all derived from it.
DATA_DIR = Path("../data").resolve()

# Per-dataset subdirectories under DATA_DIR.
AON_DIR = DATA_DIR.joinpath("aon_10x")
ALLEN_DIR = DATA_DIR.joinpath("allen_brain_atlas")

In [None]:
# Expression matrix (the file name indicates log2-transformed values).
expression_path = ALLEN_DIR / "WMB-10Xv2-OLF-log2.h5ad"
adata = ad.read_h5ad(expression_path)

# Per-cell metadata; only the barcode and its cluster label are loaded.
metadata_path = ALLEN_DIR / "10x_cell_metadata_with_group_membership.csv"
cell_meta = pd.read_csv(metadata_path, usecols=["cell_barcode", "cluster"])

In [3]:
# Keep one metadata row per barcode (the first) and index the table by barcode
# so it can serve as a barcode -> cluster lookup.
cell_meta = cell_meta.drop_duplicates(subset='cell_barcode').set_index('cell_barcode')

# Likewise keep only the first cell for each barcode in the AnnData object.
is_first_occurrence = ~adata.obs['cell_barcode'].duplicated(keep='first')
adata = adata[is_first_occurrence].copy()

# Attach the cluster label to every cell; barcodes absent from the metadata
# come back from the lookup as NaN.
barcode_to_cluster = cell_meta['cluster']
adata.obs['cluster'] = adata.obs['cell_barcode'].map(barcode_to_cluster)


## Define Target and Non-target Clusters

Target clusters are located in the dorsolateral AON (contralaterally-projecting region).
Non-target clusters are in the ventromedial AON. Cluster assignments are based on spatial
coordinates from MERFISH data (see ABCA_Analysis.ipynb).

In [None]:
# Cluster labels assigned to the target population.
target_clusters = [
    '0177 IT AON-TT-DP Glut_3',
    '0180 IT AON-TT-DP Glut_4',
    '0181 IT AON-TT-DP Glut_4',
    '0183 IT AON-TT-DP Glut_5',
    '0184 IT AON-TT-DP Glut_5',
]

# Cluster labels assigned to the non-target population.
nontarget_clusters = [
    '0170 IT AON-TT-DP Glut_1',
    '0171 IT AON-TT-DP Glut_2',
    '0172 IT AON-TT-DP Glut_2',
    '0173 IT AON-TT-DP Glut_2',
    '0175 IT AON-TT-DP Glut_3',
]

# Every cell starts as 'other'; cells whose cluster is listed above are then
# relabelled (target first, non-target second).
adata.obs['group'] = 'other'
for group_label, member_clusters in (
    ('target', target_clusters),
    ('non-target', nontarget_clusters),
):
    in_group = adata.obs['cluster'].isin(member_clusters)
    adata.obs.loc[in_group, 'group'] = group_label

# Subset containing only the two populations being compared.
is_compared = adata.obs['group'].isin(['target', 'non-target'])
combined = adata[is_compared].copy()

group_sizes = adata.obs['group'].value_counts()
print(f"Target cells: {group_sizes.get('target', 0):,}")
print(f"Non-target cells: {group_sizes.get('non-target', 0):,}")

In [None]:
# Map gene symbols to the identifiers used as adata.var_names (Ensembl IDs).
# The mapping is built straight from the 'gene_symbol' column and the var index,
# so it does not depend on the index having a name: with an unnamed index,
# reset_index() calls the column "index" and a lookup by adata.var.index.name
# (None) raises KeyError. For duplicated symbols the last entry wins.
symbol_to_id = dict(zip(adata.var['gene_symbol'], adata.var_names))

# Candidate markers for contralaterally-projecting AON neurons
candidate_genes = ['Robo2', 'Abi3bp', 'Gabrg1', 'Adcyap1', 'Chrm3', 'Rprm', 'Thrb', 'Cntn5']

# Keep only the candidates present in the dataset (symbol -> identifier).
gene_dict = {symbol: symbol_to_id[symbol] for symbol in candidate_genes if symbol in symbol_to_id}

# Report dropped candidates instead of silently omitting them from the plot.
missing_genes = [symbol for symbol in candidate_genes if symbol not in symbol_to_id]
if missing_genes:
    print(f"Candidate genes not found in dataset: {', '.join(missing_genes)}")

# Passing a dict makes each symbol the label shown above its identifier.
sc.pl.dotplot(combined, var_names=gene_dict, groupby='group')

In [6]:
# Boolean masks selecting the cells of each population
group_labels = adata.obs['group']
target_mask = group_labels == 'target'
nontarget_mask = group_labels == 'non-target'

# Expression matrices (cells x genes) of the two populations
X_target, X_nontarget = adata[target_mask].X, adata[nontarget_mask].X

# Both matrices come from the same AnnData, so checking one is enough:
# if it exposes toarray() (sparse storage), densify both.
if hasattr(X_target, "toarray"):
    X_target, X_nontarget = X_target.toarray(), X_nontarget.toarray()

In [7]:
def _benjamini_hochberg(p_values):
    """Return Benjamini-Hochberg adjusted p-values; NaN inputs stay NaN."""
    p = np.asarray(p_values, dtype=float)
    adjusted = np.full(p.shape, np.nan)
    valid = np.flatnonzero(~np.isnan(p))
    if valid.size == 0:
        return adjusted
    # Indices of the valid p-values, ordered from smallest to largest
    order = valid[np.argsort(p[valid])]
    scaled = p[order] * valid.size / np.arange(1, valid.size + 1)
    # Enforce monotonicity, walking down from the largest p-value
    scaled = np.minimum.accumulate(scaled[::-1])[::-1]
    adjusted[order] = np.clip(scaled, 0.0, 1.0)
    return adjusted


# Per-gene mean of the log2 expression values in each group
mean_target = X_target.mean(axis=0)
mean_nontarget = X_nontarget.mean(axis=0)

# Difference of the group means (values are already in log space)
log2_diff = mean_target - mean_nontarget

# Welch's t-test (unequal variances), one test per gene.
# Genes that are constant in both groups yield NaN p-values.
t_stat, p_vals = ttest_ind(X_target, X_nontarget, axis=0, equal_var=False)

# Build result table
de_results = pd.DataFrame({
    'ensembl_id': adata.var_names,
    'mean_expr_target': mean_target,
    'mean_expr_nontarget': mean_nontarget,
    'log2_diff': log2_diff,
    'p_value': p_vals
})

# Add gene symbols (if available)
if 'gene_symbol' in adata.var.columns:
    de_results['gene_symbol'] = adata.var['gene_symbol'].values

# One test is run per gene, so raw p-values overstate significance; add
# FDR-adjusted values. Appended last so existing column positions are unchanged.
de_results['p_adj'] = _benjamini_hochberg(p_vals)

# Sort by log2_diff descending (target-overexpressed)
de_results = de_results.sort_values(by='log2_diff', ascending=False)

# Select top 50 (ranked by effect size only; no significance filter is applied)
top50 = de_results.head(50)

In [8]:
# Write the top-50 table to disk. to_csv does not create missing directories,
# so make sure the output folder exists first.
output_path = Path("../output") / "top50_target_vs_nontarget.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
top50.to_csv(output_path, index=False)

# Preview the first rows; 'gene_symbol' is only present when adata.var provides
# it, so show just the columns that actually exist.
preview_columns = [
    column for column in ('gene_symbol', 'log2_diff', 'p_value')
    if column in top50.columns
]
print(top50[preview_columns].head())

      gene_symbol  log2_diff  p_value
26801      Abi3bp   5.023862      0.0
8093       Gabrg1   3.510349      0.0
20920         Id2   3.509014      0.0
1465       Pcp4l1   3.501947      0.0
6902     Marcksl1   3.424366      0.0
