In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import anndata
import scanpy as sc

In [2]:
def get_hvg(ds, n_top_genes):
    """
    Compute and list the highly-variable genes of an AnnData object.

    `sc.pp.highly_variable_genes` is called on `ds` itself (no copy is made
    here); the "highly_variable" column of `ds.var` is then read back.

    Parameters
    ----------
    ds: AnnData
        The dataset whose highly-variable genes are computed.
    n_top_genes: int
        Number of highly-variable genes requested from scanpy.

    Returns
    -------
    List[str]:
        Entries of `ds.var`'s index whose "highly_variable" flag is set.
    """
    sc.pp.highly_variable_genes(ds, n_top_genes=n_top_genes)

    # Boolean per-gene flag; indexing the flag with itself keeps only the True rows.
    is_hvg = ds.var["highly_variable"]
    return is_hvg[is_hvg].index.to_list()

In [2]:
# no batch effect: split the data 50/50 into a reference and a query set
X = pd.read_csv('data/splatter_counts.csv', index_col=0).values.T  # transposed so the split runs over the csv's columns
y = pd.read_csv('data/splatter_labels.csv', index_col=0).values.flatten()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# write every split without index or header
for _split, _path in (
    (X_train, 'data/splatter_ref_counts.csv'),
    (X_test, 'data/splatter_q_counts.csv'),
    (y_train, 'data/splatter_ref_labels.csv'),
    (y_test, 'data/splatter_q_labels.csv'),
):
    pd.DataFrame(_split).to_csv(_path, index=False, header=False)

In [46]:
# load data with batch effect
b = 1  # value substituted into the `_b%s` suffix of the file names below

# count tables are transposed after loading
X_train, X_test = (
    pd.read_csv(path % b, index_col=0).values.T
    for path in ('data/splatter_ref_counts_b%s.csv', 'data/splatter_q_counts_b%s.csv')
)

# label tables are flattened to 1-D arrays
y_train, y_test = (
    pd.read_csv(path % b, index_col=0).values.flatten()
    for path in ('data/splatter_ref_labels_b%s.csv', 'data/splatter_q_labels_b%s.csv')
)


In [47]:
# Wrap the query and reference matrices in AnnData objects (float64 copies)
q = anndata.AnnData(X_test.astype(np.float64))
ref = anndata.AnnData(X_train.astype(np.float64))

# scale every row of query and reference to a total of 100
q_sums = q.X.sum(axis=1).reshape((-1, 1))
ref_sums = ref.X.sum(axis=1).reshape((-1, 1))
q.X = 100 * q.X / q_sums
ref.X = 100 * ref.X / ref_sums

# log-transform the scaled data
q.X = np.log1p(q.X)
ref.X = np.log1p(ref.X)

# select the union of the top 200 varying genes of each dataset.
# The union is sorted because iterating a set of strings does not give a
# stable order across interpreter runs, which would make the gene order
# (and every file exported from it) differ between kernel restarts.
hvg = sorted(set(get_hvg(q, 200)).union(get_hvg(ref, 200)))

# .copy() turns the sliced views into standalone AnnData objects, so later
# writes (neighbors graph, obs columns) do not go through a view.
q = q[:, hvg].copy()
ref = ref[:, hvg].copy()

  disp_grouped = df.groupby("mean_bin")["dispersions"]
  disp_grouped = df.groupby("mean_bin")["dispersions"]


First cluster the query data using Leiden clustering. 

In [48]:
# Perform clustering
sc.pp.neighbors(q, n_neighbors=10, use_rep='X')  # Compute neighborhood graph
sc.tl.leiden(q)  # Perform Leiden clustering (graph-based community detection on the neighborhood graph, not a K-means variant)
q_clustering = q.obs['leiden']  # per-cell cluster labels; reused below as the query labels handed to RefCM

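CIPR: export the per-cluster average expression of the query and the labelled reference expression as csv files, then annotate the clusters with the CIPR web tool.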

In [49]:
# Sorted array of the distinct Leiden cluster labels present in the query
clusters = np.unique(q.obs['leiden'])

In [50]:
# One row per gene, one column per Leiden cluster; filled column by column below.
avg_expression_per_cluster = pd.DataFrame(index=q.var_names, columns=clusters)
for cluster in clusters:
    # Boolean mask selecting the cells assigned to this cluster
    in_cluster = (q.obs['leiden'] == cluster).to_numpy()

    # Mean expression of every gene over the cells of this cluster
    avg_expression_per_cluster[cluster] = q.X[in_cluster, :].mean(axis=0)

# Query-side input file for CIPR
avg_expression_per_cluster.to_csv('data/splatter_cipr_query.csv')

In [51]:
# Reference-side input file for CIPR: rows are genes (ref.var_names), columns are
# the reference cells, each column named after that cell's label in y_train.
# (The former empty-frame initialisation was removed: it was overwritten immediately.)
gene_expression_df = pd.DataFrame(ref.X.T, index=ref.var_names)
gene_expression_df.columns = y_train
gene_expression_df.to_csv('data/splatter_cipr_ref.csv')

In [52]:
# Manually entered results from the CIPR web tool (https://aekiz.shinyapps.io/CIPR/):
# Leiden cluster label -> group assigned by CIPR. These values must be re-entered
# whenever the clustering or the exported csv files change.
# Before uploading, make sure to put 'Gene' as the first column in the csv files.
cipr_results = {
    '0': 'Group2', 
    '1': 'Group5', 
    '2': 'Group1',
    '3': 'Group4',
    '4': 'Group3'
}

In [53]:
# Give every query cell the CIPR call of its Leiden cluster;
# cells whose cluster has no entry in cipr_results get "".
q.obs["cipr"] = q.obs["leiden"].astype(object).map(cipr_results).fillna("")

In [54]:
# CIPR accuracy: fraction of query cells whose CIPR label equals the true label
np.mean(np.array(q.obs['cipr']) == y_test)

0.9995

# Perform RefCM

In [136]:
from refcm import RefCM

In [137]:
# setup
# RefCM gets the matrices as loaded (not the normalized / HVG-subset `q` and `ref`).
# NOTE: this cell relies on X_test / X_train still holding the loaded matrices; the
# SVM section further down rebinds both names, so re-running this cell after it
# would silently feed the processed matrices instead.
adata_q = anndata.AnnData(X_test.astype(np.float64))
adata_q.obs['labels'] = q_clustering  # query "labels" are the Leiden cluster assignments

adata_ref = anndata.AnnData(X_train.astype(np.float64))
adata_ref.obs['labels'] = y_train  # reference labels are the known training labels

In [138]:
# n_top_genes=200 matches the per-dataset HVG count used in the preprocessing above.
# load_mcosts / save_mcosts presumably control caching of matching costs - TODO confirm against RefCM.
rcm = RefCM(load_mcosts=False, save_mcosts=False, n_top_genes=200)

In [139]:
# Annotate the query against the reference, using the 'labels' obs column of each.
# The accuracy cell below reads adata_q.obs['refcm_annot'], so the call presumably writes that column - TODO confirm.
rcm.annotate(adata_q, 'splatter_q', adata_ref, 'splatter_ref', 'labels', 'labels')

  disp_grouped = df.groupby("mean_bin")["dispersions"]
  disp_grouped = df.groupby("mean_bin")["dispersions"]
|████████████████| [100.00% ] : 00:02


<matchings.Matching at 0x33146c2d0>

In [140]:
# RefCM accuracy: fraction of query cells whose RefCM annotation equals the true label
np.mean(np.array(adata_q.obs['refcm_annot']) == y_test)

0.9985

# Benchmark with SVM

In [141]:
from sklearn.svm import SVC

In [142]:
# Use the log-normalized, HVG-subset matrices as SVM features.
# NOTE(review): this rebinds X_train / X_test, shadowing the matrices loaded earlier;
# cells above that build AnnData objects from X_train / X_test must not be re-run
# after this one without reloading the data first.
X_train, X_test = ref.X, q.X

In [143]:
# create and train the SVM (linear kernel) on the reference matrix and its labels
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

In [144]:
# predict a label for every query cell
preds = classifier.predict(X_test)

In [145]:
# SVM accuracy: fraction of query cells whose predicted label equals the true label
np.mean(preds == y_test)

0.43325