In [3]:
import scanpy as sc
import sys
import pandas as pd
import numpy as np

In [4]:
# Extend the import search path with a project directory; presumably this is
# where the `matching` module imported in the next cell lives (TODO confirm).
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine where this directory exists.
sys.path.append('/home/icb/laura.martens/organoid_regulomes/integration/')

In [5]:
from matching import get_cost_knn_graph, mcmf

In [6]:
# Location of the AnnData file to load (absolute cluster path).
embedding_h5ad = '/lustre/groups/ml01/workspace/laura.martens/moretti_colab/signac/adata_xglue_embedding.h5ad'
# Read the file into `adata`; the extra keyword arguments are forwarded to
# sc.read unchanged.
adata = sc.read(embedding_h5ad, sparse=False, X_name="matrix")

In [7]:
# Display the AnnData summary as a quick sanity check of what was loaded.
adata

AnnData object with n_obs × n_vars = 62526 × 50
    obs: 'clusters', 'day', 'tech'

In [8]:
# Build a DataFrame (rows = observations, columns = adata.var entries) from adata.X.
# Densify only when X actually offers `.todense()` (e.g. a sparse matrix).
# The previous bare `except:` also swallowed unrelated failures such as
# MemoryError or KeyboardInterrupt during densification and silently fell
# back to wrapping X as-is.
if hasattr(adata.X, "todense"):
    embed_mat = pd.DataFrame(adata.X.todense())
else:
    embed_mat = pd.DataFrame(adata.X)
# Label rows/columns with the AnnData obs/var names so the boolean masks
# derived from adata.obs below align with embed_mat's index.
embed_mat.columns = list(adata.var.index.values)
embed_mat.index = list(adata.obs.index.values)

sys.stdout.write('Splitting groups.\n')
# Get column to split by
split_vals = adata.obs.loc[:, "tech"]
# Get larger group: group1 is the most frequent "tech" value
group1_name = split_vals.value_counts().idxmax()
# First "tech" value that differs from group1 (raises IndexError if there is none)
group2_name = split_vals[split_vals!=group1_name].unique()[0]

# Split groups
group1 = embed_mat[split_vals==group1_name]
# NOTE(review): group2 is every row that is not group1. If "tech" ever has more
# than two distinct values, group2 would mix them while being labelled with
# group2_name only.
group2 = embed_mat[split_vals!=group1_name]

Splitting groups.


In [39]:
sys.stdout.write('Computing digraph.\n')
# Settings forwarded verbatim to get_cost_knn_graph, gathered in one place so
# they are easy to spot and tune.
knn_graph_options = dict(
    knn_k=20,
    knn_n_jobs=1,
    null_cost_percentile=99,
    capacity_method="uniform",
    add_null=False,
)
# Build the cost graph from the larger group (source) to the smaller group (target).
cost_graph = get_cost_knn_graph(source=group1, target=group2, **knn_graph_options)

Computing digraph.
Max dist:  517
Number of nodes:  62528
Number of edges:  1036249


In [40]:
sys.stdout.write('Finding optimal matches.\n')
# Find bipartite matches: mcmf returns two parallel sequences, unpacked as
# positions into group1 (g1_idx) and group2 (g2_idx), as used further down.
# NOTE: slow step; the recorded run reports roughly 1173 s for this call.
g1_idx, g2_idx = mcmf(cost_graph)

Finding optimal matches.
MCMF took [s]: 1173.373453853


In [44]:
# Number of entries on the group1 side of the matching returned by mcmf.
len(g1_idx)

31244

In [45]:
# Number of entries on the group2 side; should equal len(g1_idx) since the two
# arrays are used as parallel pairs.
len(g2_idx)

31244

In [47]:
# Flag every pair in which at least one side of the match is NaN.
any_nans = np.logical_or(np.isnan(g1_idx), np.isnan(g2_idx))

In [60]:
# Count NaN entries on the group2 side of the matching.
np.isnan(g2_idx).sum()

0

In [61]:
# Count pairs where either side is NaN.
any_nans.sum()

0

In [62]:
# Fraction of pairs whose group2 side is NaN, relative to the number of pairs.
np.isnan(g2_idx).sum()/len(g1_idx)

0.0

In [50]:
# Keep only pairs where both sides are present, then translate the integer
# positions into the row labels of the respective group.
valid_pairs = ~any_nans
g1_positions = g1_idx[valid_pairs].astype(int)
g2_positions = g2_idx[valid_pairs].astype(int)
g1_idx_n = group1.index[g1_positions]
g2_idx_n = group2.index[g2_positions]

In [51]:
# Number of matched group1 labels after dropping NaN pairs.
g1_idx_n.shape

(31244,)

In [52]:
# Number of matched group2 labels after dropping NaN pairs.
g2_idx_n.shape

(31244,)

In [39]:
# Inspect the matched group2 labels.
# NOTE(review): the saved output of this cell is an integer array from an
# earlier execution (count 39), whereas g2_idx_n is currently built from
# group2.index and holds cell names. Re-run the cell to refresh the output.
g2_idx_n

array([10470,  1571,   332, ..., 14607, 18658, 25295])

In [53]:
# Share of group1 rows that appear in at least one matched pair.
n_group1_matched = group1.index.isin(g1_idx_n).sum()
n_group1_matched / len(group1.index)

0.880137468661089

In [58]:
# Distinct group1 labels among the matches. In the recorded run this equals the
# number of pairs (31244), i.e. no group1 label appears in more than one pair.
g1_idx_n.unique()

Index(['AAACCCAAGCCGCACT-1_7', 'AAACCCAAGCGAGAAA-1_6', 'AAACCCAAGCTCCATA-1_1',
       'AAACCCAAGTAGCATA-1_1', 'AAACCCAAGTTAGAAC-1_4', 'AAACCCACAACCTAAC-1_4',
       'AAACCCACAATTCTTC-1_7', 'AAACCCACACGCTGAC-1_5', 'AAACCCACAGACGGAT-1_7',
       'AAACCCACATCGTGGC-1_7',
       ...
       'TTTGTTGGTTTCGTAG-1_5', 'TTTGTTGTCAAAGCCT-1_2', 'TTTGTTGTCAGCTTGA-1_5',
       'TTTGTTGTCATCGCCT-1_1', 'TTTGTTGTCCACCCTA-1_7', 'TTTGTTGTCCACGGGT-1_4',
       'TTTGTTGTCCTATTGT-1_1', 'TTTGTTGTCGTCAAAC-1_4', 'TTTGTTGTCTCTCTAA-1_6',
       'TTTGTTGTCTTAGGAC-1_1'],
      dtype='object', length=31244)

In [59]:
# Distinct group2 labels among the matches. In the recorded run there are fewer
# distinct labels (24226) than pairs (31244), so some group2 rows are matched
# to more than one group1 row.
g2_idx_n.unique()

Index(['CAGCCTTTCACAACAC-7', 'TACTGCCAGAGCCTGA-3', 'GAACCGCTCCGTTTCG-1',
       'TAGCCCTCAACGCACC-1', 'CGTAAACAGATGCGCA-4', 'CTACTTAAGGATGTAT-4',
       'CATGCCTAGTCGTATC-7', 'GAGATTCGTCCGTCGA-4', 'TGGTCCTGTAAAGGCC-7',
       'CAAGGCCGTCTGGGCT-7',
       ...
       'TACATGGCACGAACGA-7', 'TGGAAGGTCGCAAACT-2', 'TTAGCGATCTGGCGCA-1',
       'CGTAAACGTCATAGAA-1', 'GCGCCAACAGGCACAA-6', 'GCACGGTTCGGGAAAC-1',
       'TTTGTGTTCTTCATAC-7', 'CTCCCAAAGTAACTCC-3', 'TCCGACTTCCATTGTT-6',
       'GCTCCTACATCCCTCA-1'],
      dtype='object', length=24226)

In [64]:
# Drop pairs with a missing partner, cast the surviving positions to integers,
# and assemble the final table of matched row labels (one column per group).
# Note that g1_idx / g2_idx are rebound to their filtered integer versions here.
any_nans = np.logical_or(np.isnan(g1_idx), np.isnan(g2_idx))
complete_pairs = ~any_nans
g1_idx = g1_idx[complete_pairs].astype(int)
g2_idx = g2_idx[complete_pairs].astype(int)
matched_labels = {
    group1_name: group1.index[g1_idx],
    group2_name: group2.index[g2_idx],
}
matches = pd.DataFrame(matched_labels)

In [65]:
# Display the table of matched pairs (one row per pair, one column per group).
matches

Unnamed: 0,rna,atac
0,AAACCCAAGCCGCACT-1_7,CAGCCTTTCACAACAC-7
1,AAACCCAAGCGAGAAA-1_6,TACTGCCAGAGCCTGA-3
2,AAACCCAAGCTCCATA-1_1,GAACCGCTCCGTTTCG-1
3,AAACCCAAGTAGCATA-1_1,TAGCCCTCAACGCACC-1
4,AAACCCAAGTTAGAAC-1_4,CGTAAACAGATGCGCA-4
...,...,...
31239,TTTGTTGTCCACGGGT-1_4,CTCCCAAAGTAACTCC-3
31240,TTTGTTGTCCTATTGT-1_1,CTGTTCGCAGTGCGAA-1
31241,TTTGTTGTCGTCAAAC-1_4,GCCTACTGTCTAAAGA-2
31242,TTTGTTGTCTCTCTAA-1_6,TCCGACTTCCATTGTT-6
