In [1]:
from refcm import RefCM
from matchings import Matching
import scanpy as sc
import config
import logging

# Turn on the project's logging at DEBUG level so the [DEBUG]/[INFO] lines
# emitted by later cells are shown in the cell outputs.
config.start_logging(logging.DEBUG)

In [12]:
# Read the zebrafish .h5ad file (path is relative to the working directory).
ds = sc.read_h5ad('zebrafish.h5ad')

In [15]:
# Show the per-cell annotation table (counts, library, cluster, tissue, time point).
ds.obs

Unnamed: 0_level_0,n_counts,unique_cell_id,cell_names,library_id,batch,ClusterID,ClusterName,TissueID,TissueName,TimeID
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0-0-0,15111.0,DEW050_AGTCAATAC-TTGGATCG,bcGPGV,DEW050,0,1,04hpf-pluripotent,9,Pluripotent,4hpf
1-0-0,2337.0,DEW050_AAGAACGGG-GCGTTGCT,bcDSDI,DEW050,0,1,04hpf-pluripotent,9,Pluripotent,4hpf
2-0-0,2078.0,DEW050_GACCTACTAG-TTAGTCCG,bcENHV,DEW050,0,1,04hpf-pluripotent,9,Pluripotent,4hpf
3-0-0,1648.0,DEW050_GTTTGTTT-GGTCCCTT,bcAABE,DEW050,0,1,04hpf-pluripotent,9,Pluripotent,4hpf
4-0-0,1153.0,DEW050_TGATTGCACGC-TAACCATC,bcFTTU,DEW050,0,1,04hpf-pluripotent,9,Pluripotent,4hpf
...,...,...,...,...,...,...,...,...,...,...
1993-28-6,2525.0,DEW169_ATTTCCAT-CAGTCCCT,bcDANY,DEW169,6,135,24hpf-muscle - myl1,8,Mesoderm,24hpf
1994-28-6,1548.0,DEW169_CTACGGGA-ATACCCAG,bcELQP,DEW169,6,157,24hpf-differentiating neurons - eomesa,1,Forebrain / Optic,24hpf
2000-28-6,5054.0,DEW169_TGGAAAGC-CCGCAACT,bcIGIF,DEW169,6,138,24hpf-neural - diencephalon,1,Forebrain / Optic,24hpf
2001-28-6,3270.0,DEW169_GAAAGACA-GTCCGTAC,bcFBRF,DEW169,6,178,24hpf-pharyngeal arch - ndnf,8,Mesoderm,24hpf


In [2]:
# Load the human and macaque LGN intron datasets (human first, then macaque)
# from the local data/ directory.
human_intron, macaque_intron = (
    sc.read_h5ad(h5ad_path)
    for h5ad_path in (
        'data/LGN_human_intron.h5ad',
        'data/LGN_macaque_intron.h5ad',
    )
)

[h5py._conv      ] [DEBUG   ] : Creating converter from 3 to 5


In [4]:
# Configure RefCM and annotate one dataset against the other using the
# 'cluster_label' column on both sides.
# NOTE(review): RefCM logs that raw counts are expected in .X — confirm the
# loaded files satisfy that.
rcm = RefCM(
    target_sum=1e6,
    discovery_threshold=0.1,
    n_top_genes=3000,
)

# Argument order is (dataset, name) for each side followed by the two label
# columns; presumably query first, reference second — verify against
# RefCM.annotate.
m = rcm.annotate(
    human_intron, 'human_intron',
    macaque_intron, 'macaque_intron',
    'cluster_label', 'cluster_label',
)

# Evaluate the matching against the known labels, then show the cost matrix.
m.eval('cluster_label')
m.display_matching_costs('cluster_label')

[refcm           ] [INFO    ] : NOTE: raw counts expected in anndata .X attributes.
[refcm           ] [DEBUG   ] : No existing matching db cost file db.json found.
[refcm           ] [DEBUG   ] : Selecting joint gene subset for query and reference datasets




[refcm           ] [DEBUG   ] : Using 2770 genes.
[refcm           ] [DEBUG   ] : Computing Wasserstein distances.
|████████████████| [100.00% ] : 00:02
[refcm           ] [DEBUG   ] : starting LP optimization
[refcm           ] [DEBUG   ] : optimization terminated w. status "Optimal"
[matchings       ] [DEBUG   ] : Astro                mapped to K1                  
[matchings       ] [DEBUG   ] : GABA1                mapped to GABA1               
[matchings       ] [DEBUG   ] : GABA2                mapped to GABA3               
[matchings       ] [DEBUG   ] : GABA3                mapped to GABA4               
[matchings       ] [DEBUG   ] : K1                   mapped to Pulv                
[matchings       ] [DEBUG   ] : 

In [2]:
# Build (short name, loaded dataset) pairs for every LGN species / read-type
# combination. Loaders are invoked in listing order, so the log output order
# matches the order below.
# NOTE(review): the LGN_* loaders are not imported in any visible cell —
# confirm where they come from so this cell survives Restart & Run All.
lgn_ds = [
    (short_name, load())
    for short_name, load in (
        ('human_intron', LGN_human_intron),
        ('human_exon', LGN_human_exon),
        ('macaque_intron', LGN_macaque_intron),
        ('macaque_exon', LGN_macaque_exon),
        ('mouse_intron', LGN_mouse_intron),
        ('mouse_exon', LGN_mouse_exon),
    )
]

[datasets        ] [INFO    ] : loading raw   data for lgn_human_intron
[datasets        ] [INFO    ] : loading label data for lgn_human_intron
[datasets        ] [INFO    ] : loading raw   data for lgn_human_exon
[datasets        ] [INFO    ] : loading label data for lgn_human_exon
[datasets        ] [INFO    ] : loading raw   data for lgn_macaque_intron
[datasets        ] [INFO    ] : loading label data for lgn_macaque_intron
[datasets        ] [INFO    ] : loading raw   data for lgn_macaque_exon
[datasets        ] [INFO    ] : loading label data for lgn_macaque_exon
[datasets        ] [INFO    ] : loading raw   data for lgn_mouse_intron
[datasets        ] [INFO    ] : loading label data for lgn_mouse_intron
[datasets        ] [INFO    ] : loading raw   data for lgn_mouse_exon
[datasets        ] [INFO    ] : loading label data for lgn_mouse_exon


In [3]:
# One summary line per dataset: its short name and the number of entries in
# its key -> label mapping.
for n, d in lgn_ds:
    n_cell_types = len(d._keys_to_labels.items())
    print(f'{n:<16}: {n_cell_types:<2} cell types')

human_intron    : 10 cell types
human_exon      : 10 cell types
macaque_intron  : 9  cell types
macaque_exon    : 9  cell types
mouse_intron    : 15 cell types
mouse_exon      : 15 cell types


In [4]:
# Freshly load the three intron datasets (human, macaque, mouse) for the
# pairwise label comparison in the next cell.
l = [
    load()
    for load in (LGN_human_intron, LGN_macaque_intron, LGN_mouse_intron)
]

[datasets        ] [INFO    ] : loading raw   data for lgn_human_intron
[datasets        ] [INFO    ] : loading label data for lgn_human_intron
[datasets        ] [INFO    ] : loading raw   data for lgn_macaque_intron
[datasets        ] [INFO    ] : loading label data for lgn_macaque_intron
[datasets        ] [INFO    ] : loading raw   data for lgn_mouse_intron
[datasets        ] [INFO    ] : loading label data for lgn_mouse_intron


In [5]:
# For every unordered pair of datasets in `l`, report which cell-type labels
# they have in common. Each dataset is only compared with those after it, so
# every pair is printed exactly once.
for i, q in enumerate(l):
    q_cts = set(q._keys_to_labels.values())
    later_datasets = l[i + 1:]
    for r in later_datasets:
        r_cts = set(r._keys_to_labels.values())
        isect = list(q_cts & r_cts)
        print(f'{q.name:<20} intersects with {r.name:<20} on {len(isect)} cell types: {isect}')

lgn_human_intron     intersects with lgn_macaque_intron   on 5 cell types: ['GABA3', 'K2', 'K1', 'GABA2', 'GABA1']
lgn_human_intron     intersects with lgn_mouse_intron     on 3 cell types: ['GABA3', 'GABA2', 'GABA1']
lgn_macaque_intron   intersects with lgn_mouse_intron     on 4 cell types: ['GABA3', 'GABA2', 'GABA4', 'GABA1']


In [6]:
# Match every dataset (as query) against every other dataset (as reference)
# with a default-configured RefCM, and print one accuracy line per pair.
rcm = RefCM()

for i, (query_n, query_ds) in enumerate(lgn_ds):
    n_qcs = len(query_ds._keys_to_labels.items())

    # Every dataset except the query itself, in original order.
    references = lgn_ds[:i] + lgn_ds[i+1:]
    for ref_n, ref_ds in references:
        n_rcs = len(ref_ds._keys_to_labels.items())

        m = rcm.match(ref_ds, query_ds)

        summary = (
            f'correctly matched {m.n_correct:<2} / {n_qcs:<2} '
            f'(or {100*m.pct_correct:5.1f}% of cells) '
            f'from {query_n:<14} (query) to {ref_n:<14} '
            f'(reference with {n_rcs:<2} cell types)'
        )
        print(summary)

    # Blank line between query groups.
    print()

correctly matched 10 / 10 (or 100.0% of cells) from human_intron   (query) to human_exon     (reference with 10 cell types)
correctly matched 1  / 10 (or   4.6% of cells) from human_intron   (query) to macaque_intron (reference with 9  cell types)
correctly matched 4  / 10 (or  17.1% of cells) from human_intron   (query) to macaque_exon   (reference with 9  cell types)
correctly matched 0  / 10 (or   0.0% of cells) from human_intron   (query) to mouse_intron   (reference with 15 cell types)
correctly matched 0  / 10 (or   0.0% of cells) from human_intron   (query) to mouse_exon     (reference with 15 cell types)

correctly matched 10 / 10 (or 100.0% of cells) from human_exon     (query) to human_intron   (reference with 10 cell types)
correctly matched 4  / 10 (or  17.1% of cells) from human_exon     (query) to macaque_intron (reference with 9  cell types)
correctly matched 4  / 10 (or  15.0% of cells) from human_exon     (query) to macaque_exon   (reference with 9  cell types)
correct

In [26]:
# Match macaque_intron (lgn_ds[2]) against the two human datasets
# (lgn_ds[0] and lgn_ds[1]) used jointly as references, with the LP solver.
rcm = RefCM(
    solver='lp',
    n_target_clusters=4,
    max_merges=2,
)

reference_datasets = [lgn_ds[0][1], lgn_ds[1][1]]
query_dataset = lgn_ds[2][1]
m = rcm.match(reference_datasets, query_dataset)

# Show the resulting matching as a graph, then the underlying cost matrix.
m.display_matching_graph()
m.display_matching_costs()

[matchings       ] [INFO    ] : mapped lgn_macaque_intron   to lgn_human_exon-lgn_human_intron
[matchings       ] [INFO    ] : (5  common cell types)
[matchings       ] [INFO    ] : 4 /9  correct mappings
[matchings       ] [INFO    ] : 4 /9  incorrect mapping


In [56]:
# Match human_intron (lgn_ds[0]) against macaque_exon (lgn_ds[3]) as the
# only reference, with the LP solver.
# NOTE(review): discovery_threshold=10 is two orders of magnitude above the
# 0.1 used in an earlier cell — confirm the intended scale.
rcm = RefCM(
    solver='lp',
    discovery_threshold=10,
)

reference_datasets = [lgn_ds[3][1]]
query_dataset = lgn_ds[0][1]
m = rcm.match(reference_datasets, query_dataset)

# Show the resulting matching as a graph, then the underlying cost matrix.
m.display_matching_graph()
m.display_matching_costs()

[matchings       ] [INFO    ] : mapped lgn_human_intron     to lgn_macaque_exon    
[matchings       ] [INFO    ] : (5  common cell types)
[matchings       ] [INFO    ] : 4 /10 correct mappings
[matchings       ] [INFO    ] : 3 /10 incorrect mapping
