/
_dpt.py
909 lines (845 loc) · 44.5 KB
/
_dpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
import numpy as np
import pandas as pd
import scipy as sp
import networkx as nx
from natsort import natsorted
from .. import logging as logg
from ..logging import _settings_verbosity_greater_or_equal_than
from ..neighbors import Neighbors, OnFlySymMatrix
def _diffmap(adata, n_comps=15):
logg.info('computing Diffusion Maps using n_comps={}(=n_dcs)'.format(n_comps), r=True)
dpt = DPT(adata)
dpt.compute_transitions()
dpt.compute_eigen(n_comps=n_comps)
adata.obsm['X_diffmap'] = dpt.eigen_basis
adata.uns['diffmap_evals'] = dpt.eigen_values
logg.info(' finished', time=True, end=' ' if _settings_verbosity_greater_or_equal_than(3) else '\n')
logg.hint('added\n'
' \'X_diffmap\', diffmap coordinates (adata.obsm)\n'
' \'diffmap_evals\', eigenvalues of transition matrix (adata.uns)')
def dpt(adata, n_dcs=10, n_branchings=0, min_group_size=0.01,
allow_kendall_tau_shift=True, copy=False):
"""Infer progression of cells through geodesic distance along the graph [Haghverdi16]_ [Wolf17i]_.
Reconstruct the progression of a biological process from snapshot
data. `Diffusion Pseudotime` has been introduced by [Haghverdi16]_ and
implemented within Scanpy [Wolf17]_. Here, we use a further developed
version, which is able to deal with disconnected graphs [Wolf17i]_ and can
be run in a `hierarchical` mode by setting the parameter
`n_branchings>1`. We recommend, however, to only use
:func:`~scanpy.api.tl.dpt` for computing pseudotime (`n_branchings=0`) and
to detect branchings via :func:`~scanpy.api.paga`. For pseudotime, you need
to annotate your data with a root cell. For instance::
adata.uns['iroot'] = np.flatnonzero(adata.obs['cell_types'] == 'Stem')[0]
This requires to run :func:`~scanpy.api.pp.neighbors`, first. In order to
reproduce the original implementation of DPT, use `method=='gauss'` in
this. Using the default `method=='umap'` only leads to minor quantitative
differences, though.
.. versionadded:: 1.1
:func:`~scanpy.api.tl.dpt` also requires to run
:func:`~scanpy.api.tl.diffmap` first. As previously,
:func:`~scanpy.api.tl.dpt` came with a default parameter of ``n_dcs=10`` but
:func:`~scanpy.api.tl.diffmap` has a default parameter of ``n_comps=15``,
you need to pass ``n_comps=10`` in :func:`~scanpy.api.tl.diffmap` in order
to exactly reproduce previous :func:`~scanpy.api.tl.dpt` results.
Parameters
----------
adata : :class:`~anndata.AnnData`
Annotated data matrix.
n_dcs : `int`, optional (default: 10)
The number of diffusion components to use.
n_branchings : `int`, optional (default: 0)
Number of branchings to detect.
min_group_size : [0, 1] or `float`, optional (default: 0.01)
During recursive splitting of branches ('dpt groups') for `n_branchings`
> 1, do not consider groups that contain less than `min_group_size` data
points. If a float, `min_group_size` refers to a fraction of the total
number of data points.
allow_kendall_tau_shift : `bool`, optional (default: `True`)
If a very small branch is detected upon splitting, shift away from
maximum correlation in Kendall tau criterion of [Haghverdi16]_ to
stabilize the splitting.
copy : `bool`, optional (default: `False`)
Copy instance before computation and return a copy. Otherwise, perform
computation inplace and return None.
Returns
-------
Depending on `copy`, returns or updates `adata` with the following fields.
If `n_branchings==0`, no field `dpt_groups` will be written.
dpt_pseudotime : `pd.Series` (`adata.obs`, dtype `float`)
Array of dim (number of samples) that stores the pseudotime of each
cell, that is, the DPT distance with respect to the root cell.
dpt_groups : `pd.Series` (`adata.obs`, dtype `category`)
Array of dim (number of samples) that stores the subgroup id ('0',
'1', ...) for each cell. The groups typically correspond to
'progenitor cells', 'undecided cells' or 'branches' of a process.
Notes
-----
The tool is similar to the R package `destiny` of [Angerer16]_.
"""
# standard errors, warnings etc.
adata = adata.copy() if copy else adata
if 'neighbors' not in adata.uns:
raise ValueError(
'You need to run `pp.neighbors` and `tl.diffmap` first.')
if 'iroot' not in adata.uns and 'xroot' not in adata.var:
logg.warn(
'No root cell found. To compute pseudotime, pass the index or '
'expression vector of a root cell, one of:\n'
' adata.uns[\'iroot\'] = root_cell_index\n'
' adata.var[\'xroot\'] = adata[root_cell_name, :].X')
if 'X_diffmap' not in adata.obsm.keys():
logg.warn('Trying to run `tl.dpt` without prior call of `tl.diffmap`. '
'Falling back to `tl.diffmap` with default parameters.')
_diffmap(adata)
# start with the actual computation
dpt = DPT(adata, n_dcs=n_dcs, min_group_size=min_group_size,
n_branchings=n_branchings,
allow_kendall_tau_shift=allow_kendall_tau_shift)
logg.info('computing Diffusion Pseudotime using n_dcs={}'.format(n_dcs), r=True)
if n_branchings > 1: logg.info(' this uses a hierarchical implementation')
if dpt.iroot is not None:
dpt._set_pseudotime() # pseudotimes are distances from root point
adata.uns['iroot'] = dpt.iroot # update iroot, might have changed when subsampling, for example
adata.obs['dpt_pseudotime'] = dpt.pseudotime
# detect branchings and partition the data into segments
if n_branchings > 0:
dpt.branchings_segments()
adata.obs['dpt_groups'] = pd.Categorical(
values=dpt.segs_names.astype('U'),
categories=natsorted(np.array(dpt.segs_names_unique).astype('U')))
# the "change points" separate segments in the ordering above
adata.uns['dpt_changepoints'] = dpt.changepoints
# the tip points of segments
adata.uns['dpt_grouptips'] = dpt.segs_tips
# the ordering according to segments and pseudotime
ordering_id = np.zeros(adata.n_obs, dtype=int)
for count, idx in enumerate(dpt.indices): ordering_id[idx] = count
adata.obs['dpt_order'] = ordering_id
adata.obs['dpt_order_indices'] = dpt.indices
logg.info(' finished', time=True, end=' ' if _settings_verbosity_greater_or_equal_than(3) else '\n')
logg.hint('added\n'
+ (' \'dpt_pseudotime\', the pseudotime (adata.obs)'
if dpt.iroot is not None else '')
+ ('\n \'dpt_groups\', the branching subgroups of dpt (adata.obs)\n'
+ ' \'dpt_order\', cell order (adata.obs)'
if n_branchings > 0 else ''))
return adata if copy else None
class DPT(Neighbors):
"""Hierarchical Diffusion Pseudotime.
"""
def __init__(self, adata, n_dcs=None, min_group_size=0.01,
n_branchings=0, allow_kendall_tau_shift=False):
super(DPT, self).__init__(adata, n_dcs=n_dcs)
self.flavor = 'haghverdi16'
self.n_branchings = n_branchings
self.min_group_size = min_group_size if min_group_size >= 1 else int(min_group_size * self._adata.shape[0])
self.passed_adata = adata # just for debugging purposes
self.choose_largest_segment = False
self.allow_kendall_tau_shift = allow_kendall_tau_shift
def branchings_segments(self):
"""Detect branchings and partition the data into corresponding segments.
Detect all branchings up to `n_branchings`.
Writes
------
segs : np.ndarray
Array of dimension (number of segments) × (number of data
points). Each row stores a mask array that defines a segment.
segs_tips : np.ndarray
Array of dimension (number of segments) × 2. Each row stores the
indices of the two tip points of each segment.
segs_names : np.ndarray
Array of dimension (number of data points). Stores an integer label
for each segment.
"""
self.detect_branchings()
self.postprocess_segments()
self.set_segs_names()
self.order_pseudotime()
def detect_branchings(self):
"""Detect all branchings up to `n_branchings`.
Writes Attributes
-----------------
segs : np.ndarray
List of integer index arrays.
segs_tips : np.ndarray
List of indices of the tips of segments.
"""
logg.m(' detect', self.n_branchings,
'branching' + ('' if self.n_branchings == 1 else 's'))
# a segment is a subset of points of the data set (defined by the
# indices of the points in the segment)
# initialize the search for branchings with a single segment,
# that is, get the indices of the whole data set
indices_all = np.arange(self._adata.shape[0], dtype=int)
# let's keep a list of segments, the first segment to add is the
# whole data set
segs = [indices_all]
# a segment can as well be defined by the two points that have maximal
# distance in the segment, the "tips" of the segment
#
# the rest of the points in the segment is then defined by demanding
# them to "be close to the line segment that connects the tips", that
# is, for such a point, the normalized added distance to both tips is
# smaller than one:
# (D[tips[0],i] + D[tips[1],i])/D[tips[0],tips[1] < 1
# of course, this condition is fulfilled by the full cylindrical
# subspace surrounding that line segment, where the radius of the
# cylinder can be infinite
#
# if D denotes a euclidian distance matrix, a line segment is a linear
# object, and the name "line" is justified. if we take the
# diffusion-based distance matrix Dchosen, which approximates geodesic
# distance, with "line", we mean the shortest path between two points,
# which can be highly non-linear in the original space
#
# let us define the tips of the whole data set
if False: # this is safe, but not compatible with on-the-fly computation
tips_all = np.array(np.unravel_index(np.argmax(self.distances_dpt), self.distances_dpt.shape))
else:
if self.iroot is not None:
tip_0 = np.argmax(self.distances_dpt[self.iroot])
else:
tip_0 = np.argmax(self.distances_dpt[0])
tips_all = np.array([tip_0, np.argmax(self.distances_dpt[tip_0])])
# we keep a list of the tips of each segment
segs_tips = [tips_all]
segs_connects = [[]]
segs_undecided = [True]
segs_adjacency = [[]]
logg.m(' do not consider groups with less than {} points for splitting'
.format(self.min_group_size))
for ibranch in range(self.n_branchings):
iseg, tips3 = self.select_segment(segs, segs_tips, segs_undecided)
if iseg == -1:
logg.m(' partitioning converged')
break
logg.m(' branching {}:'.format(ibranch + 1),
'split group', iseg) # [third start end]
# detect branching and update segs and segs_tips
self.detect_branching(segs, segs_tips,
segs_connects,
segs_undecided,
segs_adjacency, iseg, tips3)
# store as class members
self.segs = segs
self.segs_tips = segs_tips
self.segs_undecided = segs_undecided
# the following is a bit too much, but this allows easy storage
self.segs_adjacency = sp.sparse.lil_matrix((len(segs), len(segs)), dtype=float)
self.segs_connects = sp.sparse.lil_matrix((len(segs), len(segs)), dtype=int)
for i, seg_adjacency in enumerate(segs_adjacency):
self.segs_connects[i, seg_adjacency] = segs_connects[i]
for i in range(len(segs)):
for j in range(len(segs)):
self.segs_adjacency[i, j] = self.distances_dpt[self.segs_connects[i, j],
self.segs_connects[j, i]]
self.segs_adjacency = self.segs_adjacency.tocsr()
self.segs_connects = self.segs_connects.tocsr()
def check_adjacency(self):
n_edges_per_seg = np.sum(self.segs_adjacency > 0, axis=1).A1
for n_edges in range(1, np.max(n_edges_per_seg) + 1):
for iseg in range(self.segs_adjacency.shape[0]):
if n_edges_per_seg[iseg] == n_edges:
neighbor_segs = self.segs_adjacency[iseg].todense().A1
closest_points_other_segs = [seg[np.argmin(self.distances_dpt[self.segs_tips[iseg][0], seg])]
for seg in self.segs]
seg = self.segs[iseg]
closest_points_in_segs = [seg[np.argmin(self.distances_dpt[tips[0], seg])]
for tips in self.segs_tips]
distance_segs = [self.distances_dpt[closest_points_other_segs[ipoint], point]
for ipoint, point in enumerate(closest_points_in_segs)]
# exclude the first point, the segment itself
closest_segs = np.argsort(distance_segs)[1:n_edges+1]
# update adjacency matrix within the loop!
# self.segs_adjacency[iseg, neighbor_segs > 0] = 0
# self.segs_adjacency[iseg, closest_segs] = np.array(distance_segs)[closest_segs]
# self.segs_adjacency[neighbor_segs > 0, iseg] = 0
# self.segs_adjacency[closest_segs, iseg] = np.array(distance_segs)[closest_segs].reshape(len(closest_segs), 1)
# n_edges_per_seg = np.sum(self.segs_adjacency > 0, axis=1).A1
print(iseg, distance_segs, closest_segs)
# print(self.segs_adjacency)
# self.segs_adjacency.eliminate_zeros()
def select_segment(self, segs, segs_tips, segs_undecided):
"""Out of a list of line segments, choose segment that has the most
distant second data point.
Assume the distance matrix Ddiff is sorted according to seg_idcs.
Compute all the distances.
Returns
-------
iseg : int
Index identifying the position within the list of line segments.
tips3 : int
Positions of tips within chosen segment.
"""
scores_tips = np.zeros((len(segs), 4))
allindices = np.arange(self._adata.shape[0], dtype=int)
for iseg, seg in enumerate(segs):
# do not consider too small segments
if segs_tips[iseg][0] == -1: continue
# restrict distance matrix to points in segment
if not isinstance(self.distances_dpt, OnFlySymMatrix):
Dseg = self.distances_dpt[np.ix_(seg, seg)]
else:
Dseg = self.distances_dpt.restrict(seg)
third_maximizer = None
if segs_undecided[iseg]:
# check that none of our tips "connects" with a tip of the
# other segments
for jseg in range(len(segs)):
if jseg != iseg:
# take the inner tip, the "second tip" of the segment
for itip in range(2):
if (self.distances_dpt[segs_tips[jseg][1], segs_tips[iseg][itip]]
< 0.5 * self.distances_dpt[segs_tips[iseg][~itip], segs_tips[iseg][itip]]):
# logg.m(' group', iseg, 'with tip', segs_tips[iseg][itip],
# 'connects with', jseg, 'with tip', segs_tips[jseg][1], v=4)
# logg.m(' do not use the tip for "triangulation"', v=4)
third_maximizer = itip
# map the global position to the position within the segment
tips = [np.where(allindices[seg] == tip)[0][0]
for tip in segs_tips[iseg]]
# find the third point on the segment that has maximal
# added distance from the two tip points
dseg = Dseg[tips[0]] + Dseg[tips[1]]
if not np.isfinite(dseg).any():
continue
# add this point to tips, it's a third tip, we store it at the first
# position in an array called tips3
third_tip = np.argmax(dseg)
if third_maximizer is not None:
# find a fourth point that has maximal distance to all three
dseg += Dseg[third_tip]
fourth_tip = np.argmax(dseg)
if fourth_tip != tips[0] and fourth_tip != third_tip:
tips[1] = fourth_tip
dseg -= Dseg[tips[1]]
else:
dseg -= Dseg[third_tip]
tips3 = np.append(tips, third_tip)
# compute the score as ratio of the added distance to the third tip,
# to what it would be if it were on the straight line between the
# two first tips, given by Dseg[tips[:2]]
# if we did not normalize, there would be a danger of simply
# assigning the highest score to the longest segment
score = dseg[tips3[2]] / Dseg[tips3[0], tips3[1]]
score = len(seg) if self.choose_largest_segment else score # simply the number of points
logg.m(' group', iseg, 'score', score, 'n_points', len(seg),
'(too small)' if len(seg) < self.min_group_size else '', v=4)
if len(seg) <= self.min_group_size: score = 0
# write result
scores_tips[iseg, 0] = score
scores_tips[iseg, 1:] = tips3
iseg = np.argmax(scores_tips[:, 0])
if scores_tips[iseg, 0] == 0: return -1, None
tips3 = scores_tips[iseg, 1:].astype(int)
return iseg, tips3
def postprocess_segments(self):
"""Convert the format of the segment class members."""
# make segs a list of mask arrays, it's easier to store
# as there is a hdf5 equivalent
for iseg, seg in enumerate(self.segs):
mask = np.zeros(self._adata.shape[0], dtype=bool)
mask[seg] = True
self.segs[iseg] = mask
# convert to arrays
self.segs = np.array(self.segs)
self.segs_tips = np.array(self.segs_tips)
def set_segs_names(self):
"""Return a single array that stores integer segment labels."""
segs_names = np.zeros(self._adata.shape[0], dtype=np.int8)
self.segs_names_unique = []
for iseg, seg in enumerate(self.segs):
segs_names[seg] = iseg
self.segs_names_unique.append(iseg)
self.segs_names = segs_names
def order_pseudotime(self):
"""Define indices that reflect segment and pseudotime order.
Writes
------
indices : np.ndarray
Index array of shape n, which stores an ordering of the data points
with respect to increasing segment index and increasing pseudotime.
changepoints : np.ndarray
Index array of shape len(ssegs)-1, which stores the indices of
points where the segment index changes, with respect to the ordering
of indices.
"""
# within segs_tips, order tips according to pseudotime
if self.iroot is not None:
for itips, tips in enumerate(self.segs_tips):
if tips[0] != -1:
indices = np.argsort(self.pseudotime[tips])
self.segs_tips[itips] = self.segs_tips[itips][indices]
else:
logg.m(' group', itips, 'is very small', v=4)
# sort indices according to segments
indices = np.argsort(self.segs_names)
segs_names = self.segs_names[indices]
# find changepoints of segments
changepoints = np.arange(indices.size-1)[np.diff(segs_names) == 1] + 1
if self.iroot is not None:
pseudotime = self.pseudotime[indices]
for iseg, seg in enumerate(self.segs):
# only consider one segment, it's already ordered by segment
seg_sorted = seg[indices]
# consider the pseudotime on this segment and sort them
seg_indices = np.argsort(pseudotime[seg_sorted])
# within the segment, order indices according to increasing pseudotime
indices[seg_sorted] = indices[seg_sorted][seg_indices]
# define class members
self.indices = indices
self.changepoints = changepoints
def detect_branching(self, segs, segs_tips, segs_connects, segs_undecided, segs_adjacency,
iseg, tips3):
"""Detect branching on given segment.
Updates all list parameters inplace.
Call function _detect_branching and perform bookkeeping on segs and
segs_tips.
Parameters
----------
segs : list of np.ndarray
Dchosen distance matrix restricted to segment.
segs_tips : list of np.ndarray
Stores all tip points for the segments in segs.
iseg : int
Position of segment under study in segs.
tips3 : np.ndarray
The three tip points. They form a 'triangle' that contains the data.
"""
seg = segs[iseg]
# restrict distance matrix to points in segment
if not isinstance(self.distances_dpt, OnFlySymMatrix):
Dseg = self.distances_dpt[np.ix_(seg, seg)]
else:
Dseg = self.distances_dpt.restrict(seg)
# given the three tip points and the distance matrix detect the
# branching on the segment, return the list ssegs of segments that
# are defined by splitting this segment
result = self._detect_branching(Dseg, tips3, seg)
ssegs, ssegs_tips, ssegs_adjacency, ssegs_connects, trunk = result
# map back to global indices
for iseg_new, seg_new in enumerate(ssegs):
ssegs[iseg_new] = seg[seg_new]
ssegs_tips[iseg_new] = seg[ssegs_tips[iseg_new]]
ssegs_connects[iseg_new] = list(seg[ssegs_connects[iseg_new]])
# remove previous segment
segs.pop(iseg)
segs_tips.pop(iseg)
# insert trunk/undecided_cells at same position
segs.insert(iseg, ssegs[trunk])
segs_tips.insert(iseg, ssegs_tips[trunk])
# append other segments
segs += [seg for iseg, seg in enumerate(ssegs) if iseg != trunk]
segs_tips += [seg_tips for iseg, seg_tips in enumerate(ssegs_tips) if iseg != trunk]
if len(ssegs) == 4:
# insert undecided cells at same position
segs_undecided.pop(iseg)
segs_undecided.insert(iseg, True)
# correct edges in adjacency matrix
n_add = len(ssegs) - 1
prev_connecting_segments = segs_adjacency[iseg].copy()
if self.flavor == 'haghverdi16':
segs_adjacency += [[iseg] for i in range(n_add)]
segs_connects += [seg_connects for iseg, seg_connects in enumerate(ssegs_connects) if iseg != trunk]
prev_connecting_points = segs_connects[iseg]
for jseg_cnt, jseg in enumerate(prev_connecting_segments):
iseg_cnt = 0
for iseg_new, seg_new in enumerate(ssegs):
if iseg_new != trunk:
pos = segs_adjacency[jseg].index(iseg)
connection_to_iseg = segs_connects[jseg][pos]
if connection_to_iseg in seg_new:
kseg = len(segs) - n_add + iseg_cnt
segs_adjacency[jseg][pos] = kseg
pos_2 = segs_adjacency[iseg].index(jseg)
segs_adjacency[iseg].pop(pos_2)
idx = segs_connects[iseg].pop(pos_2)
segs_adjacency[kseg].append(jseg)
segs_connects[kseg].append(idx)
break
iseg_cnt += 1
segs_adjacency[iseg] += list(range(len(segs_adjacency) - n_add, len(segs_adjacency)))
segs_connects[iseg] += ssegs_connects[trunk]
else:
segs_adjacency += [[] for i in range(n_add)]
segs_connects += [[] for i in range(n_add)]
kseg_list = [iseg] + list(range(len(segs) - n_add, len(segs)))
for jseg in prev_connecting_segments:
pos = segs_adjacency[jseg].index(iseg)
distances = []
closest_points_in_jseg = []
closest_points_in_kseg = []
for kseg in kseg_list:
reference_point_in_k = segs_tips[kseg][0]
closest_points_in_jseg.append(segs[jseg][np.argmin(self.distances_dpt[reference_point_in_k, segs[jseg]])])
# do not use the tip in the large segment j, instead, use the closest point
reference_point_in_j = closest_points_in_jseg[-1] # segs_tips[jseg][0]
closest_points_in_kseg.append(segs[kseg][np.argmin(self.distances_dpt[reference_point_in_j, segs[kseg]])])
distances.append(self.distances_dpt[closest_points_in_jseg[-1], closest_points_in_kseg[-1]])
# print(jseg, '(', segs_tips[jseg][0], closest_points_in_jseg[-1], ')',
# kseg, '(', segs_tips[kseg][0], closest_points_in_kseg[-1], ') :', distances[-1])
idx = np.argmin(distances)
kseg_min = kseg_list[idx]
segs_adjacency[jseg][pos] = kseg_min
segs_connects[jseg][pos] = closest_points_in_kseg[idx]
pos_2 = segs_adjacency[iseg].index(jseg)
segs_adjacency[iseg].pop(pos_2)
segs_connects[iseg].pop(pos_2)
segs_adjacency[kseg_min].append(jseg)
segs_connects[kseg_min].append(closest_points_in_jseg[idx])
# if we split two clusters, we need to check whether the new segments connect to any of the other
# old segments
# if not, we add a link between the new segments, if yes, we add two links to connect them at the
# correct old segments
do_not_attach_kseg = False
for kseg in kseg_list:
distances = []
closest_points_in_jseg = []
closest_points_in_kseg = []
jseg_list = [jseg for jseg in range(len(segs))
if jseg != kseg and jseg not in prev_connecting_segments]
for jseg in jseg_list:
reference_point_in_k = segs_tips[kseg][0]
closest_points_in_jseg.append(segs[jseg][np.argmin(self.distances_dpt[reference_point_in_k, segs[jseg]])])
# do not use the tip in the large segment j, instead, use the closest point
reference_point_in_j = closest_points_in_jseg[-1] # segs_tips[jseg][0]
closest_points_in_kseg.append(segs[kseg][np.argmin(self.distances_dpt[reference_point_in_j, segs[kseg]])])
distances.append(self.distances_dpt[closest_points_in_jseg[-1], closest_points_in_kseg[-1]])
idx = np.argmin(distances)
jseg_min = jseg_list[idx]
if jseg_min not in kseg_list:
segs_adjacency_sparse = sp.sparse.lil_matrix((len(segs), len(segs)), dtype=float)
for i, seg_adjacency in enumerate(segs_adjacency):
segs_adjacency_sparse[i, seg_adjacency] = 1
G = nx.Graph(segs_adjacency_sparse)
paths_all = nx.single_source_dijkstra_path(G, source=kseg)
if jseg_min not in paths_all:
segs_adjacency[jseg_min].append(kseg)
segs_connects[jseg_min].append(closest_points_in_kseg[idx])
segs_adjacency[kseg].append(jseg_min)
segs_connects[kseg].append(closest_points_in_jseg[idx])
logg.m(' attaching new segment', kseg, 'at', jseg_min)
# if we split the cluster, we should not attach kseg
do_not_attach_kseg = True
else:
logg.m(' cannot attach new segment', kseg, 'at', jseg_min,
'(would produce cycle)')
if kseg != kseg_list[-1]:
logg.m(' continue')
continue
else:
logg.m(' do not add another link')
break
if jseg_min in kseg_list and not do_not_attach_kseg:
segs_adjacency[jseg_min].append(kseg)
segs_connects[jseg_min].append(closest_points_in_kseg[idx])
segs_adjacency[kseg].append(jseg_min)
segs_connects[kseg].append(closest_points_in_jseg[idx])
break
segs_undecided += [False for i in range(n_add)]
def _detect_branching(self, Dseg, tips, seg_reference=None):
"""Detect branching on given segment.
Call function __detect_branching three times for all three orderings of
tips. Points that do not belong to the same segment in all three
orderings are assigned to a fourth segment. The latter is, by Haghverdi
et al. (2016) referred to as 'undecided cells'.
Parameters
----------
Dseg : np.ndarray
Dchosen distance matrix restricted to segment.
tips : np.ndarray
The three tip points. They form a 'triangle' that contains the data.
Returns
-------
ssegs : list of np.ndarray
List of segments obtained from splitting the single segment defined
via the first two tip cells.
ssegs_tips : list of np.ndarray
List of tips of segments in ssegs.
"""
if self.flavor == 'haghverdi16':
ssegs = self._detect_branching_single_haghverdi16(Dseg, tips)
elif self.flavor == 'wolf17_tri':
ssegs = self._detect_branching_single_wolf17_tri(Dseg, tips)
elif self.flavor == 'wolf17_bi' or self.flavor == 'wolf17_bi_un':
ssegs = self._detect_branching_single_wolf17_bi(Dseg, tips)
else:
raise ValueError('`flavor` needs to be in {"haghverdi16", "wolf17_tri", "wolf17_bi"}.')
# make sure that each data point has a unique association with a segment
masks = np.zeros((len(ssegs), Dseg.shape[0]), dtype=bool)
for iseg, seg in enumerate(ssegs):
masks[iseg][seg] = True
nonunique = np.sum(masks, axis=0) > 1
ssegs = []
for iseg, mask in enumerate(masks):
mask[nonunique] = False
ssegs.append(np.arange(Dseg.shape[0], dtype=int)[mask])
# compute new tips within new segments
ssegs_tips = []
for inewseg, newseg in enumerate(ssegs):
if len(np.flatnonzero(newseg)) <= 1:
logg.warn('detected group with only {} cells'.format(np.flatnonzero(newseg)))
secondtip = newseg[np.argmax(Dseg[tips[inewseg]][newseg])]
ssegs_tips.append([tips[inewseg], secondtip])
undecided_cells = np.arange(Dseg.shape[0], dtype=int)[nonunique]
if len(undecided_cells) > 0:
ssegs.append(undecided_cells)
# establish the connecting points with the other segments
ssegs_connects = [[], [], [], []]
for inewseg, newseg_tips in enumerate(ssegs_tips):
reference_point = newseg_tips[0]
# closest cell to the new segment within undecided cells
closest_cell = undecided_cells[np.argmin(Dseg[reference_point][undecided_cells])]
ssegs_connects[inewseg].append(closest_cell)
# closest cell to the undecided cells within new segment
closest_cell = ssegs[inewseg][np.argmin(Dseg[closest_cell][ssegs[inewseg]])]
ssegs_connects[-1].append(closest_cell)
# also compute tips for the undecided cells
tip_0 = undecided_cells[np.argmax(Dseg[undecided_cells[0]][undecided_cells])]
tip_1 = undecided_cells[np.argmax(Dseg[tip_0][undecided_cells])]
ssegs_tips.append([tip_0, tip_1])
ssegs_adjacency = [[3], [3], [3], [0, 1, 2]]
trunk = 3
elif len(ssegs) == 3:
reference_point = np.zeros(3, dtype=int)
reference_point[0] = ssegs_tips[0][0]
reference_point[1] = ssegs_tips[1][0]
reference_point[2] = ssegs_tips[2][0]
closest_points = np.zeros((3, 3), dtype=int)
# this is another strategy than for the undecided_cells
# here it's possible to use the more symmetric procedure
# shouldn't make much of a difference
closest_points[0, 1] = ssegs[1][np.argmin(Dseg[reference_point[0]][ssegs[1]])]
closest_points[1, 0] = ssegs[0][np.argmin(Dseg[reference_point[1]][ssegs[0]])]
closest_points[0, 2] = ssegs[2][np.argmin(Dseg[reference_point[0]][ssegs[2]])]
closest_points[2, 0] = ssegs[0][np.argmin(Dseg[reference_point[2]][ssegs[0]])]
closest_points[1, 2] = ssegs[2][np.argmin(Dseg[reference_point[1]][ssegs[2]])]
closest_points[2, 1] = ssegs[1][np.argmin(Dseg[reference_point[2]][ssegs[1]])]
added_dist = np.zeros(3)
added_dist[0] = Dseg[closest_points[1, 0], closest_points[0, 1]] + Dseg[closest_points[2, 0], closest_points[0, 2]]
added_dist[1] = Dseg[closest_points[0, 1], closest_points[1, 0]] + Dseg[closest_points[2, 1], closest_points[1, 2]]
added_dist[2] = Dseg[closest_points[1, 2], closest_points[2, 1]] + Dseg[closest_points[0, 2], closest_points[2, 0]]
trunk = np.argmin(added_dist)
ssegs_adjacency = [[trunk] if i != trunk else
[j for j in range(3) if j != trunk]
for i in range(3)]
ssegs_connects = [[closest_points[i, trunk]] if i != trunk else
[closest_points[trunk, j] for j in range(3) if j != trunk]
for i in range(3)]
else:
trunk = 0
ssegs_adjacency = [[1], [0]]
reference_point_in_0 = ssegs_tips[0][0]
closest_point_in_1 = ssegs[1][np.argmin(Dseg[reference_point_in_0][ssegs[1]])]
reference_point_in_1 = closest_point_in_1 # ssegs_tips[1][0]
closest_point_in_0 = ssegs[0][np.argmin(Dseg[reference_point_in_1][ssegs[0]])]
ssegs_connects = [[closest_point_in_1], [closest_point_in_0]]
return ssegs, ssegs_tips, ssegs_adjacency, ssegs_connects, trunk
def _detect_branching_single_haghverdi16(self, Dseg, tips):
"""Detect branching on given segment.
"""
# compute branchings using different starting points the first index of
# tips is the starting point for the other two, the order does not
# matter
ssegs = []
# permutations of tip cells
ps = [[0, 1, 2], # start by computing distances from the first tip
[1, 2, 0], # -"- second tip
[2, 0, 1]] # -"- third tip
for i, p in enumerate(ps):
ssegs.append(self.__detect_branching_haghverdi16(Dseg, tips[p]))
return ssegs
def _detect_branching_single_wolf17_tri(self, Dseg, tips):
# all pairwise distances
dist_from_0 = Dseg[tips[0]]
dist_from_1 = Dseg[tips[1]]
dist_from_2 = Dseg[tips[2]]
closer_to_0_than_to_1 = dist_from_0 < dist_from_1
closer_to_0_than_to_2 = dist_from_0 < dist_from_2
closer_to_1_than_to_2 = dist_from_1 < dist_from_2
masks = np.zeros((2, Dseg.shape[0]), dtype=bool)
masks[0] = closer_to_0_than_to_1
masks[1] = closer_to_0_than_to_2
segment_0 = np.sum(masks, axis=0) == 2
masks = np.zeros((2, Dseg.shape[0]), dtype=bool)
masks[0] = ~closer_to_0_than_to_1
masks[1] = closer_to_1_than_to_2
segment_1 = np.sum(masks, axis=0) == 2
masks = np.zeros((2, Dseg.shape[0]), dtype=bool)
masks[0] = ~closer_to_0_than_to_2
masks[1] = ~closer_to_1_than_to_2
segment_2 = np.sum(masks, axis=0) == 2
ssegs = [segment_0, segment_1, segment_2]
return ssegs
def _detect_branching_single_wolf17_bi(self, Dseg, tips):
dist_from_0 = Dseg[tips[0]]
dist_from_1 = Dseg[tips[1]]
closer_to_0_than_to_1 = dist_from_0 < dist_from_1
ssegs = [closer_to_0_than_to_1, ~closer_to_0_than_to_1]
return ssegs
def __detect_branching_haghverdi16(self, Dseg, tips):
"""Detect branching on given segment.
Compute point that maximizes kendall tau correlation of the sequences of
distances to the second and the third tip, respectively, when 'moving
away' from the first tip: tips[0]. 'Moving away' means moving in the
direction of increasing distance from the first tip.
Parameters
----------
Dseg : np.ndarray
Dchosen distance matrix restricted to segment.
tips : np.ndarray
The three tip points. They form a 'triangle' that contains the data.
Returns
-------
ssegs : list of np.ndarray
List of segments obtained from "splitting away the first tip cell".
"""
# sort distance from first tip point
# then the sequence of distances Dseg[tips[0]][idcs] increases
idcs = np.argsort(Dseg[tips[0]])
# consider now the sequence of distances from the other
# two tip points, which only increase when being close to `tips[0]`
# where they become correlated
# at the point where this happens, we define a branching point
if True:
imax = self.kendall_tau_split(Dseg[tips[1]][idcs],
Dseg[tips[2]][idcs])
if False:
# if we were in euclidian space, the following should work
# as well, but here, it doesn't because the scales in Dseg are
# highly different, one would need to write the following equation
# in terms of an ordering, such as exploited by the kendall
# correlation method above
imax = np.argmin(Dseg[tips[0]][idcs]
+ Dseg[tips[1]][idcs]
+ Dseg[tips[2]][idcs])
# init list to store new segments
ssegs = []
# first new segment: all points until, but excluding the branching point
# increasing the following slightly from imax is a more conservative choice
# as the criterion based on normalized distances, which follows below,
# is less stable
if imax > 0.95 * len(idcs) and self.allow_kendall_tau_shift:
# if "everything" is correlated (very large value of imax), a more
# conservative choice amounts to reducing this
logg.warn('shifting branching point away from maximal kendall-tau correlation (suppress this with `allow_kendall_tau_shift=False`)')
ibranch = int(0.95 * imax)
else:
# otherwise, a more conservative choice is the following
ibranch = imax + 1
return idcs[:ibranch]
def kendall_tau_split(self, a, b):
"""Return splitting index that maximizes correlation in the sequences.
Compute difference in Kendall tau for all splitted sequences.
For each splitting index i, compute the difference of the two
correlation measures kendalltau(a[:i], b[:i]) and
kendalltau(a[i:], b[i:]).
Returns the splitting index that maximizes
kendalltau(a[:i], b[:i]) - kendalltau(a[i:], b[i:])
Parameters
----------
a, b : np.ndarray
One dimensional sequences.
Returns
-------
i : int
Splitting index according to above description.
"""
if a.size != b.size:
raise ValueError('a and b need to have the same size')
if a.ndim != b.ndim != 1:
raise ValueError('a and b need to be one-dimensional arrays')
import scipy as sp
min_length = 5
n = a.size
idx_range = np.arange(min_length, a.size-min_length-1, dtype=int)
corr_coeff = np.zeros(idx_range.size)
pos_old = sp.stats.kendalltau(a[:min_length], b[:min_length])[0]
neg_old = sp.stats.kendalltau(a[min_length:], b[min_length:])[0]
for ii, i in enumerate(idx_range):
if True:
# compute differences in concordance when adding a[i] and b[i]
# to the first subsequence, and removing these elements from
# the second subsequence
diff_pos, diff_neg = self._kendall_tau_diff(a, b, i)
pos = pos_old + self._kendall_tau_add(i, diff_pos, pos_old)
neg = neg_old + self._kendall_tau_subtract(n-i, diff_neg, neg_old)
pos_old = pos
neg_old = neg
if False:
# computation using sp.stats.kendalltau, takes much longer!
# just for debugging purposes
pos = sp.stats.kendalltau(a[:i+1], b[:i+1])[0]
neg = sp.stats.kendalltau(a[i+1:], b[i+1:])[0]
if False:
# the following is much slower than using sp.stats.kendalltau,
# it is only good for debugging because it allows to compute the
# tau-a version, which does not account for ties, whereas
# sp.stats.kendalltau computes tau-b version, which accounts for
# ties
pos = sp.stats.mstats.kendalltau(a[:i], b[:i], use_ties=False)[0]
neg = sp.stats.mstats.kendalltau(a[i:], b[i:], use_ties=False)[0]
corr_coeff[ii] = pos - neg
iimax = np.argmax(corr_coeff)
imax = min_length + iimax
corr_coeff_max = corr_coeff[iimax]
if corr_coeff_max < 0.3:
logg.m(' is root itself, never obtain significant correlation', v=4)
return imax
def _kendall_tau_add(self, len_old, diff_pos, tau_old):
"""Compute Kendall tau delta.
The new sequence has length len_old + 1.
Parameters
----------
len_old : int
The length of the old sequence, used to compute tau_old.
diff_pos : int
Difference between concordant and non-concordant pairs.
tau_old : float
Kendall rank correlation of the old sequence.
"""
return 2./(len_old+1)*(float(diff_pos)/len_old-tau_old)
def _kendall_tau_subtract(self, len_old, diff_neg, tau_old):
"""Compute Kendall tau delta.
The new sequence has length len_old - 1.
Parameters
----------
len_old : int
The length of the old sequence, used to compute tau_old.
diff_neg : int
Difference between concordant and non-concordant pairs.
tau_old : float
Kendall rank correlation of the old sequence.
"""
return 2./(len_old-2)*(-float(diff_neg)/(len_old-1)+tau_old)
def _kendall_tau_diff(self, a, b, i):
"""Compute difference in concordance of pairs in split sequences.
Consider splitting a and b at index i.
Parameters
----------
a, b : np.ndarray
Returns
-------
diff_pos, diff_neg : int, int
Difference between concordant and non-concordant pairs for both
subsequences.
"""
# compute ordering relation of the single points a[i] and b[i]
# with all previous points of the sequences a and b, respectively
a_pos = np.zeros(a[:i].size, dtype=int)
a_pos[a[:i] > a[i]] = 1
a_pos[a[:i] < a[i]] = -1
b_pos = np.zeros(b[:i].size, dtype=int)
b_pos[b[:i] > b[i]] = 1
b_pos[b[:i] < b[i]] = -1
diff_pos = np.dot(a_pos, b_pos).astype(float)
# compute ordering relation of the single points a[i] and b[i]
# with all later points of the sequences
a_neg = np.zeros(a[i:].size, dtype=int)
a_neg[a[i:] > a[i]] = 1
a_neg[a[i:] < a[i]] = -1
b_neg = np.zeros(b[i:].size, dtype=int)
b_neg[b[i:] > b[i]] = 1
b_neg[b[i:] < b[i]] = -1
diff_neg = np.dot(a_neg, b_neg)
return diff_pos, diff_neg