In [1]:
import numpy as np

In [3]:
def get_omega(input_string):
    """
    Return the set of all distinct non-empty contiguous slices of `input_string`.

    An empty input yields an empty set.
    """
    n = len(input_string)
    substrings = set()
    for start in range(n):
        # `stop` is exclusive, so it runs one past the last index.
        for stop in range(start + 1, n + 1):
            substrings.add(input_string[start:stop])
    return substrings


def get_weightedJac(R1, R2, get_weight=None):
    """
    Calculate the (weighted) Jaccard distance between two formatted medical histories.

    Each history is expanded into its set of substrings with ``get_omega``.
    The distance is ``1 - w(O1 & O2) / w(O1 | O2)``, where ``w`` sums
    ``get_weight`` over the substrings of a set.

    Parameters
    ----------
    R1, R2 : sequence
        The two histories to compare (digit strings in this notebook).
    get_weight : callable, optional
        Maps a substring to a numeric weight. When omitted every substring
        weighs 1, which gives the plain Jaccard distance.

    Returns
    -------
    float
        The distance. If the total weight of the union is zero (for example
        both histories are empty) the ratio is undefined; 0.0 is returned
        instead of raising ``ZeroDivisionError``.
    """
    if get_weight is None:
        get_weight = lambda seq: 1

    O1 = get_omega(R1)
    O2 = get_omega(R2)

    weight_union = 0
    weight_intersection = 0
    # Evaluate get_weight exactly once per distinct substring.
    for seq in O1 | O2:
        weight = get_weight(seq)
        weight_union += weight
        if seq in O1 and seq in O2:
            weight_intersection += weight

    # Guard the 0/0 case: nothing (or nothing with weight) to compare.
    if weight_union == 0:
        return 0.0
    return 1.0 - float(weight_intersection) / float(weight_union)



In [43]:
def get_weightedJac_display(R1, R2, get_weight=None):
    """
    Print the substring sets of two histories, then return their (weighted)
    Jaccard distance.

    The substring sets of `R1` and `R2` (from ``get_omega``), their union and
    their intersection are printed for inspection. The distance itself is
    computed by ``get_weightedJac`` so the two functions cannot drift apart.

    Parameters
    ----------
    R1, R2 : sequence
        The two histories to compare (digit strings in this notebook).
    get_weight : callable, optional
        Passed through unchanged to ``get_weightedJac``.

    Returns
    -------
    float
        Whatever ``get_weightedJac(R1, R2, get_weight)`` returns.
    """
    O1 = get_omega(R1)
    O2 = get_omega(R2)

    print('O1: {}'.format(O1))
    print('O2: {}'.format(O2))
    print('Union: {}'.format(O1.union(O2)))
    print('Intersection: {}'.format(O1.intersection(O2)))

    # Single source of truth for the distance arithmetic.
    return get_weightedJac(R1, R2, get_weight)

# Generate random histories

In [14]:
# Simulation size: how many histories to generate and how long each one is.
num_histories = 100
length = 50  # all histories are assumed to have the same length

## The no-brainer way
With this approach I will probably give a man breast cancer or so... (or a woman prostate cancer, which is even less likely).

**Please help me come up with more sensible ways of generating faux medical histories.**

In [54]:
# Number of distinct codes that can appear in a history.
# (Name kept as spelled in case other cells reference it.)
num_diesease = 10

# Unnormalised relative frequency of each code; code 0 is by far the most
# likely (presumably "no event" -- TODO confirm).
p = np.array([50, 1, .5, .5, .5, .2, .2, .1, .1, 0.05])
assert len(p) == num_diesease, 'need exactly one probability per code'
p = p / p.sum()  # normalise so the probabilities sum to 1

# Draw every entry independently. A fixed seed on a private RandomState keeps
# the generated histories reproducible across kernel restarts without
# touching numpy's global random state.
matrix = np.random.RandomState(seed=0).choice(
    num_diesease, size=(num_histories, length), replace=True, p=p)

In [63]:
# Turn each row of `matrix` into one string by concatenating the string form
# of its entries, and echo the result with its index.
histories = [''.join(str(code) for code in row) for row in matrix]
for idx, history in enumerate(histories):
    print('{}:\t{}'.format(idx, history))

# Symmetric matrix of pairwise distances. It starts at 1 everywhere and only
# the off-diagonal entries are overwritten, so the diagonal stays at 1
# (presumably so that a later argmin does not pick a history paired with itself).
Jaccard = np.ones((num_histories, num_histories))
for i in range(num_histories):
    for j in range(i + 1, num_histories):
        jac = get_weightedJac(histories[i], histories[j])
        print('{}, {}:\t{}'.format(i, j, jac))
        Jaccard[i, j] = Jaccard[j, i] = jac

0:	00000000000100000100000000000000000207000000000000
1:	00000000000000040600000000000000300000000000000000
2:	00000000000000000000000000000004100000000000000000
3:	00000030000000000000000000000040000000000000000000
4:	00000000000000000000000000000000000000000000000000
5:	00400000000000000000000010000000000000050000000000
6:	00000000001000000000610000100000600000000000000000
7:	00000000030000000402000000000000000000050000000001
8:	10070000000000000030000200000900000000000003000000
9:	00000000000000000000010000000200000000000001000000
10:	01000000000000000000000000000000000000000000000000
11:	10000000000000000000000000000000000000000000000050
12:	00002000002000000000000000000000000000003000000000
13:	00000000000000000200000000100000400000000000600000
14:	00000000000000000000000000700000000000000000000000
15:	00000000000000000000004030000000000001000010000000
16:	00000001000000000000000000000000000040000001000000
17:	00000000000000000000000000000100000000000000000003
18:	0000000000000000

1, 55:	0.9743589743589743
1, 56:	0.9717544812601847
1, 57:	0.9822222222222222
1, 58:	0.9363762102351314
1, 59:	0.987369985141159
1, 60:	0.9410919540229885
1, 61:	0.8619139370584458
1, 62:	0.9891165172855314
1, 63:	0.9597197898423818
1, 64:	0.965625
1, 65:	0.9646494907130018
1, 66:	0.96639231824417
1, 67:	0.9767441860465116
1, 68:	0.9816971713810316
1, 69:	0.980110497237569
1, 70:	0.9827586206896551
1, 71:	0.9864572047670639
1, 72:	0.9608091024020228
1, 73:	0.9878744650499287
1, 74:	0.954954954954955
1, 75:	0.9901734104046243
1, 76:	0.9888304862023654
1, 77:	0.9681611435997401
1, 78:	0.9878744650499287
1, 79:	0.9892063492063492
1, 80:	0.9818491208167895
1, 81:	0.9903573454339194
1, 82:	0.9902578796561604
1, 83:	0.9764383561643836
1, 84:	0.991878722252301
1, 85:	0.9820295983086681
1, 86:	0.8707829408020369
1, 87:	0.9902411021814007
1, 88:	0.9838032393521295
1, 89:	0.9878744650499287
1, 90:	0.9135802469135803
1, 91:	0.9510800508259212
1, 92:	0.9596907785753728
1, 93:	0.9708680142687277
1,

11, 65:	0.9703779366700716
11, 66:	0.9548472775564409
11, 67:	0.9603340292275574
11, 68:	0.9808219178082191
11, 69:	0.9660238751147842
11, 70:	0.977296181630547
11, 71:	0.9517304189435337
11, 72:	0.9367088607594937
11, 73:	0.902668759811617
11, 74:	0.9161793372319689
11, 75:	0.9760717846460618
11, 76:	0.9349804941482445
11, 77:	0.970344009489917
11, 78:	0.9229583975346687
11, 79:	0.9003783102143758
11, 80:	0.9583333333333334
11, 81:	0.9556650246305418
11, 82:	0.9794319294809011
11, 83:	0.94967978042086
11, 84:	0.9739442946990117
11, 85:	0.7602040816326531
11, 86:	0.9786821705426356
11, 87:	0.95995995995996
11, 88:	0.9758149316508938
11, 89:	0.9229583975346687
11, 90:	0.9499323410013532
11, 91:	0.9689922480620154
11, 92:	0.9708222811671088
11, 93:	0.9756838905775076
11, 94:	0.9680170575692963
11, 95:	0.9639564124057
11, 96:	0.9281961471103327
11, 97:	0.9588014981273408
11, 98:	0.9706994328922496
11, 99:	0.7602040816326531
12, 13:	0.9583577712609971
12, 14:	0.981651376146789
12, 15:	0.97

24, 32:	0.9842209072978304
24, 33:	0.9856716417910448
24, 34:	0.9737800436999272
24, 35:	0.9805982215036378
24, 36:	0.9583604424202993
24, 37:	0.9842622950819672
24, 38:	0.9882075471698113
24, 39:	0.984179301252472
24, 40:	0.9554579673776662
24, 41:	0.9872832369942196
24, 42:	0.9717314487632509
24, 43:	0.9545454545454546
24, 44:	0.9772727272727273
24, 45:	0.9822090437361009
24, 46:	0.9836623553437712
24, 47:	0.976303317535545
24, 48:	0.8798955613577023
24, 49:	0.9190600522193212
24, 50:	0.9840637450199203
24, 51:	0.9735449735449735
24, 52:	0.9539473684210527
24, 53:	0.8698149329929802
24, 54:	0.9687131050767415
24, 55:	0.975070821529745
24, 56:	0.9612314709236032
24, 57:	0.926873857404022
24, 58:	0.9690794096978215
24, 59:	0.9810725552050473
24, 60:	0.8926598263614838
24, 61:	0.9589963280293757
24, 62:	0.9838274932614556
24, 63:	0.9560707748627212
24, 64:	0.9523809523809523
24, 65:	0.9701678060907396
24, 66:	0.7641996557659209
24, 67:	0.9851943244910549
24, 68:	0.9543838862559242
24, 6

32, 72:	0.8681925808997633
32, 73:	0.9728122344944775
32, 74:	0.9569471624266145
32, 75:	0.984137475214805
32, 76:	0.7532833020637899
32, 77:	0.9815225424981523
32, 78:	0.7219873150105709
32, 79:	0.9732540861812778
32, 80:	0.9136394790952708
32, 81:	0.8959212376933896
32, 82:	0.9862834748530372
32, 83:	0.9378603459320948
32, 84:	0.9457562220804084
32, 85:	0.955617198335645
32, 86:	0.9857328145265889
32, 87:	0.9666444296197465
32, 88:	0.962962962962963
32, 89:	0.7219873150105709
32, 90:	0.9744816586921851
32, 91:	0.9801840056617127
32, 92:	0.9885196374622357
32, 93:	0.9839679358717435
32, 94:	0.974323386537127
32, 95:	0.9644128113879004
32, 96:	0.9706422018348624
32, 97:	0.9637625289128758
32, 98:	0.9268456375838926
32, 99:	0.955617198335645
33, 34:	0.973323720259553
33, 35:	0.8130232558139535
33, 36:	0.9849246231155779
33, 37:	0.8633720930232558
33, 38:	0.9883109292811222
33, 39:	0.9843342036553525
33, 40:	0.9278876834716018
33, 41:	0.9873925501432664
33, 42:	0.9792207792207792
33, 43:

47, 78:	0.7958477508650519
47, 79:	0.9730971128608924
47, 80:	0.7575544624033732
47, 81:	0.7062730627306273
47, 82:	0.9883381924198251
47, 83:	0.9074183976261128
47, 84:	0.9077380952380952
47, 85:	0.9781659388646288
47, 86:	0.9765670767428236
47, 87:	0.9616306954436451
47, 88:	0.9561678146524734
47, 89:	0.7958477508650519
47, 90:	0.9720083974807557
47, 91:	0.9824561403508771
47, 92:	0.9785478547854786
47, 93:	0.9881235154394299
47, 94:	0.9708798017348204
47, 95:	0.9366041896361632
47, 96:	0.9700078926598263
47, 97:	0.9765415549597856
47, 98:	0.879245283018868
47, 99:	0.9781659388646288
48, 49:	0.9165596919127086
48, 50:	0.9655629139072848
48, 51:	0.9650756693830035
48, 52:	0.9473995271867612
48, 53:	0.9079345850999394
48, 54:	0.9296028880866426
48, 55:	0.8862673926194797
48, 56:	0.9495750708215297
48, 57:	0.9060402684563759
48, 58:	0.9676308539944903
48, 59:	0.815742397137746
48, 60:	0.9003831417624522
48, 61:	0.9889148191365228
48, 62:	0.9855072463768116
48, 63:	0.9704666272888364
48,

In [61]:
# Position of the smallest entry of the distance matrix. With no axis given,
# the index refers to the flattened array; np.unravel_index(idx, Jaccard.shape)
# converts it to a (row, column) pair.
np.argmin(Jaccard)

418

In [57]:
# Show the substring sets behind the distance for the first two generated histories.
get_weightedJac_display(histories[0], histories[1])

O1: {'000000001000001000000000000000002', '000000000000020700000000', '0000000001000001000000', '0000010000010000000000000000020700', '00000010000010', '00000000020700000000', '0000010000010000', '0000000100000100000', '00001000000000000000002070000000000', '000100000000000000000', '000000000000207', '001000001', '00100', '000100000000000', '0000000000000000020700', '0002', '000000', '1', '1000000000000000002070000000', '700000000000', '000000020700', '000001000001000000000000000002070', '00100000000000000000207000000000', '0000000000100000100', '0000100000100000000', '000000000000', '00100000000000000000207', '001000001000000000000000002', '1000001000000000000000002', '2', '0000000000207000000000000', '000100000', '00000000001000', '000000100000100', '0020', '000010000010000000000000000020700000000', '00000000100000100000000', '0000000000000020700000000', '0000000001000001000000000000', '010000000000000000020700000000', '000000000001000001000', '000000100000100000000000000000207000000

0.9907558455682436