# To-do list
1. **(Done)** Test running time (only keep the faster one)
2. **(Done)** Adjust how weight is calculated

In [2]:
import numpy as np

# Functions

In [3]:
def get_omega(input_string):
    """
    Return the set of all distinct, non-empty, contiguous substrings of input_string.

    An empty input yields an empty set.
    """
    substrings = set()
    end = len(input_string)
    for start in range(end):
        # stop is exclusive, so it runs one past the last included index.
        for stop in range(start + 1, end + 1):
            substrings.add(input_string[start:stop])
    return substrings


def get_weightedJac(R1, R2, get_weight=None):
    """
    Calculate the (weighted) Jaccard distance between two formatted medical histories.

    Each history is expanded with get_omega() into its set of subsequences. The
    distance is

        1 - (total weight of shared subsequences) / (total weight of all subsequences)

    Parameters
    ----------
    R1, R2
        The two histories; each is passed unchanged to get_omega().
    get_weight : callable, optional
        Maps a subsequence to its numeric weight. When omitted, every
        subsequence has weight 1, which gives the unweighted Jaccard distance.

    Returns
    -------
    float
        The weighted Jaccard distance. If the total weight of the union is
        zero (for example when both histories are empty), 0.0 is returned
        instead of raising ZeroDivisionError: with no weighted subsequence
        on either side there is nothing that tells the histories apart.
    """
    if get_weight is None:
        get_weight = lambda seq: 1

    O1 = get_omega(R1)
    O2 = get_omega(R2)
    union = O1.union(O2)
    intersection = O1.intersection(O2)

    # Single pass over the union so get_weight is called once per subsequence.
    weight_union = 0
    weight_intersection = 0
    for seq in union:
        weight = get_weight(seq)
        weight_union += weight
        if seq in intersection:
            weight_intersection += weight

    # Guard the division: an empty union (or all-zero weights) has no defined ratio.
    if weight_union == 0:
        return 0.0

    return 1.0 - float(weight_intersection) / float(weight_union)

# Tests

## Test 1

In [72]:
# Two histories of length 5 that differ only in the position of the 'D' marker.
R1 = '0D000' # patient 1 : T1D in year 2
R2 = '00D00' # patient 2 : T1D in year 3
# No get_weight is passed, so every subsequence counts with weight 1.
# 8 of the 14 distinct subsequences are shared, so the distance is 1 - 8/14.
print(get_weightedJac(R1, R2))

0.4285714285714286


## Test 2: with get_weight function

In [77]:
# Two 32-character sequences made up of the letters A, T, C and G.
R1 = 'ATCGTCTTACTCCAGCATGCTGATCTGATGCC'
R2 = 'ACTCCAACTGTCCATGAGCTGATCTGCTTCGT'

def get_weight(seq):
    """
    Return the weight of seq: the sum of its per-letter weights.

    Letters are weighted A=1, T=2, C=3, G=4. An empty seq has weight 0;
    any other letter raises KeyError.
    """
    letter_weights = {'A': 1, 'T': 2, 'C': 3, 'G': 4}
    return sum(letter_weights[letter] for letter in seq)

# Weighted distance: each subsequence is weighted by get_weight (the sum of its letter weights).
get_weightedJac(R1, R2, get_weight)

0.9786506373717484

## Test 3: Calculate running time

In [78]:
import time

R1 = 'ATCGTCTTACTCCAGCAATCGTCTTACTCCAGCATGCTGATCTGATGCCTGCTGATCTGATGCC'
R2 = 'ACTCCAACTGTCCATGAGCACTCCAACTGTCCATGAGCTGATCTGCTTCGTTGATCTGCTTCGT'

num_runs = 1000

# time.clock() was removed in Python 3.8. time.perf_counter() is the
# high-resolution clock meant for timing short code sections; it measures
# elapsed wall-clock time (use time.process_time() if CPU time is wanted).
# The timer starts right before the loop so only the calls are measured.
start_time = time.perf_counter()
for i in range(num_runs):
    get_weightedJac(R1, R2)

print('{:.3f} seconds'.format(time.perf_counter() - start_time))

2.130 seconds
