# Solution to exercise

## Step 1: Load all the data

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import edlib

In [None]:
# File names of the three JSON inputs used by this notebook.
signals_file = "signals.json"
sequences_file = "sequences.json"
kmers_table = "kmer_table.json"

In [None]:
# Load the three JSON inputs named in the previous cell. Each file is opened
# in a ``with`` block so its handle is closed as soon as the contents have
# been parsed, and the encoding is pinned so the result does not depend on the
# platform default.
with open(signals_file, encoding="utf-8") as f:
    signals = json.load(f)

with open(sequences_file, encoding="utf-8") as f:
    sequences = json.load(f)

with open(kmers_table, encoding="utf-8") as f:
    kmer_table = json.load(f)

## Step 2: Write a transition detector

In [None]:
def transition_detector(signal, window_size, threshold):
    """Locate event boundaries in ``signal`` from a sliding-window range.

    For every window of ``window_size`` consecutive samples the range
    (max - min) is computed. A boundary is reported wherever the range rises
    from below ``threshold`` to above it between two consecutive windows.

    Args:
        signal: Sequence of numeric samples (must support ``len`` and slicing).
        window_size: Number of samples per window; must be at least 1.
        threshold: Range value separating flat windows from windows that
            contain a transition.

    Returns:
        A tuple ``(events, ranges)``. ``events`` is a list of sample indices
        that always starts with 0 and ends with ``len(signal)``; each detected
        boundary is the index of the triggering window plus
        ``window_size // 2``. ``ranges`` holds the range of every window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    ranges = []
    for start in range(len(signal) - window_size + 1):
        window = signal[start:start + window_size]
        ranges.append(max(window) - min(window))

    half_window = window_size // 2
    events = [0]
    # Start at 1: window 0 has no predecessor, and ``ranges[-1]`` would wrap
    # around and compare it against the last window instead.
    for i in range(1, len(ranges)):
        if ranges[i - 1] < threshold and ranges[i] > threshold:
            events.append(i + half_window)

    events.append(len(signal))
    return events, ranges

# Run the detector on the first signal and inspect the result.
signal = signals[0]
events, ranges = transition_detector(signal, window_size=3, threshold=0.03)

print("Event indices:")
print(events)

# Plot the window ranges, then the raw signal, each overlaid with a red
# vertical line per detected boundary and the x-axis limited to 0..100.
for trace in (ranges, signal):
    plt.plot(trace)
    for boundary in events:
        plt.axvline(boundary, color='r')
    plt.xlim([0, 100])
    plt.show()
    


## Step 3: Establish signal level for each event

In [None]:
def extract_event_means(signal, events):
    """Return the mean of ``signal`` over each segment delimited by ``events``.

    Segment ``i`` ends just before ``events[i + 1]``. The first segment always
    starts at index 0; every later segment starts one sample past
    ``events[i]``, so the sample at the boundary index itself is left out.

    Args:
        signal: Sequence of numeric samples.
        events: Sequence of boundary indices.

    Returns:
        A list with ``len(events) - 1`` means (empty when ``events`` has fewer
        than two entries).
    """
    # Interior boundaries are shifted by one; the very first start is pinned to 0.
    starts = [0] + [boundary + 1 for boundary in events[1:-1]]
    stops = events[1:]
    return [np.mean(signal[lo:hi]) for lo, hi in zip(starts, stops)]

In [None]:
# Mean signal level of every segment between consecutive detected boundaries.
means = extract_event_means(signal, events)

In [None]:
# Overlay the raw signal with the piecewise-constant event means. Both traces
# start at sample 0 so they stay aligned; ``np.diff(events)`` gives the number
# of samples each mean is repeated for.
plt.plot(signal[:200])
plt.plot(np.repeat(means, np.diff(events))[:200])
plt.show()

## Step 4: Establish distance metric between event and each kmer

In [None]:
# Fix an ordering of the k-mers; iterating a dict yields its keys.
kmer_keys = list(kmer_table)

In [None]:
# Table values as an array, in the same order as kmer_keys.
kmer_levels = np.array([kmer_table[key] for key in kmer_keys])

In [None]:
def euclidian_distance(x, y):
    """Return the element-wise Euclidean distance between ``x`` and ``y``.

    For one-dimensional values the Euclidean distance ``sqrt((x - y)**2)`` is
    simply ``|x - y|``. Computing it directly avoids the intermediate square,
    which overflows for very large differences and underflows to zero for
    very small ones.

    Args:
        x: Scalar or array-like of numbers.
        y: Scalar or array-like of numbers, broadcast against ``x``.

    Returns:
        A NumPy scalar or array holding ``|x - y|`` for every broadcast pair.
    """
    # np.subtract (rather than ``-``) also accepts plain Python lists.
    return np.abs(np.subtract(x, y))

In [None]:
# Sanity check: distance from an example level of 0.5 to every k-mer level.
euclidian_distance(0.5, kmer_levels)

In [None]:
# Zero-initialised distance matrix: one row per k-mer, one column per event
# mean. It is filled in column by column in the next cell.
emission_distances = np.zeros((len(kmer_keys), len(means)))

In [None]:
# Column j receives the distance from event mean j to every k-mer level.
for event_idx, event_mean in enumerate(means):
    emission_distances[:, event_idx] = euclidian_distance(event_mean, kmer_levels)

In [None]:
# Plot the emission distances as a heatmap (rows: k-mers, columns: events).

In [None]:
# Heatmap of the emission distances. One y tick is placed per k-mer row, so
# the tick count follows the size of the k-mer table instead of assuming 16.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))
ax.imshow(emission_distances, cmap='hot')
ax.set_yticks(range(len(kmer_keys)), kmer_keys)
plt.show()

## Step 5: Decode the emission distances

In [None]:
# For every event (column), the row index of the smallest distance.
mins = emission_distances.argmin(axis=0)

In [None]:
# Map each per-event row index back to its k-mer string.
kmers = np.array(kmer_keys)[mins]

In [None]:
# Keep only the first character of each decoded k-mer and concatenate them.
final_sequence = "".join(kmer[0] for kmer in kmers)

In [None]:
# Decoded sequence.
print(final_sequence)

In [None]:
# First entry of the loaded sequences, used below as the alignment target.
print(sequences[0])

In [None]:
# Align the decoded sequence (query) against sequences[0] (target).
# NOTE(review): task='path' presumably requests the full alignment path that
# getNiceAlignment needs — confirm against the edlib documentation.
align_result = edlib.align(final_sequence, sequences[0], task='path')

In [None]:
# Turn the alignment result into a printable form for the same query/target pair.
nice_alignment = edlib.getNiceAlignment(align_result, final_sequence, sequences[0])

In [None]:
# Print every value of the nice-alignment mapping on its own line.
print("\n".join(nice_alignment.values()))

In [None]:
# Accuracy estimate: one minus the edit distance normalised by the length of
# the sequence the decoded result was aligned against.
print(1 - align_result['editDistance'] / len(sequences[0]))