#### **Imports and variables**

In [7]:
import os
import time
from representative_subset import greedy_representative_subset, greedy_representative_subset_v2, global_alignment_score, score
from data_io import parse_fasta, write_fasta
from alignment_viewer import view_alignment
import random
from Bio import AlignIO, SeqIO
from tqdm import tqdm
import panel as pn
import panel.widgets as pnw
pn.extension()

In [2]:
datasets_folder = './data'
aligned_datasets = {
    'H1N1': 'H1N1_Aligned.fasta',
    'H3N2': 'H3N2_Aligned.fasta',
    'H5N1': 'H5N1_Aligned.fasta',
    #'H9N2': 'H9N2_Aligned.fasta',
}
unaligned_datasets = {
    'H1N1': 'H1N1_Seg4.fasta',
    'H3N2': 'H3N2_Seg4.fasta',
    'H5N1': 'H5N1_Seg4.fasta',
    #'H9N2': 'H9N2_Seg4.fasta',
}

#### **Compare Algorithm 1 vs Algorithm 2 with aligned sequences**

In [3]:
for dataset in aligned_datasets:
    print(f"Dataset {dataset}:")
    input_path = os.path.join(datasets_folder, aligned_datasets[dataset])    
    sequences = parse_fasta(input_path)
    print("\tExtracting representative subset using Algorithm 1...")
    init = time.time()
    subset = greedy_representative_subset(sequences)
    end = time.time()
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} in {end-init} seconds.")
    print("\tExtracting representative subset using Algorithm 2...")
    init = time.time()
    subset_v2 = greedy_representative_subset_v2(sequences)
    end = time.time()
    print(f"\tExtracted {len(subset_v2)} representative sequences from {len(sequences)} input sequences in {end-init} seconds.")

Dataset H1N1:
	Extracting representative subset using Algorithm 1...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3060/3060 [00:00<00:00, 5457.38it/s]


	Extracted 10 representative sequences from 3060 in 0.5795071125030518 seconds.
	Extracting representative subset using Algorithm 2...


  1%|█▋                                                                                                                              | 41/3060 [00:09<12:00,  4.19it/s]


KeyboardInterrupt: 

#### **Compare Algorithm 1 with/without aligned inputs**

In [5]:
for dataset in aligned_datasets:
    print(f"Dataset {dataset}:")
    print("\tExtracting representative subset using Algorithm 1 with aligned sequences...")
    init = time.time()
    subset = greedy_representative_subset(sequences, aligned=True)
    end = time.time()
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} in {end-init} seconds.")
    input_path = os.path.join(datasets_folder, unaligned_datasets[dataset])    
    sequences = parse_fasta(input_path)
    print("\tExtracting representative subset using Algorithm 1 without aligned sequences...")
    init = time.time()
    subset_v2 = greedy_representative_subset(sequences, algined=False)
    end = time.time()
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} input sequences in {end-init} seconds.")

Dataset H1N1:
	Extracting representative subset using Algorithm 1 with aligned sequences...


100%|████████████████████████████████████| 3722/3722 [00:00<00:00, 61564.43it/s]

	Extracted 7 representative sequences from 3722 in 0.06415104866027832 seconds.
	Extracting representative subset using Algorithm 1 without aligned sequences...





TypeError: greedy_representative_subset() got an unexpected keyword argument 'algined'

#### **Benchmark results**

In [3]:
subset_list = []
test_list = []
for dataset in aligned_datasets:
    print(f"Dataset {dataset}:")
    input_path = os.path.join(datasets_folder, aligned_datasets[dataset])
    output_path = os.path.join(datasets_folder, aligned_datasets[dataset].replace(".fasta", "_Subset.fasta"))
    sequences = parse_fasta(input_path)
    print("\tSplitting training and testing sets...")
    testing_indices = random.choices(list(range(len(sequences))), k=len(sequences)//10)
    training_set = [sequences[i] for i in range(len(sequences)) if i not in testing_indices]
    testing_set = [sequences[i] for i in testing_indices]
    test_list.append(testing_set)
    write_fasta(testing_set, os.path.join(datasets_folder, dataset+'_testing.fasta'))
    print(f"\tSplit testing sequences to {dataset}_testing.fasta")
    print("\tExtracting representative subset using Algorithm 1...")
    init = time.time()
    subset = greedy_representative_subset(training_set)
    subset_list.append(subset)
    end = time.time()
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} input sequences in {end-init} seconds.")
    print(f"Writing results into {output_path}")
    write_fasta(subset, output_path)
    print("Done! \n")

Dataset H1N1:
	Splitting training and testing sets...
	Split testing sequences to H1N1_testing.fasta
	Extracting representative subset using Algorithm 1...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2760/2760 [00:00<00:00, 5439.58it/s]


	Extracted 10 representative sequences from 3060 input sequences in 0.5312199592590332 seconds.
Writing results into ./data/H1N1_Aligned_Subset.fasta
Done! 

Dataset H3N2:
	Splitting training and testing sets...
	Split testing sequences to H3N2_testing.fasta
	Extracting representative subset using Algorithm 1...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3373/3373 [00:00<00:00, 4841.05it/s]


	Extracted 7 representative sequences from 3722 input sequences in 0.6996650695800781 seconds.
Writing results into ./data/H3N2_Aligned_Subset.fasta
Done! 

Dataset H5N1:
	Splitting training and testing sets...
	Split testing sequences to H5N1_testing.fasta
	Extracting representative subset using Algorithm 1...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1857/1857 [00:01<00:00, 1714.76it/s]

	Extracted 56 representative sequences from 2054 input sequences in 1.086327075958252 seconds.
Writing results into ./data/H5N1_Aligned_Subset.fasta
Done! 






In [71]:
accuracy_score = 0
size = sum([len(i) for i in test_list])
for i in range(len(test_list)):
    test = test_list[i]
    for j in tqdm(range(len(test))):
        min_list = []
        for subset in subset_list:
            temp = []
            for k in subset:
                temp.append(abs(score(test[j],k)))
            min_list.append(min(temp))
        if(min_list[i]==min(min_list)):
            accuracy_score += 1
print(f"Subset Accuracy: {accuracy_score/size}")

100%|████████████████████████████████████████| 306/306 [00:02<00:00, 102.82it/s]
100%|████████████████████████████████████████| 372/372 [00:02<00:00, 155.86it/s]
100%|█████████████████████████████████████████| 205/205 [00:02<00:00, 79.24it/s]

Subset Accuracy: 1.0





#### **View the Multiple Sequence Alignment of the Subsets**

#### **H1N1**

In [8]:
aln = AlignIO.read('data/H1N1_Aligned_Subset.fasta','fasta')
p = view_alignment(aln, plot_width=1000)
pn.pane.Bokeh(p)

#### **H3N2**

In [24]:
aln = AlignIO.read('data/H3N2_Aligned_Subset.fasta','fasta')
p = view_alignment(aln, plot_width=1000)
pn.pane.Bokeh(p)

#### **H5N1**

In [25]:
aln = AlignIO.read('data/H5N1_Aligned_Subset.fasta','fasta')
p = view_alignment(aln, plot_width=1000)
pn.pane.Bokeh(p)