#### **Imports and variables**

In [57]:
import os
import time
from representative_subset import greedy_representative_subset, greedy_representative_subset_v2, global_alignment_score, score
from data_io import parse_fasta, write_fasta
import random
from tqdm import tqdm

In [64]:
datasets_folder = './data'
aligned_datasets = {
    'H1N1': 'H1N1_Aligned.fasta',
    'H3N2': 'H3N2_Aligned.fasta',
    'H5N1': 'H5N1_Aligned.fasta',
    #'H9N2': 'H9N2_Aligned.fasta',
}
unaligned_datasets = {
    'H1N1': 'H1N1_Seg4.fasta',
    'H3N2': 'H3N2_Seg4.fasta',
    'H5N1': 'H5N1_Seg4.fasta',
    #'H9N2': 'H9N2_Seg4.fasta',
}

#### **Compare Algorithm 1 vs Algorithm 2 with aligned sequences**

In [4]:
for dataset in aligned_datasets:
    print(f"Dataset {dataset}:")
    input_path = os.path.join(datasets_folder, aligned_datasets[dataset])    
    sequences = parse_fasta(input_path)
    print("\tExtracting representative subset using Algorithm 1...")
    init = time.time()
    subset = greedy_representative_subset(sequences)
    end = time.time()
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} in {end-init} seconds.")
    print("\tExtracting representative subset using Algorithm 2...")
    init = time.time()
    subset_v2 = greedy_representative_subset_v2(sequences)
    end = time.time()
    print(f"\tExtracted {len(subset_v2)} representative sequences from {len(sequences)} input sequences in {end-init} seconds.")

Dataset H1N1:
	Extracting representative subset using Algorithm 1...


100%|█████████████████████████████████████| 3060/3060 [00:00<00:00, 5224.39it/s]


	Extracted 10 representative sequences from 3060 in 0.6243116855621338 seconds.
	Extracting representative subset using Algorithm 2...


100%|███████████████████████████████████████| 3060/3060 [08:26<00:00,  6.05it/s]


	Extracted 10 representative sequences from 3060 input sequences in 512.9585061073303 seconds.
Dataset H3N2:
	Extracting representative subset using Algorithm 1...


100%|█████████████████████████████████████| 3722/3722 [00:01<00:00, 3663.74it/s]


	Extracted 7 representative sequences from 3722 in 1.0170588493347168 seconds.
	Extracting representative subset using Algorithm 2...


  3%|█▏                                      | 112/3722 [00:34<18:30,  3.25it/s]


KeyboardInterrupt: 

#### **Compare Algorithm 1 with/without aligned inputs**

In [5]:
for dataset in aligned_datasets:
    print(f"Dataset {dataset}:")
    print("\tExtracting representative subset using Algorithm 1 with aligned sequences...")
    init = time.time()
    subset = greedy_representative_subset(sequences, aligned=True)
    end = time.time()
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} in {end-init} seconds.")
    input_path = os.path.join(datasets_folder, unaligned_datasets[dataset])    
    sequences = parse_fasta(input_path)
    print("\tExtracting representative subset using Algorithm 1 without aligned sequences...")
    init = time.time()
    subset_v2 = greedy_representative_subset(sequences, algined=False)
    end = time.time()
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} input sequences in {end-init} seconds.")

Dataset H1N1:
	Extracting representative subset using Algorithm 1 with aligned sequences...


100%|████████████████████████████████████| 3722/3722 [00:00<00:00, 61564.43it/s]

	Extracted 7 representative sequences from 3722 in 0.06415104866027832 seconds.
	Extracting representative subset using Algorithm 1 without aligned sequences...





TypeError: greedy_representative_subset() got an unexpected keyword argument 'algined'

#### **Benchmark results**

In [70]:
subset_list = []
test_list = []
for dataset in aligned_datasets:
    print(f"Dataset {dataset}:")
    input_path = os.path.join(datasets_folder, aligned_datasets[dataset])
    output_path = os.path.join(datasets_folder, aligned_datasets[dataset].replace(".fasta", "_Subset.fasta"))
    sequences = parse_fasta(input_path)
    print("\tSplitting training and testing sets...")
    testing_indices = random.choices(list(range(len(sequences))), k=len(sequences)//10)
    training_set = [sequences[i] for i in range(len(sequences)) if i not in testing_indices]
    testing_set = [sequences[i] for i in testing_indices]
    test_list.append(testing_set)
    #write_fasta(testing_set, os.path.join(datasets_folder, dataset+'_testing.fasta'))
    print(f"\tSplit testing sequences to {dataset}_testing.fasta")
    print("\tExtracting representative subset using Algorithm 1...")
    init = time.time()
    subset = greedy_representative_subset(training_set)
    subset_list.append(subset)
    end = time.time()
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} input sequences in {end-init} seconds.")
    print(f"Writing results into {output_path}")
    #write_fasta(subset, output_path)
    print("Done! \n")

Dataset H1N1:
	Splitting training and testing sets...
	Split testing sequences to H1N1_testing.fasta
	Extracting representative subset using Algorithm 1...


100%|███████████████████████████████████| 2765/2765 [00:00<00:00, 266713.83it/s]


	Extracted 10 representative sequences from 3060 input sequences in 0.011772871017456055 seconds.
Writing results into ./data/H1N1_Aligned_Subset.fasta
Done! 

Dataset H3N2:
	Splitting training and testing sets...
	Split testing sequences to H3N2_testing.fasta
	Extracting representative subset using Algorithm 1...


100%|███████████████████████████████████| 3363/3363 [00:00<00:00, 171346.85it/s]

	Extracted 7 representative sequences from 3722 input sequences in 0.020990848541259766 seconds.
Writing results into ./data/H3N2_Aligned_Subset.fasta
Done! 

Dataset H5N1:
	Splitting training and testing sets...
	Split testing sequences to H5N1_testing.fasta





	Extracting representative subset using Algorithm 1...


100%|███████████████████████████████████| 1858/1858 [00:00<00:00, 133458.07it/s]

	Extracted 55 representative sequences from 2054 input sequences in 0.017106056213378906 seconds.
Writing results into ./data/H5N1_Aligned_Subset.fasta
Done! 






In [71]:
accuracy_score = 0
size = sum([len(i) for i in test_list])
for i in range(len(test_list)):
    test = test_list[i]
    for j in tqdm(range(len(test))):
        min_list = []
        for subset in subset_list:
            temp = []
            for k in subset:
                temp.append(abs(score(test[j],k)))
            min_list.append(min(temp))
        if(min_list[i]==min(min_list)):
            accuracy_score += 1
print(f"Subset Accuracy: {accuracy_score/size}")

100%|████████████████████████████████████████| 306/306 [00:02<00:00, 102.82it/s]
100%|████████████████████████████████████████| 372/372 [00:02<00:00, 155.86it/s]
100%|█████████████████████████████████████████| 205/205 [00:02<00:00, 79.24it/s]

Subset Accuracy: 1.0



