#### **Imports and variables**

In [1]:
import os
import time
from representative_subset import greedy_representative_subset, greedy_representative_subset_v2, global_alignment_score
from data_io import parse_fasta, write_fasta

In [2]:
# Root directory holding every FASTA file used in this notebook.
datasets_folder = './data'

# Subtype -> aligned FASTA file name.
# The 'H9N2' entry ('H9N2_Aligned.fasta') is currently disabled for the aligned set.
aligned_datasets = {
    subtype: f'{subtype}_Aligned.fasta'
    for subtype in ('H1N1', 'H3N2', 'H5N1')
}

# Subtype -> unaligned FASTA file name.
unaligned_datasets = {
    subtype: f'{subtype}_Seg4.fasta'
    for subtype in ('H1N1', 'H3N2', 'H5N1', 'H9N2')
}

#### **Compare Algorithm 1 vs Algorithm 2 with aligned sequences**

In [3]:
# For every aligned dataset, run both greedy variants on the same parsed
# sequences and report the subset size and the wall-clock time of each run.
for dataset, fasta_name in aligned_datasets.items():
    print(f"Dataset {dataset}:")
    input_path = os.path.join(datasets_folder, fasta_name)
    sequences = parse_fasta(input_path)

    # --- Algorithm 1 ---
    print("\tExtracting representative subset using Algorithm 1...")
    started = time.time()
    subset = greedy_representative_subset(sequences)
    elapsed = time.time() - started
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} in {elapsed} seconds.")

    # --- Algorithm 2 ---
    print("\tExtracting representative subset using Algorithm 2...")
    started = time.time()
    subset_v2 = greedy_representative_subset_v2(sequences)
    elapsed = time.time() - started
    print(f"\tExtracted {len(subset_v2)} representative sequences from {len(sequences)} input sequences in {elapsed} seconds.")

Dataset H1N1:
	Extracting representative subset using Algorithm 1...


100%|█████████████████████████████████████| 3060/3060 [00:00<00:00, 4432.76it/s]


	Extracted 10 representative sequences from 3060 in 0.7142457962036133 seconds.
	Extracting representative subset using Algorithm 2...


100%|███████████████████████████████████████| 3060/3060 [10:11<00:00,  5.00it/s]


	Extracted 10 representative sequences from 3060 input sequences in 620.561919927597 seconds.
Dataset H3N2:
	Extracting representative subset using Algorithm 1...


100%|█████████████████████████████████████| 3722/3722 [00:01<00:00, 2975.31it/s]


	Extracted 7 representative sequences from 3722 in 1.257411003112793 seconds.
	Extracting representative subset using Algorithm 2...


100%|███████████████████████████████████████| 3722/3722 [05:05<00:00, 12.20it/s]


	Extracted 6 representative sequences from 3722 input sequences in 312.65380024909973 seconds.
Dataset H5N1:
	Extracting representative subset using Algorithm 1...


100%|█████████████████████████████████████| 2054/2054 [00:01<00:00, 1510.84it/s]


	Extracted 59 representative sequences from 2054 in 1.364534854888916 seconds.
	Extracting representative subset using Algorithm 2...


100%|███████████████████████████████████████| 2054/2054 [07:53<00:00,  4.34it/s]


	Extracted 51 representative sequences from 2054 input sequences in 494.3880021572113 seconds.


#### **Compare Algorithm 1 with/without aligned inputs**

In [None]:
# Run Algorithm 1 twice per dataset: once on the aligned FASTA with
# aligned=True, once on the unaligned FASTA with aligned=False, and report
# the subset size and wall-clock time of each run.
#
# Only subtypes present in `aligned_datasets` are compared; each of them also
# has an entry in `unaligned_datasets`.
for dataset in aligned_datasets:
    print(f"Dataset {dataset}:")

    # Load the aligned file for *this* dataset explicitly. Previously this run
    # reused whatever `sequences` was left over from an earlier cell or from
    # the previous iteration (which held unaligned data).
    aligned_path = os.path.join(datasets_folder, aligned_datasets[dataset])
    aligned_sequences = parse_fasta(aligned_path)
    print("\tExtracting representative subset using Algorithm 1 with aligned sequences...")
    init = time.time()
    subset_aligned = greedy_representative_subset(aligned_sequences, aligned=True)
    end = time.time()
    print(f"\tExtracted {len(subset_aligned)} representative sequences from {len(aligned_sequences)} in {end-init} seconds.")

    unaligned_path = os.path.join(datasets_folder, unaligned_datasets[dataset])
    unaligned_sequences = parse_fasta(unaligned_path)
    print("\tExtracting representative subset using Algorithm 1 without aligned sequences...")
    init = time.time()
    # Keyword was misspelled as `algined`; it must match the `aligned`
    # keyword used for the aligned run above.
    subset_unaligned = greedy_representative_subset(unaligned_sequences, aligned=False)
    end = time.time()
    # Report the size of the unaligned result (previously printed the aligned one).
    print(f"\tExtracted {len(subset_unaligned)} representative sequences from {len(unaligned_sequences)} input sequences in {end-init} seconds.")

#### **Benchmark results**

In [3]:
# Time Algorithm 1 on its own for every aligned dataset.
for dataset, fasta_name in aligned_datasets.items():
    print(f"Dataset {dataset}:")
    input_path = os.path.join(datasets_folder, fasta_name)
    # Path where the extracted subset would be stored; computed but not written.
    subset_name = fasta_name.replace(".fasta", "_Subset.fasta")
    output_path = os.path.join(datasets_folder, subset_name)
    sequences = parse_fasta(input_path)

    print("\tExtracting representative subset using Algorithm 1...")
    started = time.time()
    subset = greedy_representative_subset(sequences)
    elapsed = time.time() - started
    print(f"\tExtracted {len(subset)} representative sequences from {len(sequences)} input sequences in {elapsed} seconds.")
    # NOTE(review): saving the subset is disabled. The call that used to be
    # commented out here referenced undefined names (output_datasets,
    # input_datasets) and a hard-coded 'H1N1' key. It presumably should become
    # write_fasta(subset, output_path, input_path) -- confirm the signature of
    # write_fasta in data_io before enabling.

Dataset H1N1:
	Extracting representative subset using Algorithm 1...
	Extracted 10 representative sequences from 3060 in 0.6984028816223145 seconds.
Dataset H3N2:
	Extracting representative subset using Algorithm 1...
	Extracted 7 representative sequences from 3722 in 1.1758978366851807 seconds.
Dataset H5N1:
	Extracting representative subset using Algorithm 1...
	Extracted 59 representative sequences from 2054 in 1.253005027770996 seconds.
