# Protein embeddings improve phage-host interaction prediction

**Mark Edward M. Gonzales<sup>1, 2</sup>, Jennifer C. Ureta<sup>1, 2</sup> & Anish M.S. Shrestha<sup>1, 2</sup>**

<sup>1</sup> Bioinformatics Laboratory, Advanced Research Institute for Informatics, Computing and Networking, De La Salle University, Manila, Philippines <br>
<sup>2</sup> Department of Software Technology, College of Computer Studies, De La Salle University, Manila, Philippines 

{mark_gonzales, jennifer.ureta, anish.shrestha}@dlsu.edu.ph

<hr>

<hr>

In [15]:
import math
import pickle
import os
import warnings

import pandas as pd
import numpy as np
import sklearn

from imblearn.metrics import specificity_score
from collections import OrderedDict
from tqdm import tqdm

from ConstantsUtil import ConstantsUtil
from ClassificationUtil import ClassificationUtil
import boeckaerts as RBP_f

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Notebook-wide display configuration: show every row and up to 50 columns
# when a DataFrame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 50)

# Disable pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

# Silence scikit-learn's UndefinedMetricWarning for the rest of the session.
# The filter is installed at module level on purpose: registering it inside
# `with warnings.catch_warnings():` would discard it as soon as the `with`
# block exits, because the context manager restores the previous filter state.
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

In [3]:
# Shared helper objects used by the cells below:
#   constants - supplies the directory paths read and written by this notebook
#               (e.g. TEMP_RESULTS, TEMP_RESULTS_BLAST)
#   util      - supplies the BLAST parsing, probability mapping and
#               evaluation routines
constants, util = ConstantsUtil(), ClassificationUtil()

<hr>

In [4]:
# Positional indices into each per-query BLAST result entry, as used when
# indexing blast_results[key][...] further down.
# NOTE(review): EVALUES is not referenced in the visible cells; presumably
# it indexes the e-values of the hits - confirm against
# ClassificationUtil.iterate_blastp_results.
HOSTS, SCORES, EVALUES = 0, 1, 2

In [8]:
# Sweep the BLAST similarity threshold from 0% to 100% in steps of 10. For each
# setting: parse the BLAST outputs, turn the hits into per-host probabilities,
# derive a prediction per test entry, evaluate it at confidence thresholds from
# 0% to 100%, and pickle the (abridged) evaluation results.

# Make sure the output directory exists *before* anything is written into it;
# both the intermediate BLAST pickles and the final results are stored there.
os.makedirs(constants.TEMP_RESULTS, exist_ok=True)

for similarity_threshold in range(0, 101, 10):
    # --- Parse every BLAST output file at this similarity threshold ---------
    blast_results = {}

    for blast_file in tqdm(os.listdir(constants.TEMP_RESULTS_BLAST)):
        # The dictionary key is the file name with its trailing '.xml' removed.
        # NOTE(review): this strips the last 4 characters unconditionally, so
        # the directory is assumed to contain only '.xml' files - confirm.
        blast_results[blast_file[:-len('.xml')]] = util.iterate_blastp_results(
            f'{constants.TEMP_RESULTS_BLAST}/{blast_file}',
            e_cutoff = 1e-4,
            similarity_threshold = similarity_threshold / 100)

    # Persist the parsed results so they can be inspected or reused later.
    with open(f'{constants.TEMP_RESULTS}/blast_results_{similarity_threshold}.pickle', 'wb') as f:
        pickle.dump(blast_results, f, pickle.HIGHEST_PROTOCOL)

    print('Done reading through BLAST results...')

    # --- Convert hit scores into a {host: probability} mapping per entry ----
    # blast_results is still in memory, so it is used directly rather than
    # being read back from the pickle that was just written.
    host_proba = {}
    for test_set_idx, entry in tqdm(blast_results.items()):
        host_proba[test_set_idx] = util.get_mapping_probability(entry[SCORES], entry[HOSTS])

    print('Done computing the mapping probabilities...')

    # --- Build one probability vector (ordered like `classes`) per entry ----
    # NOTE(review): this call is repeated on every iteration; if its result
    # does not depend on similarity_threshold it could be hoisted - confirm.
    _, _, _, y_test = util.get_train_test_sets()
    classes = np.unique(y_test.values)

    clf_probabilities = {}

    for test_set_idx, hosts in host_proba.items():
        y_prob = np.zeros(len(classes))
        for host, proba in hosts.items():
            # Raises IndexError if `host` is not one of the test-set classes.
            y_prob[np.where(classes == host)[0][0]] = float(proba)

        # Keys are converted to int so the sort below is numeric.
        clf_probabilities[int(test_set_idx)] = y_prob

    clf_probabilities = OrderedDict(sorted(clf_probabilities.items()))

    # --- Pick the most probable host, or 'others' when nothing scored -------
    y_pred = []
    for class_proba in clf_probabilities.values():
        if np.max(class_proba) < 1e-9:
            # All probabilities are (numerically) zero for this entry.
            y_pred.append('others')
        else:
            y_pred.append(classes[np.argmax(class_proba)])

    # --- Evaluate at confidence thresholds 0%, 10%, ..., 100% ---------------
    # NOTE(review): assumes the index-sorted entries of clf_probabilities line
    # up one-to-one with y_test.sort_index() - TODO confirm.
    threshold_results = []
    for threshold in range(0, 101, 10):
        threshold_results.append(util.predict_with_threshold(clf_probabilities.values(), y_test.sort_index(), y_pred,
                                                             unknown_threshold = threshold / 100,
                                                             display=True))

    # Keep every element of each result except the one at position 4.
    # NOTE(review): the meaning of each position is defined by
    # ClassificationUtil.predict_with_threshold - confirm before relying on it.
    results_abridged = [[result[i] for i in [0, 1, 2, 3, 5, 6, 7]]
                        for result in threshold_results]

    with open(f'{constants.TEMP_RESULTS}/blast-sim-{similarity_threshold}.pickle', 'wb') as f:
        pickle.dump(results_abridged, f)

    print('Finished writing prediction results...')
    print('\n========================\n')

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [08:09<00:00, 16.56it/s]


Done reading through BLAST results...


100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [07:35<00:00, 17.81it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     0.6667    0.7059    0.6857        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7854    0.8971    0.8375       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5556    0.6410    0.5952        39
    campylobacter     0.9000    0.7984    0.8462       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3333    0.3824    0.3562        34
      clostridium     0.3800    0.7917    0.5135        24
      cronobacter     0.4615    0.5000    0.4800        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     0.6250    0.5882    0.6061        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7888    0.8971    0.8394       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5556    0.6410    0.5952        39
    campylobacter     0.9245    0.7903    0.8522       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3143    0.3235    0.3188        34
      clostridium     0.3800    0.7917    0.5135        24
      cronobacter     0.4865    0.5000    0.4932        36
          dickeya     0.7407    0.7692    0.7547        52
     edwardsiella     0.4

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     0.6250    0.5882    0.6061        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7888    0.8971    0.8394       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5556    0.6410    0.5952        39
    campylobacter     0.9223    0.7661    0.8370       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3235    0.3235    0.3235        34
      clostridium     0.3800    0.7917    0.5135        24
      cronobacter     0.4865    0.5000    0.4932        36
          dickeya     0.7407    0.7692    0.7547        52
     edwardsiella     0.4

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     0.6250    0.5882    0.6061        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7888    0.8971    0.8394       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5682    0.6410    0.6024        39
    campylobacter     0.9223    0.7661    0.8370       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3235    0.3235    0.3235        34
      clostridium     0.3800    0.7917    0.5135        24
      cronobacter     0.4865    0.5000    0.4932        36
          dickeya     0.7407    0.7692    0.7547        52
     edwardsiella     0.4

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [11:19<00:00, 11.94it/s]


Done reading through BLAST results...


100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [06:04<00:00, 22.25it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     0.6667    0.7059    0.6857        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7854    0.8971    0.8375       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5556    0.6410    0.5952        39
    campylobacter     0.9000    0.7984    0.8462       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3333    0.3824    0.3562        34
      clostridium     0.3800    0.7917    0.5135        24
      cronobacter     0.4615    0.5000    0.4800        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     0.6250    0.5882    0.6061        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7888    0.8971    0.8394       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5556    0.6410    0.5952        39
    campylobacter     0.9245    0.7903    0.8522       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3143    0.3235    0.3188        34
      clostridium     0.3800    0.7917    0.5135        24
      cronobacter     0.4865    0.5000    0.4932        36
          dickeya     0.7407    0.7692    0.7547        52
     edwardsiella     0.4

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     0.6250    0.5882    0.6061        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7888    0.8971    0.8394       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5556    0.6410    0.5952        39
    campylobacter     0.9223    0.7661    0.8370       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3235    0.3235    0.3235        34
      clostridium     0.3800    0.7917    0.5135        24
      cronobacter     0.4865    0.5000    0.4932        36
          dickeya     0.7407    0.7692    0.7547        52
     edwardsiella     0.4

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     0.6250    0.5882    0.6061        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7888    0.8971    0.8394       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5682    0.6410    0.6024        39
    campylobacter     0.9223    0.7661    0.8370       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3235    0.3235    0.3235        34
      clostridium     0.3800    0.7917    0.5135        24
      cronobacter     0.4865    0.5000    0.4932        36
          dickeya     0.7407    0.7692    0.7547        52
     edwardsiella     0.4

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [10:43<00:00, 12.61it/s]


Done reading through BLAST results...


100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [06:04<00:00, 22.28it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     0.6667    0.7059    0.6857        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7854    0.8971    0.8375       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5556    0.6410    0.5952        39
    campylobacter     0.9000    0.7984    0.8462       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3333    0.3824    0.3562        34
      clostridium     0.3878    0.7917    0.5205        24
      cronobacter     0.4615    0.5000    0.4800        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     0.6250    0.5882    0.6061        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7888    0.8971    0.8394       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5556    0.6410    0.5952        39
    campylobacter     0.9245    0.7903    0.8522       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3143    0.3235    0.3188        34
      clostridium     0.3878    0.7917    0.5205        24
      cronobacter     0.4865    0.5000    0.4932        36
          dickeya     0.7407    0.7692    0.7547        52
     edwardsiella     0.4

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     0.6250    0.5882    0.6061        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7888    0.8971    0.8394       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5556    0.6410    0.5952        39
    campylobacter     0.9223    0.7661    0.8370       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3235    0.3235    0.3235        34
      clostridium     0.3878    0.7917    0.5205        24
      cronobacter     0.4865    0.5000    0.4932        36
          dickeya     0.7407    0.7692    0.7547        52
     edwardsiella     0.4

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     0.6250    0.5882    0.6061        17
    acinetobacter     0.8804    0.8265    0.8526        98
        aeromonas     0.7642    0.8526    0.8060        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6857    0.9231    0.7869        26
         bacillus     0.7888    0.8971    0.8394       204
      bacteroides     0.8286    0.9355    0.8788        31
    brevundimonas     0.7500    0.6000    0.6667        10
     burkholderia     0.5682    0.6410    0.6024        39
    campylobacter     0.9223    0.7661    0.8370       124
      caulobacter     0.5250    0.8400    0.6462        25
      citrobacter     0.3235    0.3235    0.3235        34
      clostridium     0.3878    0.7917    0.5205        24
      cronobacter     0.4865    0.5000    0.4932        36
          dickeya     0.7407    0.7692    0.7547        52
     edwardsiella     0.4

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [10:29<00:00, 12.89it/s]


Done reading through BLAST results...


100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [03:36<00:00, 37.48it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     0.7500    0.7059    0.7273        17
    acinetobacter     0.8791    0.8163    0.8466        98
        aeromonas     0.7800    0.8211    0.8000        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6765    0.8846    0.7667        26
         bacillus     0.8219    0.8824    0.8511       204
      bacteroides     0.8529    0.9355    0.8923        31
    brevundimonas     0.8571    0.6000    0.7059        10
     burkholderia     0.5610    0.5897    0.5750        39
    campylobacter     0.9083    0.7984    0.8498       124
      caulobacter     0.5676    0.8400    0.6774        25
      citrobacter     0.3611    0.3824    0.3714        34
      clostridium     0.3913    0.7500    0.5143        24
      cronobacter     0.4737    0.5000    0.4865        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     0.7143    0.5882    0.6452        17
    acinetobacter     0.8791    0.8163    0.8466        98
        aeromonas     0.7800    0.8211    0.8000        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6765    0.8846    0.7667        26
         bacillus     0.8219    0.8824    0.8511       204
      bacteroides     0.8529    0.9355    0.8923        31
    brevundimonas     0.8571    0.6000    0.7059        10
     burkholderia     0.5610    0.5897    0.5750        39
    campylobacter     0.9333    0.7903    0.8559       124
      caulobacter     0.5676    0.8400    0.6774        25
      citrobacter     0.3438    0.3235    0.3333        34
      clostridium     0.3913    0.7500    0.5143        24
      cronobacter     0.5000    0.5000    0.5000        36
          dickeya     0.7547    0.7692    0.7619        52
     edwardsiella     0.5

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     0.7143    0.5882    0.6452        17
    acinetobacter     0.8791    0.8163    0.8466        98
        aeromonas     0.7800    0.8211    0.8000        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6765    0.8846    0.7667        26
         bacillus     0.8219    0.8824    0.8511       204
      bacteroides     0.8529    0.9355    0.8923        31
    brevundimonas     0.8571    0.6000    0.7059        10
     burkholderia     0.5610    0.5897    0.5750        39
    campylobacter     0.9314    0.7661    0.8407       124
      caulobacter     0.5676    0.8400    0.6774        25
      citrobacter     0.3548    0.3235    0.3385        34
      clostridium     0.3913    0.7500    0.5143        24
      cronobacter     0.5000    0.5000    0.5000        36
          dickeya     0.7547    0.7692    0.7619        52
     edwardsiella     0.5

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     0.7143    0.5882    0.6452        17
    acinetobacter     0.8791    0.8163    0.8466        98
        aeromonas     0.7800    0.8211    0.8000        95
    agrobacterium     0.6250    0.4167    0.5000        12
     arthrobacter     0.6765    0.8846    0.7667        26
         bacillus     0.8219    0.8824    0.8511       204
      bacteroides     0.8529    0.9355    0.8923        31
    brevundimonas     0.8571    0.6000    0.7059        10
     burkholderia     0.5750    0.5897    0.5823        39
    campylobacter     0.9314    0.7661    0.8407       124
      caulobacter     0.5676    0.8400    0.6774        25
      citrobacter     0.3548    0.3235    0.3385        34
      clostridium     0.3913    0.7500    0.5143        24
      cronobacter     0.5000    0.5000    0.5000        36
          dickeya     0.7547    0.7692    0.7619        52
     edwardsiella     0.5

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [10:23<00:00, 13.01it/s]


Done reading through BLAST results...


100%|█████████████████████████████████████████████████████████████████████████████| 8116/8116 [01:19<00:00, 102.14it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     0.7857    0.6471    0.7097        17
    acinetobacter     0.9506    0.7857    0.8603        98
        aeromonas     0.8085    0.8000    0.8042        95
    agrobacterium     0.7143    0.4167    0.5263        12
     arthrobacter     0.8077    0.8077    0.8077        26
         bacillus     0.9358    0.8578    0.8951       204
      bacteroides     0.9667    0.9355    0.9508        31
    brevundimonas     0.8571    0.6000    0.7059        10
     burkholderia     0.7692    0.5128    0.6154        39
    campylobacter     0.9083    0.7984    0.8498       124
      caulobacter     0.8077    0.8400    0.8235        25
      citrobacter     0.3714    0.3824    0.3768        34
      clostridium     0.4250    0.7083    0.5312        24
      cronobacter     0.4737    0.5000    0.4865        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     0.7500    0.5294    0.6207        17
    acinetobacter     0.9506    0.7857    0.8603        98
        aeromonas     0.8085    0.8000    0.8042        95
    agrobacterium     0.7143    0.4167    0.5263        12
     arthrobacter     0.8077    0.8077    0.8077        26
         bacillus     0.9358    0.8578    0.8951       204
      bacteroides     0.9667    0.9355    0.9508        31
    brevundimonas     0.8571    0.6000    0.7059        10
     burkholderia     0.7692    0.5128    0.6154        39
    campylobacter     0.9333    0.7903    0.8559       124
      caulobacter     0.8077    0.8400    0.8235        25
      citrobacter     0.3548    0.3235    0.3385        34
      clostridium     0.4250    0.7083    0.5312        24
      cronobacter     0.5000    0.5000    0.5000        36
          dickeya     0.7843    0.7692    0.7767        52
     edwardsiella     0.5

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     0.7500    0.5294    0.6207        17
    acinetobacter     0.9506    0.7857    0.8603        98
        aeromonas     0.8085    0.8000    0.8042        95
    agrobacterium     0.7143    0.4167    0.5263        12
     arthrobacter     0.8077    0.8077    0.8077        26
         bacillus     0.9358    0.8578    0.8951       204
      bacteroides     0.9667    0.9355    0.9508        31
    brevundimonas     0.8571    0.6000    0.7059        10
     burkholderia     0.7692    0.5128    0.6154        39
    campylobacter     0.9314    0.7661    0.8407       124
      caulobacter     0.8077    0.8400    0.8235        25
      citrobacter     0.3667    0.3235    0.3438        34
      clostridium     0.4250    0.7083    0.5312        24
      cronobacter     0.5000    0.5000    0.5000        36
          dickeya     0.7843    0.7692    0.7767        52
     edwardsiella     0.5

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     0.7500    0.5294    0.6207        17
    acinetobacter     0.9506    0.7857    0.8603        98
        aeromonas     0.8085    0.8000    0.8042        95
    agrobacterium     0.7143    0.4167    0.5263        12
     arthrobacter     0.8077    0.8077    0.8077        26
         bacillus     0.9358    0.8578    0.8951       204
      bacteroides     0.9667    0.9355    0.9508        31
    brevundimonas     0.8571    0.6000    0.7059        10
     burkholderia     0.7692    0.5128    0.6154        39
    campylobacter     0.9314    0.7661    0.8407       124
      caulobacter     0.8077    0.8400    0.8235        25
      citrobacter     0.3667    0.3235    0.3438        34
      clostridium     0.4250    0.7083    0.5312        24
      cronobacter     0.5000    0.5000    0.5000        36
          dickeya     0.7843    0.7692    0.7767        52
     edwardsiella     0.5

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [05:57<00:00, 22.70it/s]


Done reading through BLAST results...


100%|█████████████████████████████████████████████████████████████████████████████| 8116/8116 [00:58<00:00, 137.57it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.5294    0.6923        17
    acinetobacter     0.9487    0.7551    0.8409        98
        aeromonas     0.8222    0.7789    0.8000        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     0.9130    0.8077    0.8571        26
         bacillus     0.9651    0.8137    0.8830       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.7826    0.4615    0.5806        39
    campylobacter     0.9159    0.7903    0.8485       124
      caulobacter     0.9130    0.8400    0.8750        25
      citrobacter     0.3939    0.3824    0.3881        34
      clostridium     0.4167    0.6250    0.5000        24
      cronobacter     0.4865    0.5000    0.4932        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.5294    0.6923        17
    acinetobacter     0.9487    0.7551    0.8409        98
        aeromonas     0.8222    0.7789    0.8000        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     0.9130    0.8077    0.8571        26
         bacillus     0.9651    0.8137    0.8830       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.7826    0.4615    0.5806        39
    campylobacter     0.9327    0.7823    0.8509       124
      caulobacter     0.9130    0.8400    0.8750        25
      citrobacter     0.3793    0.3235    0.3492        34
      clostridium     0.4167    0.6250    0.5000        24
      cronobacter     0.5143    0.5000    0.5070        36
          dickeya     0.8163    0.7692    0.7921        52
     edwardsiella     0.5

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.5294    0.6923        17
    acinetobacter     0.9487    0.7551    0.8409        98
        aeromonas     0.8222    0.7789    0.8000        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     0.9130    0.8077    0.8571        26
         bacillus     0.9651    0.8137    0.8830       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.7826    0.4615    0.5806        39
    campylobacter     0.9307    0.7581    0.8356       124
      caulobacter     0.9130    0.8400    0.8750        25
      citrobacter     0.3929    0.3235    0.3548        34
      clostridium     0.4167    0.6250    0.5000        24
      cronobacter     0.5143    0.5000    0.5070        36
          dickeya     0.8163    0.7692    0.7921        52
     edwardsiella     0.5

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.5294    0.6923        17
    acinetobacter     0.9487    0.7551    0.8409        98
        aeromonas     0.8222    0.7789    0.8000        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     0.9130    0.8077    0.8571        26
         bacillus     0.9651    0.8137    0.8830       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.7826    0.4615    0.5806        39
    campylobacter     0.9307    0.7581    0.8356       124
      caulobacter     0.9130    0.8400    0.8750        25
      citrobacter     0.3929    0.3235    0.3548        34
      clostridium     0.4167    0.6250    0.5000        24
      cronobacter     0.5143    0.5000    0.5070        36
          dickeya     0.8163    0.7692    0.7921        52
     edwardsiella     0.5

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [05:54<00:00, 22.92it/s]


Done reading through BLAST results...


100%|█████████████████████████████████████████████████████████████████████████████| 8116/8116 [00:46<00:00, 174.09it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9583    0.7041    0.8118        98
        aeromonas     0.8148    0.6947    0.7500        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.7692    0.8696        26
         bacillus     0.9639    0.7843    0.8649       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.9474    0.4615    0.6207        39
    campylobacter     0.9143    0.7742    0.8384       124
      caulobacter     1.0000    0.8400    0.9130        25
      citrobacter     0.3750    0.3529    0.3636        34
      clostridium     0.4118    0.5833    0.4828        24
      cronobacter     0.4722    0.4722    0.4722        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9583    0.7041    0.8118        98
        aeromonas     0.8148    0.6947    0.7500        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.7692    0.8696        26
         bacillus     0.9639    0.7843    0.8649       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.9474    0.4615    0.6207        39
    campylobacter     0.9314    0.7661    0.8407       124
      caulobacter     1.0000    0.8400    0.9130        25
      citrobacter     0.3571    0.2941    0.3226        34
      clostridium     0.4118    0.5833    0.4828        24
      cronobacter     0.5000    0.4722    0.4857        36
          dickeya     0.8163    0.7692    0.7921        52
     edwardsiella     1.0

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9583    0.7041    0.8118        98
        aeromonas     0.8148    0.6947    0.7500        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.7692    0.8696        26
         bacillus     0.9639    0.7843    0.8649       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.9474    0.4615    0.6207        39
    campylobacter     0.9293    0.7419    0.8251       124
      caulobacter     1.0000    0.8400    0.9130        25
      citrobacter     0.3704    0.2941    0.3279        34
      clostridium     0.4118    0.5833    0.4828        24
      cronobacter     0.5000    0.4722    0.4857        36
          dickeya     0.8163    0.7692    0.7921        52
     edwardsiella     1.0

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9583    0.7041    0.8118        98
        aeromonas     0.8148    0.6947    0.7500        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.7692    0.8696        26
         bacillus     0.9639    0.7843    0.8649       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.9474    0.4615    0.6207        39
    campylobacter     0.9293    0.7419    0.8251       124
      caulobacter     1.0000    0.8400    0.9130        25
      citrobacter     0.3704    0.2941    0.3279        34
      clostridium     0.4118    0.5833    0.4828        24
      cronobacter     0.5000    0.4722    0.4857        36
          dickeya     0.8163    0.7692    0.7921        52
     edwardsiella     1.0

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [05:58<00:00, 22.61it/s]


Done reading through BLAST results...


100%|█████████████████████████████████████████████████████████████████████████████| 8116/8116 [00:36<00:00, 219.89it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9839    0.6224    0.7625        98
        aeromonas     0.8571    0.6316    0.7273        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.6538    0.7907        26
         bacillus     0.9618    0.7402    0.8366       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.9412    0.4103    0.5714        39
    campylobacter     0.9135    0.7661    0.8333       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.4000    0.3529    0.3750        34
      clostridium     0.4242    0.5833    0.4912        24
      cronobacter     0.4722    0.4722    0.4722        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9839    0.6224    0.7625        98
        aeromonas     0.8571    0.6316    0.7273        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.6538    0.7907        26
         bacillus     0.9618    0.7402    0.8366       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.9412    0.4103    0.5714        39
    campylobacter     0.9307    0.7581    0.8356       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.3846    0.2941    0.3333        34
      clostridium     0.4242    0.5833    0.4912        24
      cronobacter     0.5000    0.4722    0.4857        36
          dickeya     0.8163    0.7692    0.7921        52
     edwardsiella     1.0

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9839    0.6224    0.7625        98
        aeromonas     0.8571    0.6316    0.7273        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.6538    0.7907        26
         bacillus     0.9618    0.7402    0.8366       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.9412    0.4103    0.5714        39
    campylobacter     0.9286    0.7339    0.8198       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.4000    0.2941    0.3390        34
      clostridium     0.4242    0.5833    0.4912        24
      cronobacter     0.5000    0.4722    0.4857        36
          dickeya     0.8163    0.7692    0.7921        52
     edwardsiella     1.0

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9839    0.6224    0.7625        98
        aeromonas     0.8571    0.6316    0.7273        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.6538    0.7907        26
         bacillus     0.9618    0.7402    0.8366       204
      bacteroides     1.0000    0.9355    0.9667        31
    brevundimonas     1.0000    0.6000    0.7500        10
     burkholderia     0.9412    0.4103    0.5714        39
    campylobacter     0.9286    0.7339    0.8198       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.4000    0.2941    0.3390        34
      clostridium     0.4242    0.5833    0.4912        24
      cronobacter     0.5000    0.4722    0.4857        36
          dickeya     0.8163    0.7692    0.7921        52
     edwardsiella     1.0

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [05:56<00:00, 22.77it/s]


Done reading through BLAST results...


100%|█████████████████████████████████████████████████████████████████████████████| 8116/8116 [00:31<00:00, 254.35it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9833    0.6020    0.7468        98
        aeromonas     0.8889    0.5895    0.7089        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.6154    0.7619        26
         bacillus     0.9603    0.7108    0.8169       204
      bacteroides     1.0000    0.9032    0.9492        31
    brevundimonas     1.0000    0.5000    0.6667        10
     burkholderia     0.9375    0.3846    0.5455        39
    campylobacter     0.9314    0.7661    0.8407       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.3846    0.2941    0.3333        34
      clostridium     0.4643    0.5417    0.5000        24
      cronobacter     0.4848    0.4444    0.4638        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9833    0.6020    0.7468        98
        aeromonas     0.8889    0.5895    0.7089        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.6154    0.7619        26
         bacillus     0.9603    0.7108    0.8169       204
      bacteroides     1.0000    0.9032    0.9492        31
    brevundimonas     1.0000    0.5000    0.6667        10
     burkholderia     0.9375    0.3846    0.5455        39
    campylobacter     0.9495    0.7581    0.8430       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.3636    0.2353    0.2857        34
      clostridium     0.4643    0.5417    0.5000        24
      cronobacter     0.5161    0.4444    0.4776        36
          dickeya     0.8333    0.7692    0.8000        52
     edwardsiella     1.0

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9833    0.6020    0.7468        98
        aeromonas     0.8889    0.5895    0.7089        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.6154    0.7619        26
         bacillus     0.9603    0.7108    0.8169       204
      bacteroides     1.0000    0.9032    0.9492        31
    brevundimonas     1.0000    0.5000    0.6667        10
     burkholderia     0.9375    0.3846    0.5455        39
    campylobacter     0.9479    0.7339    0.8273       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.3810    0.2353    0.2909        34
      clostridium     0.4643    0.5417    0.5000        24
      cronobacter     0.5161    0.4444    0.4776        36
          dickeya     0.8333    0.7692    0.8000        52
     edwardsiella     1.0

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.3529    0.5217        17
    acinetobacter     0.9833    0.6020    0.7468        98
        aeromonas     0.8889    0.5895    0.7089        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.6154    0.7619        26
         bacillus     0.9603    0.7108    0.8169       204
      bacteroides     1.0000    0.9032    0.9492        31
    brevundimonas     1.0000    0.5000    0.6667        10
     burkholderia     0.9375    0.3846    0.5455        39
    campylobacter     0.9479    0.7339    0.8273       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.3810    0.2353    0.2909        34
      clostridium     0.4643    0.5417    0.5000        24
      cronobacter     0.5161    0.4444    0.4776        36
          dickeya     0.8333    0.7692    0.8000        52
     edwardsiella     1.0

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [05:58<00:00, 22.64it/s]


Done reading through BLAST results...


100%|█████████████████████████████████████████████████████████████████████████████| 8116/8116 [00:25<00:00, 317.12it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.2941    0.4545        17
    acinetobacter     1.0000    0.5000    0.6667        98
        aeromonas     0.8772    0.5263    0.6579        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.4615    0.6316        26
         bacillus     0.9583    0.6765    0.7931       204
      bacteroides     1.0000    0.8065    0.8929        31
    brevundimonas     1.0000    0.4000    0.5714        10
     burkholderia     0.9231    0.3077    0.4615        39
    campylobacter     0.9314    0.7661    0.8407       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.4000    0.2941    0.3390        34
      clostridium     0.5556    0.4167    0.4762        24
      cronobacter     0.6364    0.3889    0.4828        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.2941    0.4545        17
    acinetobacter     1.0000    0.5000    0.6667        98
        aeromonas     0.8772    0.5263    0.6579        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.4615    0.6316        26
         bacillus     0.9583    0.6765    0.7931       204
      bacteroides     1.0000    0.8065    0.8929        31
    brevundimonas     1.0000    0.4000    0.5714        10
     burkholderia     0.9231    0.3077    0.4615        39
    campylobacter     0.9495    0.7581    0.8430       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.3810    0.2353    0.2909        34
      clostridium     0.5556    0.4167    0.4762        24
      cronobacter     0.6364    0.3889    0.4828        36
          dickeya     0.8298    0.7500    0.7879        52
     edwardsiella     1.0

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.2941    0.4545        17
    acinetobacter     1.0000    0.5000    0.6667        98
        aeromonas     0.8772    0.5263    0.6579        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.4615    0.6316        26
         bacillus     0.9583    0.6765    0.7931       204
      bacteroides     1.0000    0.8065    0.8929        31
    brevundimonas     1.0000    0.4000    0.5714        10
     burkholderia     0.9231    0.3077    0.4615        39
    campylobacter     0.9479    0.7339    0.8273       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.4000    0.2353    0.2963        34
      clostridium     0.5556    0.4167    0.4762        24
      cronobacter     0.6364    0.3889    0.4828        36
          dickeya     0.8298    0.7500    0.7879        52
     edwardsiella     1.0

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     1.0000    0.2941    0.4545        17
    acinetobacter     1.0000    0.5000    0.6667        98
        aeromonas     0.8772    0.5263    0.6579        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.4615    0.6316        26
         bacillus     0.9583    0.6765    0.7931       204
      bacteroides     1.0000    0.8065    0.8929        31
    brevundimonas     1.0000    0.4000    0.5714        10
     burkholderia     0.9231    0.3077    0.4615        39
    campylobacter     0.9479    0.7339    0.8273       124
      caulobacter     1.0000    0.8000    0.8889        25
      citrobacter     0.4000    0.2353    0.2963        34
      clostridium     0.5556    0.4167    0.4762        24
      cronobacter     0.6364    0.3889    0.4828        36
          dickeya     0.8298    0.7500    0.7879        52
     edwardsiella     1.0

100%|██████████████████████████████████████████████████████████████████████████████| 8116/8116 [05:55<00:00, 22.81it/s]


Done reading through BLAST results...


100%|████████████████████████████████████████████████████████████████████████████| 8116/8116 [00:06<00:00, 1266.29it/s]


Done computing the mapping probabilities...
Constructing training and test sets...
Confidence threshold k: 0.0%
                   precision    recall  f1-score   support

    achromobacter     0.0000    0.0000    0.0000        17
    acinetobacter     1.0000    0.1224    0.2182        98
        aeromonas     0.8889    0.1684    0.2832        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.1923    0.3226        26
         bacillus     0.9375    0.2206    0.3571       204
      bacteroides     1.0000    0.4194    0.5909        31
    brevundimonas     1.0000    0.2000    0.3333        10
     burkholderia     1.0000    0.1026    0.1860        39
    campylobacter     0.9500    0.4597    0.6196       124
      caulobacter     1.0000    0.5600    0.7179        25
      citrobacter     0.8000    0.1176    0.2051        34
      clostridium     0.6667    0.1667    0.2667        24
      cronobacter     1.0000    0.0556    0.1053        36
  

Confidence threshold k: 30.0%
                   precision    recall  f1-score   support

    achromobacter     0.0000    0.0000    0.0000        17
    acinetobacter     1.0000    0.1224    0.2182        98
        aeromonas     0.8889    0.1684    0.2832        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.1923    0.3226        26
         bacillus     0.9375    0.2206    0.3571       204
      bacteroides     1.0000    0.4194    0.5909        31
    brevundimonas     1.0000    0.2000    0.3333        10
     burkholderia     1.0000    0.1026    0.1860        39
    campylobacter     1.0000    0.4516    0.6222       124
      caulobacter     1.0000    0.5600    0.7179        25
      citrobacter     0.7500    0.0882    0.1579        34
      clostridium     0.6667    0.1667    0.2667        24
      cronobacter     1.0000    0.0556    0.1053        36
          dickeya     1.0000    0.5192    0.6835        52
     edwardsiella     1.0

Confidence threshold k: 60.0%
                   precision    recall  f1-score   support

    achromobacter     0.0000    0.0000    0.0000        17
    acinetobacter     1.0000    0.1224    0.2182        98
        aeromonas     0.8889    0.1684    0.2832        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.1923    0.3226        26
         bacillus     0.9375    0.2206    0.3571       204
      bacteroides     1.0000    0.4194    0.5909        31
    brevundimonas     1.0000    0.2000    0.3333        10
     burkholderia     1.0000    0.1026    0.1860        39
    campylobacter     1.0000    0.4274    0.5989       124
      caulobacter     1.0000    0.5600    0.7179        25
      citrobacter     0.7500    0.0882    0.1579        34
      clostridium     0.6667    0.1667    0.2667        24
      cronobacter     1.0000    0.0556    0.1053        36
          dickeya     1.0000    0.5192    0.6835        52
     edwardsiella     1.0

Confidence threshold k: 90.0%
                   precision    recall  f1-score   support

    achromobacter     0.0000    0.0000    0.0000        17
    acinetobacter     1.0000    0.1224    0.2182        98
        aeromonas     0.8889    0.1684    0.2832        95
    agrobacterium     1.0000    0.3333    0.5000        12
     arthrobacter     1.0000    0.1923    0.3226        26
         bacillus     0.9375    0.2206    0.3571       204
      bacteroides     1.0000    0.4194    0.5909        31
    brevundimonas     1.0000    0.2000    0.3333        10
     burkholderia     1.0000    0.1026    0.1860        39
    campylobacter     1.0000    0.4274    0.5989       124
      caulobacter     1.0000    0.5600    0.7179        25
      citrobacter     0.7500    0.0882    0.1579        34
      clostridium     0.6667    0.1667    0.2667        24
      cronobacter     1.0000    0.0556    0.1053        36
          dickeya     1.0000    0.5192    0.6835        52
     edwardsiella     1.0

<hr>

In [9]:
# Similarity thresholds, in percent, for which a results pickle is loaded: 0, 10, ..., 100.
similarity_thresholds = list(range(0, 101, 10))

# model_results[n] is the object unpickled for similarity_thresholds[n].
# NOTE(review): pickle.load can execute arbitrary code during deserialization; only load
# pickles that were written by this project.
model_results = []
for sim in similarity_thresholds:
    pickle_path = f'{constants.TEMP_RESULTS}/blast-sim-{str(sim)}.pickle'
    with open(pickle_path, 'rb') as f:
        model_results.append(pickle.load(f))

In [10]:
# Positions of the metric groups inside one stored result entry; the cells below read
# model_results[i][threshold][WEIGHTED].
# NOTE(review): presumably per-class, micro-, macro- and weighted-averaged metrics -- confirm
# against the code that writes the pickles.
CLASS, MICRO, MACRO, WEIGHTED = 0, 1, 2, 3

# Positions of the individual metrics inside one metric group.
PRECISION, RECALL, F1 = 0, 1, 2

# Negative offsets (counted from the end of a stored result entry) of the two label sequences
# that are passed to specificity_score as y_test and y_pred.
Y_TEST, Y_PRED = -3, -1

In [11]:
# Column labels '0%', '10%', ..., '100%'; one column per second-level index of model_results
# (presumably the confidence threshold k -- confirm against the code that writes the pickles).
column_labels = [str(pct) + '%' for pct in range(0, 101, 10)]

# results[i][threshold] is the weighted precision, formatted as a percentage with two decimals,
# for the i-th similarity threshold and the threshold-th column.
results = [
    [
        "{:.2f}%".format(model_results[i][threshold][WEIGHTED][PRECISION] * 100)
        for threshold in range(11)
    ]
    for i in range(11)
]

print("Weighted Precision")
results_df = pd.DataFrame(results, index = similarity_thresholds, columns = column_labels)
# NOTE(review): the cells are formatted strings, so the per-column maximum is a string
# comparison; it agrees with numeric order only while all values have the same number of
# integer digits.
results_df.style.highlight_max(color = 'lightgreen', axis = 0)

Weighted Precision


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%
0,77.40%,76.45%,76.28%,76.22%,75.53%,75.48%,74.91%,74.66%,74.57%,74.28%,75.25%
10,77.40%,76.45%,76.28%,76.22%,75.53%,75.48%,74.91%,74.66%,74.57%,74.28%,75.25%
20,77.41%,76.46%,76.29%,76.23%,75.54%,75.49%,74.92%,74.67%,74.58%,74.30%,75.26%
30,77.32%,76.80%,76.70%,76.67%,76.17%,76.13%,75.70%,75.50%,75.43%,75.20%,76.17%
40,79.75%,79.60%,79.55%,79.55%,79.26%,79.23%,78.94%,78.79%,78.74%,78.59%,79.25%
50,81.79%,81.83%,81.81%,81.82%,81.62%,81.61%,81.41%,81.30%,81.26%,81.14%,81.38%
60,82.82%,82.93%,82.93%,82.94%,82.81%,82.81%,82.66%,82.56%,82.54%,82.45%,82.74%
70,83.12%,83.30%,83.30%,83.33%,83.23%,83.24%,83.13%,83.05%,83.03%,82.96%,83.29%
80,83.13%,83.37%,83.39%,83.41%,83.35%,83.37%,83.30%,83.22%,83.21%,83.16%,83.63%
90,82.93%,83.23%,83.26%,83.29%,83.27%,83.30%,83.28%,83.21%,83.21%,83.20%,83.85%


In [12]:
# Column labels '0%', '10%', ..., '100%'; one column per second-level index of model_results
# (presumably the confidence threshold k -- confirm against the code that writes the pickles).
column_labels = [str(pct) + '%' for pct in range(0, 101, 10)]

# results[i][threshold] is the weighted recall, formatted as a percentage with two decimals,
# for the i-th similarity threshold and the threshold-th column.
results = [
    [
        "{:.2f}%".format(model_results[i][threshold][WEIGHTED][RECALL] * 100)
        for threshold in range(11)
    ]
    for i in range(11)
]

print("Weighted Recall")
results_df = pd.DataFrame(results, index = similarity_thresholds, columns = column_labels)
# NOTE(review): the cells are formatted strings, so the per-column maximum is a string
# comparison; it agrees with numeric order only while all values have the same number of
# integer digits.
results_df.style.highlight_max(color = 'lightgreen', axis = 0)

Weighted Recall


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%
0,77.54%,77.13%,77.00%,76.96%,76.33%,76.26%,75.59%,75.27%,75.14%,74.64%,72.45%
10,77.54%,77.13%,77.00%,76.96%,76.33%,76.26%,75.59%,75.27%,75.14%,74.64%,72.45%
20,77.55%,77.14%,77.01%,76.97%,76.34%,76.27%,75.60%,75.28%,75.15%,74.66%,72.46%
30,77.75%,77.33%,77.19%,77.16%,76.53%,76.44%,75.79%,75.47%,75.33%,74.84%,72.54%
40,79.08%,78.65%,78.51%,78.47%,77.85%,77.75%,77.08%,76.76%,76.63%,76.13%,73.47%
50,79.50%,79.09%,78.96%,78.92%,78.28%,78.18%,77.51%,77.19%,77.06%,76.56%,73.40%
60,78.61%,78.20%,78.07%,78.03%,77.39%,77.29%,76.63%,76.31%,76.17%,75.68%,72.57%
70,77.38%,76.97%,76.84%,76.80%,76.16%,76.06%,75.39%,75.07%,74.94%,74.45%,71.28%
80,75.39%,74.99%,74.85%,74.82%,74.17%,74.08%,73.41%,73.09%,72.95%,72.46%,69.33%
90,71.86%,71.45%,71.32%,71.28%,70.64%,70.54%,69.87%,69.57%,69.43%,68.94%,65.89%


In [13]:
# Column labels '0%', '10%', ..., '100%'; one column per second-level index of model_results
# (presumably the confidence threshold k -- confirm against the code that writes the pickles).
column_labels = [str(pct) + '%' for pct in range(0, 101, 10)]

# results[i][threshold] is the weighted F1 score, formatted as a percentage with two decimals,
# for the i-th similarity threshold and the threshold-th column.
results = [
    [
        "{:.2f}%".format(model_results[i][threshold][WEIGHTED][F1] * 100)
        for threshold in range(11)
    ]
    for i in range(11)
]

print("Weighted F1")
results_df = pd.DataFrame(results, index = similarity_thresholds, columns = column_labels)
# NOTE(review): the cells are formatted strings, so the per-column maximum is a string
# comparison; it agrees with numeric order only while all values have the same number of
# integer digits.
results_df.style.highlight_max(color = 'lightgreen', axis = 0)

Weighted F1


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%
0,76.46%,76.17%,76.07%,76.04%,75.53%,75.49%,74.94%,74.68%,74.57%,74.20%,73.03%
10,76.46%,76.17%,76.07%,76.04%,75.53%,75.49%,74.94%,74.68%,74.57%,74.20%,73.03%
20,76.48%,76.19%,76.08%,76.06%,75.54%,75.50%,74.96%,74.69%,74.59%,74.21%,73.04%
30,77.08%,76.77%,76.67%,76.65%,76.14%,76.08%,75.56%,75.29%,75.19%,74.81%,73.43%
40,79.22%,78.89%,78.79%,78.77%,78.25%,78.18%,77.63%,77.36%,77.25%,76.87%,74.90%
50,80.01%,79.72%,79.63%,79.60%,79.09%,79.02%,78.48%,78.22%,78.11%,77.72%,75.26%
60,79.40%,79.14%,79.04%,79.02%,78.53%,78.46%,77.93%,77.67%,77.57%,77.18%,74.78%
70,78.42%,78.17%,78.08%,78.06%,77.57%,77.51%,76.99%,76.73%,76.63%,76.25%,73.80%
80,76.78%,76.55%,76.46%,76.44%,75.96%,75.90%,75.39%,75.13%,75.03%,74.65%,72.23%
90,73.82%,73.59%,73.51%,73.49%,73.01%,72.95%,72.44%,72.19%,72.08%,71.70%,69.28%


In [16]:
# Column labels '0%', '10%', ..., '100%'; one column per second-level index of model_results
# (presumably the confidence threshold k -- confirm against the code that writes the pickles).
column_labels = [str(pct) + '%' for pct in range(0, 101, 10)]

# Unlike precision/recall/F1, specificity is not read from the stored metric groups; it is
# recomputed here from the label sequences kept in each result entry.
results = []
for i in range(11):
    result = []
    for threshold in range(11):
        entry = model_results[i][threshold]
        y_test, y_pred = entry[Y_TEST], entry[Y_PRED]
        score = specificity_score(y_test, y_pred, average='weighted')
        result.append("{:.2f}%".format(score * 100))
    results.append(result)

print("Weighted Specificity")
results_df = pd.DataFrame(results, index = similarity_thresholds, columns = column_labels)
# NOTE(review): the cells are formatted strings, so the per-column maximum is a string
# comparison; it agrees with numeric order only while all values have the same number of
# integer digits.
results_df.style.highlight_max(color = 'lightgreen', axis = 0)

Weighted Specificity


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%
0,98.46%,98.38%,98.36%,98.35%,98.26%,98.25%,98.16%,98.11%,98.09%,98.01%,97.46%
10,98.46%,98.38%,98.36%,98.35%,98.26%,98.25%,98.16%,98.11%,98.09%,98.01%,97.46%
20,98.46%,98.38%,98.36%,98.35%,98.26%,98.25%,98.16%,98.11%,98.09%,98.01%,97.46%
30,98.31%,98.24%,98.22%,98.21%,98.12%,98.11%,98.01%,97.97%,97.94%,97.87%,97.34%
40,98.10%,98.02%,98.00%,97.99%,97.90%,97.89%,97.80%,97.75%,97.73%,97.65%,97.12%
50,97.90%,97.83%,97.81%,97.80%,97.71%,97.70%,97.61%,97.56%,97.54%,97.46%,96.91%
60,97.65%,97.58%,97.56%,97.55%,97.46%,97.45%,97.35%,97.30%,97.28%,97.21%,96.68%
70,97.36%,97.29%,97.27%,97.27%,97.17%,97.16%,97.07%,97.02%,97.00%,96.93%,96.40%
80,97.03%,96.96%,96.94%,96.93%,96.84%,96.83%,96.74%,96.69%,96.67%,96.60%,96.08%
90,96.48%,96.41%,96.39%,96.38%,96.29%,96.28%,96.19%,96.14%,96.12%,96.05%,95.55%
