# Generating predictions for the test sets

In [1]:
import numpy as np
import pandas as pd
import timeit
from sklearn import preprocessing
import matplotlib.pyplot as plt
import pickle as pkl
import umap
import umap.plot
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD

from matplotlib import rcParams
from numpy.random import seed

# Plot appearance: ggplot theme first, then the default figure size on top of it.
plt.style.use("ggplot")
rcParams.update({'figure.figsize': (12, 6)})

# Seed NumPy's global random generator with a fixed value.
seed(1)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from joblib import dump, load
import pickle as pkl
# Load the trained classifier; it is passed to predict() as `model` further down.
clf = load('models/svm_ver1.2.joblib')

# Load the fitted feature transformer; it is passed to predict() as `preprocess`
# and applied to each test matrix through its .transform() method.
# NOTE(review): the file name says "svd" while the original note described this as
# "PCA embeddings trained on full data" -- confirm which decomposition it holds.
# NOTE(review): unlike the classifier, this file is read from the working directory
# rather than from models/ -- confirm the path is intentional.
# NOTE(security): unpickling runs arbitrary code; only load files you trust.
with open('svd_ver1.2.pkl', 'rb') as pickle_file:
    preprocess=pkl.load(pickle_file) 

In [3]:
# Training labels; only the 'genome_name' column is used here.
df_y = pd.read_csv('datasets/train_labels.csv')

# Fit the encoder on the distinct genome names so that integer class ids can be
# mapped back to names with le.inverse_transform().
genome_names = df_y['genome_name'].unique()
le = preprocessing.LabelEncoder()
le.fit(genome_names)

In [4]:
#prediction for all patients
def predict(threshold=0.99, model=None, preprocess=None, test_ids=None, label_encoder=None):
    """Predict the pathogens present in each test set and write them to disk.

    For every test id the feature matrix is read from
    ``datasets/test_datasets/test{id}.6mer.npy``, optionally passed through
    ``preprocess.transform`` and scored with ``model.predict_proba``. A row
    contributes a prediction only if at least one of its class probabilities
    reaches ``threshold``; the row's predicted class is then its argmax. The
    distinct predicted classes are decoded to names, ``'decoy'`` is dropped,
    and the remaining names are printed and written to
    ``prediction2/test{id}.txt`` under a ``pred_label`` header line. If
    nothing but decoy (or nothing at all) is detected, ``'decoy'`` is reported.

    Parameters
    ----------
    threshold : float, default 0.99
        Minimum class probability a row needs for its prediction to count.
    model : object
        Fitted classifier exposing ``predict_proba``. Required.
    preprocess : object or None
        Optional fitted transformer exposing ``transform``; skipped when None.
    test_ids : iterable of int or None
        Test-set ids to process. Defaults to ``range(1, 14)``.
    label_encoder : object or None
        Object exposing ``inverse_transform`` that maps integer class ids to
        names. Defaults to the notebook-level ``le``.

    Returns
    -------
    None
        Results are printed and written to files.

    Raises
    ------
    ValueError
        If ``model`` is None.
    """
    if model is None:
        raise ValueError("predict() requires a fitted model exposing predict_proba")
    if test_ids is None:
        test_ids = range(1, 14)
    # Fall back to the encoder fitted at notebook level when none is supplied.
    encoder = le if label_encoder is None else label_encoder

    for test_id in test_ids:
        print('predicting for test {}'.format(test_id))

        with open('datasets/test_datasets/test{}.6mer.npy'.format(test_id), 'rb') as read_file:
            df_test = pd.DataFrame(np.load(read_file))

        if preprocess is not None:
            df_test = preprocess.transform(df_test)

        y_predprob = np.asarray(model.predict_proba(df_test))

        # Keep rows where some class reaches the threshold, take each such row's
        # argmax, and reduce to the distinct (sorted) class ids.
        # NOTE(review): column indices are passed straight to inverse_transform,
        # which assumes the model's class order matches the encoder's integer
        # codes -- confirm against how the model was trained.
        confident_rows = (y_predprob >= threshold).any(axis=1)
        class_ids = np.unique(y_predprob.argmax(axis=1)[confident_rows])

        # Pathogens detected; decoy is ignored. Plain str keeps the printed list
        # free of NumPy scalar reprs.
        final_predictions = [
            str(name) for name in encoder.inverse_transform(class_ids) if name != 'decoy'
        ]

        if not final_predictions:
            final_predictions = ['decoy']

        print(final_predictions)

        with open('prediction2/test{}.txt'.format(test_id), 'w') as f:
            f.write('pred_label \n')
            f.write('\n'.join(final_predictions))

In [5]:
# Run inference with the loaded classifier and feature transformer; predict() prints
# the detected labels for each test set and writes them to prediction2/test{id}.txt.
predict(model=clf, preprocess=preprocess)

predicting for test 1
['pseudomonas_aeruginosa']
predicting for test 2
['staphylococcus_aureus']
predicting for test 3
['corynebacterium_ulcerans', 'pseudomonas_aeruginosa']
predicting for test 4
['mycobacterium_ulcerans', 'staphylococcus_pyogenes']
predicting for test 5
['decoy']
predicting for test 6
['burkholderia_pseudomallei', 'corynebacterium_diphtheriae', 'mycobacterium_tuberculosis', 'mycobacterium_ulcerans', 'pseudomonas_aeruginosa']
predicting for test 7
['corynebacterium_diphtheriae']
predicting for test 8
['staphylococcus_aureus']
predicting for test 9
['corynebacterium_diphtheriae', 'corynebacterium_ulcerans', 'staphylococcus_aureus']
predicting for test 10
['corynebacterium_diphtheriae', 'corynebacterium_ulcerans']
predicting for test 11
['corynebacterium_diphtheriae', 'corynebacterium_ulcerans', 'mycobacterium_ulcerans']
predicting for test 12
['burkholderia_pseudomallei', 'corynebacterium_diphtheriae', 'mycobacterium_ulcerans']
predicting for test 13
['staphylococcus_au