In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
# load training dataset
with open('datasets/train.dataset.6mer.npy', 'rb') as open_file:
    df = np.load(open_file)
df = pd.DataFrame(df)

In [3]:
labels = pd.read_csv('datasets/train_labels.csv')

# snippet to load the grouth truth training labels and normalize the label predictions. 
# your trained model will predict in this space (31 classes - pathogens and decoy)
le = preprocessing.LabelEncoder()
le.fit(labels['genome_name'].unique())
y_index = le.transform(labels['genome_name'].values)

# use index as train labels
df['genome_label'] = y_index

In [21]:
df['genome_label'].value_counts()

7     424388
1       3787
16      3342
8       3166
13      2999
9       2840
19      2832
3       2715
18      2594
17      2578
30      2404
23      2388
12      2354
4       2248
0       2124
10      1801
29      1738
11      1576
28      1561
20      1489
21      1376
6       1306
5       1274
15      1194
25      1182
14      1150
27      1128
24      1092
26      1047
22       942
2        910
Name: genome_label, dtype: int64

In [17]:
df.head()

Unnamed: 0,3,9,10,11,13,14,15,18,19,22,...,2071,2072,2073,2074,2075,2076,2077,2078,2079,genome_label
0,0.000823,0.001646,0.000823,0.00247,0.000411,0.001235,0.0,0.001235,0.001235,0.000411,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20
1,0.001558,0.001039,0.001299,0.000779,0.000779,0.000779,0.000779,0.000779,0.002077,0.002077,...,0.000519,0.0,0.0,0.000519,0.00026,0.0,0.0,0.0,0.0,20
2,0.001851,0.000617,0.001234,0.0,0.000617,0.001234,0.001851,0.002468,0.000617,0.001851,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20
3,0.001378,0.001378,0.000276,0.003859,0.001102,0.001102,0.0,0.001378,0.001102,0.002756,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20
4,0.001701,0.00144,0.001178,0.001047,0.001047,0.001178,0.000785,0.000916,0.002487,0.001571,...,0.0,0.0,0.0,0.000262,0.000131,0.0,0.0,0.0,0.0,20


In [6]:
df.shape

(62924, 1919)

Remove samples with total k-mer count < 0.9

In [7]:
total_kmers = df.iloc[:, :-1].sum(axis=1)
df = df.loc[total_kmers >= 0.9, :]
y_index = y_index[total_kmers >= 0.9]

Downsample decoys

In [23]:
downsampled = df[df['genome_label'] == 7].sample(420601, random_state=4)
downsampled.index

Int64Index([415385, 276717, 301096,  88285,  15499, 377950, 329334, 425205,
             90461, 477715,
            ...
             94812, 381037, 167116, 117181, 139215, 154100, 240125, 385251,
            431883, 105292],
           dtype='int64', length=420601)

In [24]:
df.drop(index=downsampled.index, inplace=True)

Remove columns with high correlation

In [28]:
mask = np.triu(np.ones_like(df.corr(), dtype=bool)) # remove duplicate upper triangle correlation values
corr_matrix = df.corr().abs() # create positive correlation matrix
tri_df = corr_matrix.mask(mask) # create and apply upper triangle mask

In [29]:
high_corr = [c for c in tri_df.columns if any(tri_df[c] > 0.7)]
len(high_corr)

162

In [31]:
df.drop(columns=high_corr, inplace=True)

KeyError: '[0 1 2 4 5 6 7 8 12 16 17 20 21 24 28 32 36 48 64 65 68 69 80 81 84 127\n 189 250 251 254 265 266 267 269 280 310 311 314 325 326 328 427 441 465\n 468 484 513 644 647 658 684 687 688 743 813 815 827 840 850 851 853 888\n 890 891 904 905 907 916 952 987 999 1090 1091 1112 1139 1147 1148 1149\n 1152 1155 1246 1248 1373 1375 1391 1392 1393 1403 1404 1411 1490 1491\n 1499 1508 1509 1514 1515 1516 1517 1605 1612 1642 1643 1664 1665 1666\n 1671 1746 1752 1771 1772 1818 1820 1845 1846 1847 1852 1858 1859 1861\n 1862 1863 1864 1865 1866 1867 1868 1869 1883 1884 1888 1889 1902 1903\n 1906 1907 1912 1916 1918 1920 1921 1922 1923 1924 1925 1926 1929 1934\n 1935 1937 1938 1939 1987 1998 2000 2031 2041 2049 2050 2056 2062 2063] not found in axis'

In [43]:
df.to_csv('pre_svd.csv', index=False)

In [4]:
df = pd.read_csv('pre_svd.csv')

Feature extraction using tSVD

In [47]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500, random_state=4)
svd.fit(df.iloc[:, :-1].values)

TruncatedSVD(n_components=500, random_state=4)

In [45]:
import pickle as pk
pk.dump(svd, open('transformers/svd.pkl', 'wb'))

In [37]:
import pickle as pk
svd = pk.load(open('transformers/svd.pkl', 'rb'))

Subsample training set and perform dimensionality reduction

In [48]:
df_train = df.groupby('genome_label').apply(lambda x: x.sample(910, random_state=4))
X_train = svd.transform(df_train.iloc[:, :-1].values)
y_train = df_train['genome_label'].astype(int).values

In [49]:
X_train

array([[ 2.16119806e-02, -1.12246967e-02,  3.10852930e-04, ...,
         1.48419513e-04, -3.78807179e-04, -2.21491643e-04],
       [ 2.18505849e-02, -1.15034471e-02,  1.63952861e-03, ...,
         2.48144278e-04,  6.19929214e-04, -2.66408205e-04],
       [ 2.15662925e-02, -9.79539106e-03,  7.32436260e-04, ...,
         3.93659798e-04, -9.01028979e-05, -6.77395971e-05],
       ...,
       [ 2.17961952e-02, -5.53790644e-03, -2.73646639e-03, ...,
         5.97028994e-04, -4.89305601e-05,  1.07936952e-04],
       [ 2.26182567e-02, -1.38002959e-03, -4.36690458e-03, ...,
        -2.82249076e-04, -1.28196469e-05, -3.08790280e-05],
       [ 2.21711476e-02, -2.21356876e-03, -4.22842009e-03, ...,
         2.35392374e-04, -5.13989618e-04, -2.91539009e-04]])

In [50]:
df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,3,9,10,11,13,14,15,18,19,22,...,2071,2072,2073,2074,2075,2076,2077,2078,2079,genome_label
genome_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,34420,0.003378,0.001987,0.000199,0.002783,0.001590,0.001391,0.001391,0.001391,0.001391,0.000795,...,0.000795,0.000000,0.000596,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0
0,32679,0.001767,0.001632,0.000816,0.001224,0.001767,0.000544,0.001495,0.001359,0.000544,0.001495,...,0.000000,0.000136,0.000272,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0
0,33660,0.001410,0.000846,0.000564,0.001974,0.002115,0.001410,0.001268,0.000564,0.000846,0.000987,...,0.000000,0.000000,0.000282,0.000000,0.000141,0.000000,0.000000,0.000000,0.0,0
0,34726,0.001965,0.002321,0.000535,0.001071,0.002142,0.000893,0.001965,0.000893,0.001785,0.001250,...,0.000000,0.000000,0.000535,0.000000,0.000000,0.000000,0.000000,0.000357,0.0,0
0,32832,0.000804,0.001608,0.000804,0.002813,0.000000,0.001205,0.002010,0.000804,0.000000,0.002010,...,0.000804,0.000000,0.000402,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30,57745,0.001081,0.000480,0.000360,0.000480,0.000960,0.000480,0.000840,0.001560,0.000840,0.000480,...,0.000240,0.000480,0.000840,0.000000,0.000480,0.000360,0.000240,0.000000,0.0,30
30,57330,0.000887,0.000127,0.000253,0.000380,0.000634,0.000760,0.000380,0.000380,0.000253,0.000760,...,0.000253,0.000127,0.000634,0.000000,0.000507,0.000127,0.000127,0.000000,0.0,30
30,58178,0.000985,0.001232,0.000000,0.000739,0.000246,0.000493,0.000739,0.000739,0.000985,0.000739,...,0.000000,0.000246,0.000246,0.000000,0.000246,0.000000,0.000000,0.000493,0.0,30
30,57039,0.001042,0.000651,0.000261,0.000521,0.000521,0.000782,0.001564,0.000782,0.000782,0.000782,...,0.000521,0.000130,0.001173,0.000261,0.000391,0.000130,0.000261,0.000000,0.0,30


Train SVM model

In [51]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from joblib import dump, load
svm = SVC(C=10, kernel='rbf', gamma=10, probability=False, random_state=4)
model = CalibratedClassifierCV(svm, method='sigmoid', cv=5) # wrap the SVM classifier in a CalibratedClassifierCV instance
model.fit(X_train, y_train)

CalibratedClassifierCV(base_estimator=SVC(C=10, gamma=10, random_state=4), cv=5)

In [44]:
dump(model, 'models/svm.joblib')

['models/svm.joblib']

In [36]:
from joblib import load
model = load('models/svm.joblib')

Predictions

In [13]:
def precision_per_patient(patient_id, preds):
    df_true = pd.read_csv(f'datasets/validation/patient{patient_id}_labels.txt')
    tp, fp, tp_labels = 0, 0, df_true['true_label'].shape[0]
    print(f'my prediction(s) for patient {patient_id}: {preds}')
    truth = df_true['true_label'].values
    print(f'true pathogen(s): {truth}')
    #if don't predict any pathogen, it means there is only decoy in the test dataset (your prediction)
    if len(preds) == 0:
        preds = ['decoy']
    for item in np.unique(preds):
        if item in df_true['true_label'].values:
            tp+=1
        else:
            fp+=1
    #you have to predict all labels correctly, but you are penalized for any false positive
    return tp/(tp_labels+fp)

In [28]:
with open(f'datasets/validation/patient1.6mer.npy', 'rb') as read_file:
    df_test = np.load(read_file)
df_test = pd.DataFrame(df_test)
df_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2070,2071,2072,2073,2074,2075,2076,2077,2078,2079
0,0.000655,0.003929,0.001309,0.001528,0.002401,0.003492,0.001965,0.000655,0.000873,0.001091,...,0.000218,0.000873,0.000000,0.000000,0.000437,0.000000,0.000000,0.0,0.000000,0.000000
1,0.001521,0.000869,0.003042,0.001304,0.001304,0.000652,0.000652,0.001521,0.002172,0.001304,...,0.000000,0.000000,0.000000,0.000217,0.000434,0.000000,0.000000,0.0,0.000000,0.000000
2,0.001271,0.003496,0.000636,0.001271,0.002224,0.002542,0.003813,0.000953,0.001271,0.000953,...,0.000000,0.000636,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
3,0.002781,0.002411,0.001483,0.000927,0.002781,0.001854,0.001669,0.000556,0.002039,0.000741,...,0.000000,0.000741,0.000000,0.000000,0.000000,0.000371,0.000000,0.0,0.000000,0.000000
4,0.001933,0.003410,0.001478,0.001592,0.003639,0.003298,0.003071,0.001592,0.001137,0.001250,...,0.000114,0.000227,0.000000,0.000114,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9993,0.001433,0.002867,0.001433,0.000358,0.002867,0.003225,0.002867,0.001433,0.002151,0.000358,...,0.000358,0.000717,0.001076,0.000000,0.000717,0.000000,0.000000,0.0,0.000000,0.000000
9994,0.008995,0.003229,0.001384,0.000923,0.003229,0.001384,0.001615,0.001153,0.000461,0.001384,...,0.000692,0.000000,0.000461,0.000000,0.000461,0.001153,0.000231,0.0,0.000461,0.000461
9995,0.004356,0.001452,0.001816,0.002905,0.002542,0.001089,0.003267,0.000363,0.001452,0.001452,...,0.000363,0.000726,0.000726,0.000363,0.000000,0.000726,0.000000,0.0,0.000000,0.000000
9996,0.000876,0.000876,0.002628,0.000000,0.001753,0.000000,0.000876,0.001753,0.000000,0.003506,...,0.000876,0.003506,0.001753,0.000876,0.000000,0.001753,0.000000,0.0,0.000000,0.000000


In [31]:
feat = df_train.columns[:-1].values.astype(int)

In [40]:
len(feat)

1918

In [52]:
import time

def predict(threshold, model, transformer):
    start = time.time()

    all_precision = []
    for patient_id in range(1, 11):
        print(f'predicting for patient {patient_id}')
        
        with open(f'datasets/validation/patient{patient_id}.6mer.npy', 'rb') as read_file:
            df_test = np.load(read_file)
        df_test = pd.DataFrame(df_test)
        X_test = transformer.transform(df_test.loc[:, feat].values)

        # we can use model.predict_proba to find a good threshold and predict only for case where the model is confident
        y_predprob = model.predict_proba(X_test) # probability of each class per read
        
        # we get only predictions larger than the threshold and if there is more than one, we take the argmax again
        num_reads = X_test.shape[0]
        predictions_count = {}
        for read in y_predprob:
            print(np.max(read))
            if np.max(read) >= threshold:
                label = np.argmax(read)
                if label in predictions_count:
                    predictions_count[label] += 1
                else:
                    predictions_count[label] = 1
        print(predictions_count)
        final_predictions = le.inverse_transform([label for label, count in predictions_count.items() if count > 5]) # only consider labels with more than 5 reads predicting that label, can change to % num_reads
        
        # my pathogens detected, decoy will be ignored
        final_predictions = [item for item in final_predictions if item !='decoy']
        
        precision = precision_per_patient(patient_id, final_predictions)
        print(f'precision: {precision}')
        all_precision.append(precision)

    # performance per patient and its final average
    print([f'patient {c}: {item}' for c, item in enumerate(all_precision, start=1)])
    print(f'avg: {np.mean(all_precision)}')

    end = time.time()
    print(f'Time taken for prediction: {end - start} seconds')

predict(0.99, model, svd)

predicting for patient 1
[[1.41285914e-03 2.26661276e-13 4.24031640e-08 ... 1.35942793e-08
  7.50025335e-10 4.05550495e-08]
 [4.53872635e-06 5.09407894e-13 4.25357250e-08 ... 5.49291528e-07
  1.02596792e-09 1.81261193e-09]
 [1.06146365e-04 4.69804152e-13 4.87709057e-10 ... 1.66067970e-07
  9.85680997e-10 2.14344616e-07]
 ...
 [5.25304626e-08 5.34631927e-15 7.47404749e-13 ... 1.38204574e-08
  3.48247694e-08 2.45565843e-08]
 [2.03813152e-20 1.38442210e-14 2.57355193e-15 ... 1.19997177e-06
  2.14069226e-13 3.93216055e-06]
 [4.60813902e-07 7.70559188e-14 1.05848405e-07 ... 1.05667380e-08
  2.54374774e-11 1.25631701e-09]]
0.6685466043346386
0.6781654788248928
0.6423967578578118
0.7735425071169557
0.680372420474391
0.6882256080519014
0.7113964680266822
0.6390707455062917
0.686185460720706
0.5795403264630453
0.6876280365491455
0.6848735050419718
0.6868842056003741
0.6865137949885398
0.5692368379650261
0.6543944883250348
0.6814348041635645
0.6503172380125062
0.7260286970323426
0.75418958686886

KeyboardInterrupt: 