In [1]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Read the k-mer archive and pull out the three arrays this notebook uses.
data = np.load('../data/interim/kmers/kmer_matrix.npz')
kmers, kmer_order, genome_order = (
    data[key] for key in ("kmers", "kmer_order", "genome_order")
)
# Sanity check of the matrix dimensions.
print(kmers.shape)

(2552, 1947555)


In [21]:
# Load MIC data: the class-order mapping first, then the class table,
# whose rows are selected by label in `genome_order` order.
mic_order = joblib.load('../data/interim/mic_class_order_dict2.pkl')
mics = joblib.load('../data/interim/mic_class_dataframe2.pkl').loc[genome_order]

In [49]:
# Function to compute within 1 dilution accuracy

def within1d(y1,y2,w1d):
    """Tell whether `y1` matches `y2` or one of the two entries of `w1d`.

    Parameters
    ----------
    y1 : value that is being checked.
    y2 : reference value; an exact match counts as a hit.
    w1d : indexable whose items 0 and 1 are also accepted as hits.

    Returns
    -------
    bool
        True if `y1` equals `y2`, `w1d[0]` or `w1d[1]`; False otherwise.
    """
    # Short-circuit evaluation keeps the same order of comparisons:
    # w1d is only indexed when the earlier checks did not match.
    matched = y1 == y2 or y1 == w1d[0] or y1 == w1d[1]
    return True if matched else False

def within1d_accuracy(y_pred,y_true,le,mo):
    """Fraction of predictions equal to, or adjacent to, the true class.

    "Adjacent" is defined by the order of `mo`: a prediction counts as
    correct when its position on that ordered scale differs from the
    position of the true label by at most one step.

    Parameters
    ----------
    y_pred : sequence of encoded predicted labels.
    y_true : sequence of encoded true labels, same length as `y_pred`.
    le : fitted encoder exposing `transform(labels)`; used to convert the
        entries of `mo` into the same encoded space as `y_pred`/`y_true`.
    mo : sequence of un-encoded class labels listed in their natural order.

    Returns
    -------
    float
        Share of samples predicted within one step of the truth.
        NaN when there are no samples.

    Raises
    ------
    ValueError
        If `y_pred` and `y_true` have different lengths.
    KeyError
        If a true label is not among the encoded entries of `mo`.
    """
    n_samples = len(y_pred)
    if n_samples != len(y_true):
        # zip() would silently drop the surplus items and skew the ratio.
        raise ValueError(
            "y_pred and y_true must have the same length (got %d and %d)"
            % (n_samples, len(y_true))
        )
    if n_samples == 0:
        # Nothing to score; avoid a division-by-zero warning.
        return np.float64(np.nan)

    # Position of each encoded label on the ordered scale.  Comparing
    # positions removes the need for placeholder neighbours at both ends.
    position = {label: pos for pos, label in enumerate(le.transform(mo))}

    hits = []
    for predicted, actual in zip(y_pred, y_true):
        actual_pos = position[actual]
        # A predicted label that is not on the scale can never be a hit.
        predicted_pos = position.get(predicted)
        hits.append(
            predicted_pos is not None and abs(predicted_pos - actual_pos) <= 1
        )

    return np.sum(hits) / n_samples

    


In [51]:
# Tunable settings for the per-column experiment below.
TEST_SIZE = 0.20      # share of samples held out for evaluation
SPLIT_SEED = 36       # random_state of the train/test split
N_FEATURES = 270      # number of k-mers kept by the univariate filter
SVM_C = 1             # regularisation strength of the linear SVM

# Train and evaluate one classifier per column of `mics`.
for i in mics:
    y = mics[i]
    # Keep only rows with a usable label: not the 'invalid' marker and not
    # missing.
    # NOTE(review): the original mask was `(y != 'invalid') & (y )`, i.e. the
    # comparison AND-ed with the raw label column itself.  A non-null check is
    # assumed to be the intent -- confirm.
    has_mic = (y != 'invalid') & y.notnull()
    y = y[has_mic]
    # Positional boolean mask; presumably the rows of `kmers` follow
    # `genome_order`, which `mics` was reindexed to -- verify.
    X = kmers[has_mic.values,:]

    # Encode labels
    le = LabelEncoder()
    y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    # Univariate feature selection, fitted on the training split only.
    feature_selection = SelectKBest(f_classif, k=N_FEATURES)
    X_train_fs1 = feature_selection.fit_transform(X_train, y_train)
    X_test_fs1 = feature_selection.transform(X_test)

    svm = LinearSVC(C=SVM_C, penalty='l1', dual=False)
    svm.fit(X_train_fs1,y_train)
    y_svm = svm.predict(X_test_fs1)

    # Report: column name, exact accuracy, within-one-step accuracy,
    # per-class metrics and the confusion matrix.
    print("===============================")
    print(i)
    print(accuracy_score(y_test,y_svm))
    print(within1d_accuracy(y_svm,y_test,le,mic_order[i]))
    print(classification_report(y_test,y_svm))
    print(confusion_matrix(y_test,y_svm))



AMC
0.8228346456692913
0.9330708661417323
             precision    recall  f1-score   support

          0       0.36      0.39      0.38        33
          1       0.00      0.00      0.00        27
          2       0.29      0.33      0.31        15
          3       0.71      0.69      0.70        39
          4       0.89      0.97      0.93       297
          5       0.91      0.88      0.89        97

avg / total       0.78      0.82      0.80       508

[[ 13   0   6   4   5   5]
 [  0   0   0   0  27   0]
 [  5   0   5   2   1   2]
 [  6   0   4  27   1   1]
 [  5   1   1   2 288   0]
 [  7   0   1   3   1  85]]
AMP
0.8960784313725491
0.9764705882352941
             precision    recall  f1-score   support

          1       0.47      0.33      0.39        42
          2       0.00      0.00      0.00         3
          3       0.00      0.00      0.00         1
          4       0.89      0.94      0.91       274
          5       0.98      0.97      0.98       190

avg / 

  'precision', 'predicted', average, warn_for)
  f = msb / msw


AZM
0.7317073170731707
0.9822616407982262
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         4
          1       0.62      0.29      0.40        86
          2       0.75      0.95      0.84       318
          3       0.17      0.03      0.05        34
          4       0.00      0.00      0.00         5
          5       1.00      0.50      0.67         4

avg / total       0.67      0.73      0.68       451

[[  0   0   3   1   0   0]
 [  0  25  59   0   2   0]
 [  0  13 302   3   0   0]
 [  0   0  33   1   0   0]
 [  0   2   2   1   0   0]
 [  0   0   2   0   0   2]]




CHL
0.6659528907922913
0.974304068522484
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         7
          1       1.00      0.33      0.50         3
          2       0.74      0.40      0.52       187
          3       0.63      0.90      0.74       237
          4       0.00      0.00      0.00         5
          5       0.96      0.79      0.86        28

avg / total       0.68      0.67      0.64       467

[[  0   0   0   7   0   0]
 [  0   1   1   1   0   0]
 [  0   0  74 112   0   1]
 [  1   0  21 214   1   0]
 [  0   0   3   2   0   0]
 [  2   0   1   3   0  22]]
CIP
0.8531746031746031
0.9702380952380952
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        58
          1       0.00      0.00      0.00         1
          3       0.00      0.00      0.00         0
          4       0.00      0.00      0.00         6
          5       0.00      0.00      0.00         4
        

  'recall', 'true', average, warn_for)


CRO
0.8755020080321285
0.9417670682730924
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         6
          2       0.34      0.61      0.44        23
          4       0.25      0.07      0.11        14
          5       0.25      0.20      0.22         5
          6       0.35      0.30      0.33        23
          7       0.97      0.99      0.98       390
          8       0.82      0.76      0.79        37

avg / total       0.87      0.88      0.87       498

[[  0   0   0   0   0   6   0]
 [  0  14   0   1   5   0   3]
 [  0   6   1   0   7   0   0]
 [  0   3   0   1   0   1   0]
 [  0  13   1   1   7   1   0]
 [  0   2   0   0   0 385   3]
 [  0   3   2   1   1   2  28]]




FIS
0.696969696969697
0.9523809523809523
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         4
          1       0.00      0.00      0.00         3
          2       0.58      0.79      0.67       158
          3       0.39      0.24      0.30        49
          4       0.52      0.28      0.37        85
          5       0.96      0.99      0.97       163

avg / total       0.67      0.70      0.67       462

[[  0   0   2   2   0   0]
 [  0   0   2   1   0   0]
 [  0   0 125  13  16   4]
 [  0   0  31  12   5   1]
 [  0   0  56   3  24   2]
 [  0   0   1   0   1 161]]
FOX
0.6995884773662552
0.948559670781893
             precision    recall  f1-score   support

          0       0.47      0.35      0.40        26
          1       0.76      0.88      0.81       219
          2       0.62      0.61      0.61       125
          3       0.00      0.00      0.00        26
          4       0.00      0.00      0.00        16
         



NAL
0.6616702355460385
0.9700214132762313
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         3
          1       0.33      0.50      0.40         2
          2       0.41      0.06      0.11       139
          3       1.00      0.50      0.67         2
          4       0.68      0.97      0.80       306
          5       0.00      0.00      0.00         9
          6       0.40      0.33      0.36         6

avg / total       0.58      0.66      0.57       467

[[  0   0   0   0   2   0   1]
 [  0   1   0   0   1   0   0]
 [  0   0   9   0 129   0   1]
 [  0   1   0   1   0   0   0]
 [  0   1   8   0 296   0   1]
 [  0   0   3   0   6   0   0]
 [  0   0   2   0   2   0   2]]


  f = msb / msw


SXT
0.8273195876288659
0.8737113402061856
             precision    recall  f1-score   support

          0       0.67      0.42      0.52        19
          1       0.00      0.00      0.00         3
          2       0.00      0.00      0.00         1
          6       0.67      0.08      0.15        24
          7       0.00      0.00      0.00         1
          8       0.85      0.98      0.91       314
          9       0.44      0.15      0.23        26

avg / total       0.79      0.83      0.78       388

[[  8   0   0   0   0  11   0]
 [  1   0   0   1   0   1   0]
 [  0   0   0   0   0   1   0]
 [  0   0   0   2   0  19   3]
 [  0   0   0   0   0   1   0]
 [  3   2   0   0   0 307   2]
 [  0   0   0   0   0  22   4]]
TET
0.9385245901639344
0.9774590163934426
             precision    recall  f1-score   support

          0       0.33      0.33      0.33         3
          1       0.57      0.68      0.62        19
          2       0.00      0.00      0.00         5
     