In [8]:
import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, matthews_corrcoef

In [2]:
data = np.load('../data/interim/kmers/kmer_matrix.npz')
kmers = data["kmers"]
kmer_order = data["kmer_order"]
genome_order = data["genome_order"]
print(kmers.shape)
# kmers = data['kmers']
# kmer_order = data['kmer_order']
# kmers.shape

(2552, 1947555)


In [3]:
from sklearn.metrics import matthews_corrcoef# Load MIC data
mics = joblib.load('../data/interim/mic_class_dataframe2.pkl')
mic_order = joblib.load('../data/interim/mic_class_order_dict2.pkl')
mics = mics.loc[genome_order]

In [4]:
# Function to compute within 1 dilution accuracy

def within1d(y1,y2,w1d):
    if y1 == y2:
        return True
    elif y1 == w1d[0]:
        return True
    elif y1 == w1d[1]:
        return True
    else:
        return False

def within1d_accuracy(y_pred,y_true,le,mo):
    # Turn the MIC order into a dictionary
    ordered_indices = le.transform(mo)
    within1d_dict = {}
    length = len(ordered_indices)
    for i in range(length):
        label = ordered_indices[i]
        upper = "NA"
        lower = "NA"
        if i < (length-1):
            upper = ordered_indices[i+1]
        if i > 0:
            lower = ordered_indices[i-1]
        
        within1d_dict[label] = [upper, lower]
        
    return np.sum([ within1d(y1,y2,within1d_dict[y2]) for y1,y2 in zip(y_pred,y_true) ])/len(y_pred)

    


In [46]:
for i in mics:
    #i = "AMC"
    y = mics[i]
    has_mic = (y != 'invalid') & (y )
    y = y[has_mic]
    X = kmers[has_mic,:]

    # Encode labels
    le = LabelEncoder()
    le.fit(y)
    y = le.transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=36)
    # print(np.unique(y, return_counts=True))
    # print(np.unique(y_train, return_counts=True))
    # print(np.unique(y_test, return_counts=True))

    feature_selection = SelectKBest(f_classif, k=270)
    feature_selection.fit(X_train, y_train)
    X_train_fs1 = feature_selection.transform(X_train)
    X_test_fs1 = feature_selection.transform(X_test)

    svm = LinearSVC(C=1, penalty='l1', dual=False)
    svm.fit(X_train_fs1,y_train)
    y_svm = svm.predict(X_test_fs1)

    print("\n\n===============================")
    print(i)
    labels = le.classes_
    print("accuracy:",accuracy_score(y_test,y_svm))
    print("1-d accuracy:",within1d_accuracy(y_svm,y_test,le,mic_order[i]))
    print("mcc:",matthews_corrcoef(y_test, y_svm))
    print(classification_report(y_test,y_svm,target_names=labels))
    cm = confusion_matrix(y_test,y_svm)
    labels2 = labels[np.unique(np.concatenate((y_test, y_svm)))]
    df = pd.DataFrame(cm, columns=labels2, index=labels2)
    print(df)




AMC
accuracy: 0.8267716535433071
1-d accuracy: 0.9330708661417323
mcc: 0.7059615900760681
             precision    recall  f1-score   support

    16.0000       0.40      0.42      0.41        33
     2.0000       0.00      0.00      0.00        27
     4.0000       0.29      0.33      0.31        15
     8.0000       0.73      0.69      0.71        39
   <=1.0000       0.89      0.97      0.93       297
  >=32.0000       0.91      0.89      0.90        97

avg / total       0.79      0.83      0.80       508

           16.0000  2.0000  4.0000  8.0000  <=1.0000  >=32.0000
16.0000         14       0       5       3         6          5
2.0000           0       0       0       0        27          0
4.0000           5       0       5       2         1          2
8.0000           6       0       4      27         1          1
<=1.0000         4       1       2       2       288          0
>=32.0000        6       0       1       3         1         86


AMP
accuracy: 0.896078431372549

  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  f = msb / msw
  f = msb / msw




AZM
accuracy: 0.729490022172949
1-d accuracy: 0.9822616407982262
mcc: 0.27898680581571567
             precision    recall  f1-score   support

    16.0000       0.00      0.00      0.00         4
     2.0000       0.61      0.29      0.39        86
     4.0000       0.75      0.95      0.84       318
     8.0000       0.17      0.03      0.05        34
   <=1.0000       0.00      0.00      0.00         5
   >16.0000       1.00      0.50      0.67         4

avg / total       0.67      0.73      0.68       451

          16.0000  2.0000  4.0000  8.0000  <=1.0000  >16.0000
16.0000         0       0       3       1         0         0
2.0000          0      25      59       0         2         0
4.0000          0      14     301       3         0         0
8.0000          0       0      33       1         0         0
<=1.0000        0       2       2       1         0         0
>16.0000        0       0       2       0         0         2






CHL
accuracy: 0.6745182012847966
1-d accuracy: 0.9764453961456103
mcc: 0.4373690928387482
             precision    recall  f1-score   support

    16.0000       0.00      0.00      0.00         7
    32.0000       1.00      0.33      0.50         3
     4.0000       0.75      0.40      0.52       187
     8.0000       0.63      0.91      0.75       237
   <=2.0000       0.00      0.00      0.00         5
   >32.0000       0.96      0.82      0.88        28

avg / total       0.69      0.67      0.64       467

          16.0000  32.0000  4.0000  8.0000  <=2.0000  >32.0000
16.0000         0        0       0       7         0         0
32.0000         0        1       1       1         0         0
4.0000          0        0      75     111         0         1
8.0000          0        0      20     216         1         0
<=2.0000        0        0       3       2         0         0
>32.0000        0        0       1       4         0        23


CIP
accuracy: 0.8551587301587301
1-d a

  .format(len(labels), len(target_names))
  'recall', 'true', average, warn_for)


          0.0300  0.0600  0.2500  0.5000  1.0000  <=0.0150  >=4.0000
0.0300         0       0       0       0       0        58         0
0.0600         0       0       0       0       0         1         0
0.2500         0       0       0       0       0         0         0
0.5000         0       0       0       0       0         6         0
1.0000         0       0       1       0       0         3         0
<=0.0150       1       0       0       1       0       390         1
>=4.0000       0       0       0       0       0         1        41


CRO
accuracy: 0.8775100401606426
1-d accuracy: 0.9437751004016064
mcc: 0.672561396214367
             precision    recall  f1-score   support

     0.5000       0.00      0.00      0.00         6
     1.0000       0.34      0.61      0.44        23
    16.0000       0.20      0.07      0.11        14
     2.0000       0.25      0.20      0.22         5
    32.0000       0.37      0.30      0.33        23
     4.0000       0.98      0.98      





FIS
accuracy: 0.696969696969697
1-d accuracy: 0.9545454545454546
mcc: 0.5699075600159483
             precision    recall  f1-score   support

   128.0000       0.00      0.00      0.00         4
   256.0000       0.00      0.00      0.00         3
    32.0000       0.58      0.80      0.67       158
    64.0000       0.38      0.24      0.30        49
  <=16.0000       0.52      0.27      0.36        85
  >256.0000       0.96      0.99      0.97       163

avg / total       0.67      0.70      0.67       462

           128.0000  256.0000  32.0000  64.0000  <=16.0000  >256.0000
128.0000          0         0        2        2          0          0
256.0000          0         0        2        1          0          0
32.0000           0         0      126       13         16          3
64.0000           0         0       32       12          4          1
<=16.0000         0         0       56        3         23          3
>256.0000         0         0        0        1          1    





NAL
accuracy: 0.6638115631691649
1-d accuracy: 0.9721627408993576
mcc: 0.150553544347903
             precision    recall  f1-score   support

     1.0000       0.00      0.00      0.00         3
    16.0000       0.33      0.50      0.40         2
     2.0000       0.41      0.06      0.11       139
    32.0000       1.00      0.50      0.67         2
     4.0000       0.68      0.97      0.80       306
     8.0000       0.00      0.00      0.00         9
   >32.0000       0.50      0.33      0.40         6

avg / total       0.58      0.66      0.57       467

          1.0000  16.0000  2.0000  32.0000  4.0000  8.0000  >32.0000
1.0000         0        0       0        0       2       0         1
16.0000        0        1       0        0       1       0         0
2.0000         0        0       9        0     129       0         1
32.0000        0        1       0        1       0       0         0
4.0000         0        1       8        0     297       0         0
8.0000         





SXT
accuracy: 0.8273195876288659
1-d accuracy: 0.8762886597938144
mcc: 0.3324434747744018
             precision    recall  f1-score   support

     0.2500       0.62      0.42      0.50        19
     0.5000       0.00      0.00      0.00         3
     1.0000       0.00      0.00      0.00         1
    16.0000       1.00      0.08      0.15        24
    32.0000       0.00      0.00      0.00         1
     4.0000       0.85      0.98      0.91       314
    64.0000       0.44      0.15      0.23        26

avg / total       0.81      0.83      0.78       388

          0.2500  0.5000  1.0000  64.0000  8.0000  <=0.1250  >64.0000
0.2500         8       0       0        0       0        11         0
0.5000         2       0       0        0       0         1         0
1.0000         0       0       0        0       0         1         0
64.0000        0       0       0        2       0        19         3
8.0000         0       0       0        0       0         1         0
<=0.1250

  .format(len(labels), len(target_names))




TET
accuracy: 0.9426229508196722
1-d accuracy: 0.9795081967213115
mcc: 0.895376468230389
             precision    recall  f1-score   support

    16.0000       0.25      0.33      0.29         3
    32.0000       0.65      0.68      0.67        19
     8.0000       0.00      0.00      0.00         5
   <=4.0000       0.97      0.98      0.97       217
   >32.0000       0.96      0.96      0.96       244

avg / total       0.93      0.94      0.94       488

          16.0000  32.0000  8.0000  <=4.0000  >32.0000
16.0000         1        1       0         0         1
32.0000         1       13       0         0         5
8.0000          1        0       0         4         0
<=4.0000        0        0       0       212         5
>32.0000        1        6       0         3       234


TIO
accuracy: 0.7494646680942184
1-d accuracy: 0.9892933618843683
mcc: 0.5441047907715568
             precision    recall  f1-score   support

     0.2500       0.00      0.00      0.00         2
     0

  .format(len(labels), len(target_names))
