In [1]:
## Run ML methods on PanPred and panta outputs 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import datasets
from sklearn import svm
import random
import os
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from numpy import genfromtxt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import pandas as pd
import numpy as np

In [2]:
version = '_v9'  # Remove missing labels, and I = resistance

In [3]:
def run_ML(X_train, y_train, X_test, y_test, data_set, approach="Default"):
    base_dir = '/data/hoan/amromics/prediction/output/predPantaPanPred_v6'
    # pantaout_dir = '/data/hoan/amromics/prediction/output/pantaSaPatric/'
    if not os.path.isdir(base_dir):
        os.system('mkdir '+ base_dir)
    
    path_dir = base_dir +'/' + data_set + '_addsample_'+approach            
    # Save the test true labels
    np.savetxt(path_dir + "_test_true_labels.csv", y_test, delimiter=",")
    # LightGBM
    model = lgb.LGBMClassifier()
    model.fit(X_train, y_train)
    y_predict=model.predict(X_test) 
    np.savetxt(path_dir + "_LightGBM_labels.csv", y_predict, delimiter=",")
    print(f1_score(y_predict, y_test, average='macro'))

In [4]:
sample_isolate = pd.read_csv('/data/hoan/amromics/prediction/data/Ecoli1936metafiles/sample_isolate.csv')
sample_isolate.head(2)
isolate2sample = {}
for idx in range(len(sample_isolate.index)):
    isolate2sample[sample_isolate.iloc[idx,1]] =  sample_isolate.iloc[idx,0]+'.contig'

### Run PanPred 

In [5]:
# pandata = pd.read_csv("PanPred/test_data/gene_presence_absence.csv")

In [6]:
metadata = pd.read_csv('data/Ecoli1936metafiles/PanPred_Metadata.csv')
metadata = metadata.set_index(metadata['Isolate'])

In [7]:
accessorygene =  pd.read_csv('PanPred/test_data/AccessoryGene.csv', index_col=0)

In [8]:
populationstructure =  pd.read_csv('PanPred/test_data/PopulationStructure.csv_labelencoded.csv', index_col=0)

In [9]:
new_accessorygene = accessorygene.loc[metadata['Isolate']]

#### Run ML models

In [10]:
# for idx in range(2, 14):
#     y_class = metadata.iloc[:,idx].values
#     print(metadata.columns[idx])
#     y = np.array([1 if y_class[i]=='R' else 0 for i in range(1936)])
#     run_ML(new_accessorygene.values, y, 'Ecoli1936','classic')

In [11]:
# new_accessorygene.head(2)

### Run Panta

In [12]:
# Read data
pantaout_dir = '/data/hoan/amromics/prediction/output/pantatest1/'
pa_matrixdb = pd.read_pickle(pantaout_dir + 'PAmatrixdb.pkl')
pa_matrixnew = pd.read_pickle(pantaout_dir + 'PAmatrix.pkl')
# AMR Kmer
kmer_matrixdbdf = pd.read_pickle(pantaout_dir + 'AMRKmerdb.pkl')
kmer_matrixdftest = pd.read_pickle(pantaout_dir + 'AMRKmer.pkl')
# SNPs
snpmatrixdbfinaldf = pd.read_pickle(pantaout_dir + 'SNPmatrixdb.pkl')
snpmatrixtestfinaldf = pd.read_pickle(pantaout_dir + 'SNPmatrix.pkl')

In [13]:
metadata_panta = pd.read_csv("data/Ecoli1936metafiles/metadata_final.csv")

In [14]:
metadata_panta.shape

(1653, 14)

In [15]:
metadata_panta.head(2)

Unnamed: 0,Isolate,Year,CTZ,CTX,AMP,AMX,AMC,TZP,CXM,CET,GEN,TBM,TMP,CIP
0,11658_4#1,2006.0,S,S,S,,S,S,R,S,S,S,S,S
1,11657_5#1,2006.0,S,S,R,,R,S,S,S,S,S,R,R


In [16]:
sampleID = [isolate2sample[val] for val in metadata_panta['Isolate'].values]

In [17]:
metadata_panta.index = sampleID

In [18]:
train_isolate = list(pa_matrixdb.index)

In [19]:
test_isolate = list(set(pa_matrixnew.index).difference(set(train_isolate)))

In [20]:
# take train data
train_data = pd.concat([pa_matrixdb, kmer_matrixdbdf, snpmatrixdbfinaldf], axis=1)

In [21]:
full_data = pd.concat([pa_matrixnew, kmer_matrixdftest, snpmatrixtestfinaldf], axis=1)

In [22]:
full_data.head(2)

Unnamed: 0,groups_0,vgrG1,namA,groups_2,groups_4,groups_5,rsmI,flu_2,ltrA,intA_2,...,gstB_2@72,dnaE@110,groL@436,yibH@336,yihV@64,groups_1226@12,COQ5_1@3340,rihB@204,rihB@276,eae@2191
SAMEA2204229.contig,2,5,0,1,1,2,1,0,0,5,...,1.0,18.0,13.0,4.0,4.0,19.0,15.0,2.0,2.0,1.0
SAMEA2204230.contig,0,9,0,0,0,0,2,1,2,2,...,12.0,2.0,13.0,17.0,4.0,19.0,0.0,2.0,2.0,19.0


In [23]:
# test_isolate[0:4]

In [24]:
test_data = full_data.loc[test_isolate]

In [25]:
train_data_leaky = full_data.loc[train_isolate]

In [26]:
train_data_leaky.head(2)

Unnamed: 0,groups_0,vgrG1,namA,groups_2,groups_4,groups_5,rsmI,flu_2,ltrA,intA_2,...,gstB_2@72,dnaE@110,groL@436,yibH@336,yihV@64,groups_1226@12,COQ5_1@3340,rihB@204,rihB@276,eae@2191
SAMEA2204229.contig,2,5,0,1,1,2,1,0,0,5,...,1.0,18.0,13.0,4.0,4.0,19.0,15.0,2.0,2.0,1.0
SAMEA2204230.contig,0,9,0,0,0,0,2,1,2,2,...,12.0,2.0,13.0,17.0,4.0,19.0,0.0,2.0,2.0,19.0


In [27]:
test_data.head(2)

Unnamed: 0,groups_0,vgrG1,namA,groups_2,groups_4,groups_5,rsmI,flu_2,ltrA,intA_2,...,gstB_2@72,dnaE@110,groL@436,yibH@336,yihV@64,groups_1226@12,COQ5_1@3340,rihB@204,rihB@276,eae@2191
SAMEA2205477.contig,3,0,4,2,0,0,6,2,2,3,...,0.0,2.0,5.0,4.0,17.0,19.0,19.0,2.0,2.0,1.0
SAMEA2204952.contig,4,2,2,3,1,0,4,1,5,4,...,12.0,18.0,13.0,4.0,17.0,19.0,15.0,2.0,2.0,1.0


In [28]:
train_labels = metadata_panta.loc[train_isolate]
test_labels =  metadata_panta.loc[test_isolate]

In [29]:
## Preprocess PanPred
PanPred_data = new_accessorygene.loc[metadata_panta['Isolate'].values]
PanPred_data.index = metadata_panta.index

In [30]:
PanPred_data.head(2)

Unnamed: 0,yeiU,yhhS,ybaE,eutR,ibrB,ytfP,aslB,narQ,tolR,galM,...,group_48768,group_48873,group_48916,group_48933,group_48937,group_48958,group_49020,group_49174,group_49253,group_49257
SAMEA2204229.contig,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
SAMEA2204230.contig,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [31]:
PanPred_train = PanPred_data.loc[train_isolate]
PanPred_test = PanPred_data.loc[test_isolate]

In [32]:
from pangraph.utils import binary_label

In [33]:
# https://stackoverflow.com/questions/41458834/how-is-scikit-learn-cross-val-predict-accuracy-score-calculated
## No _ in the method name, please
max_idx_amr = 14; # max value = 14

In [34]:
# for idx in range(2, 3):
for idx in range(2, max_idx_amr):
### PanKA
    y_class = train_labels.iloc[:,idx].values
    y, nonenan_index = binary_label(y_class) # v6
    train_data_new = train_data.iloc[nonenan_index, ]
    y_train = y[nonenan_index]
    PanPred_train_data_new = PanPred_train.iloc[nonenan_index, ] # PanPred
    train_data_leaky_new = train_data_leaky.iloc[nonenan_index, ].values
    
    y_class = test_labels.iloc[:,idx].values
    y, nonenan_index = binary_label(y_class) # v6
    test_data_new = test_data.iloc[nonenan_index, ]
    y_test = y[nonenan_index]
    PanPred_test_data_new = PanPred_test.iloc[nonenan_index, ]
    
    # Run unimodal gene
    print(metadata_panta.columns[idx])
    run_ML(train_data_new, y_train, test_data_new, y_test, 'Ecoli1936', 'pantaAdd')
    run_ML(train_data_leaky_new, y_train, test_data_new, y_test, 'Ecoli1936', 'pantaAddLeaky')
    run_ML(PanPred_train_data_new, y_train, PanPred_test_data_new, y_test, 'Ecoli1936', 'PanPred')

CTZ
0.7904169024712319
0.7870571010248901
0.8108938132913199
CTX
0.980594831789371
0.980594831789371
0.9580832016850974
AMP
0.9550387596899225
0.9550387596899225
0.9160717731043797
AMX
0.9572042300066093
0.9530495542261992
0.8978303747534516
AMC
0.8137295081967213
0.8077632565264152
0.7935134925307472
TZP
0.4827144686299616
0.4827144686299616
0.4827144686299616
CXM
0.9037866158609192
0.8877897660506356
0.8392778753292361
CET
0.9338537178772366
0.9338537178772366
0.844286941580756
GEN
0.92666212843204
0.9316698050447079
0.945669714900484
TBM
0.7858703999054877
0.7858703999054877
0.7825304457375968
TMP
0.930952380952381
0.930952380952381
0.8821983273596177
CIP
0.9748305422647527
0.9823926781433863
0.8868721838963419
