In [1]:
## Run ML methods on PanPred and panta outputs 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import datasets
from sklearn import svm
import random
import os
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from numpy import genfromtxt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
import pandas as pd
import numpy as np

In [2]:
def run_ML(X, y, data_set, approach="Default", feature_selection = False, FS_method = 'mutual_info_classif', X2 = None):
    """Evaluate a LightGBM classifier with repeated shuffled K-fold cross-validation.

    The samples are split ``n_loops`` times into ``n_folds`` shuffled folds
    (the repetition index is used as the KFold seed). For every split a fresh
    ``LGBMClassifier`` is fit on the training rows; the true and predicted
    test labels are written as CSV files under ``base_dir`` and the macro F1
    of the split is recorded. The mean macro F1 over all splits is printed.

    Parameters
    ----------
    X : array
        Feature matrix; rows are selected with integer index arrays
        (``X[train_idx]``), so a positional-indexable array is required.
    y : array
        Labels, indexed the same way as ``X``; ``y.shape[0]`` is reported as
        the sample count.
    data_set : str
        Prefix of every output file name.
    approach : str
        Suffix of every output file name.
    feature_selection : bool
        Only consulted when ``X2`` is given: if true, the 20000 best chi2
        columns of ``X2`` (fit on the training rows) are kept.
    FS_method : str
        Only read by the feature-selection step for ``X``, which is currently
        switched off ('mutual_info_classif' or 'chi2').
    X2 : array or None
        Optional second feature matrix, appended column-wise to ``X``.

    Returns
    -------
    None
        Results are printed and written to disk.
    """
    # base_dir = '/data/hoan/amromics/prediction/output/predPantaPanPred'+version
    base_dir = '/data/hoan/amromics/prediction/output/predPantaPanPred_v6'
    # Create the output directory (including missing parents) without
    # spawning a shell; a no-op when it already exists.
    os.makedirs(base_dir, exist_ok=True)
    score = []
    methods = []
    n_loops = 2
    n_folds = 5
    n_samples = y.shape[0]
    # Univariate selection on X is switched off (this branch was hard-disabled).
    select_on_X = False
    if X2 is not None:
        print("Original shape of input:", X.shape, X2.shape)
    for i in range(n_loops):
        cv = KFold(n_splits=n_folds, shuffle=True, random_state = i)
        for fold, (train_idx, test_idx) in enumerate(cv.split(X)):
            path_dir = base_dir +'/' + data_set + '_run_'+str(i)+'_'+ 'fold_'+str(fold)+'_'+approach
            print('Run: ', i, ', fold: ', fold)
            X_train = X[train_idx]
            X_test = X[test_idx]
            y_train = y[train_idx]
            y_test = y[test_idx]
            if select_on_X:
                if i <= 0:
                    print("Run feature selection", 'method = ', FS_method)
                if FS_method == 'mutual_info_classif':
                    score_func = mutual_info_classif
                elif FS_method == 'chi2':
                    score_func = chi2
                else:
                    # Fail here instead of hitting an undefined selector below.
                    raise ValueError("Please input correct feature selection method")
                # The selector is fit on the training rows only and reused for the test rows.
                fs_fit = SelectKBest(score_func, k=1000).fit(X_train, y_train)
                X_train = fs_fit.transform(X_train)
                X_test = fs_fit.transform(X_test)
            if X2 is not None:
                X2_train = X2[train_idx]
                X2_test = X2[test_idx]
                if feature_selection:
                    fs2_fit = SelectKBest(chi2, k=20000).fit(X2_train, y_train)
                    X2_train = fs2_fit.transform(X2_train)
                    X2_test = fs2_fit.transform(X2_test)
                # Combine both feature sets column-wise.
                X_train = np.append(X_train, X2_train, axis = 1)
                X_test = np.append(X_test, X2_test, axis = 1)

            # Save the test true labels
            np.savetxt(path_dir + "_test_true_labels.csv", y_test, delimiter=",")
            if i <= 0 and fold <= 0:
                print("n_samples: ", n_samples)
                print("Reduced shape of the data: ", X_train.shape, X_test.shape)
            print(test_idx[:10])

            model = lgb.LGBMClassifier()
            model.fit(X_train, y_train)
            methods.append('LightGBM')
            print(methods[-1], end =', ')
            y_predict = model.predict(X_test)
            np.savetxt(path_dir + "_LightGBM_labels.csv", y_predict, delimiter=",")
            # Conventional (y_true, y_pred) order; macro F1 is symmetric in the
            # two arguments, so the recorded value is the same as before.
            score.append(f1_score(y_test, y_predict, average='macro'))

    # Print statistics
    n_methods = len(set(methods))
    score_np = np.array(score)
    # Each column is a method
    print(methods[:n_methods])
    average_score = np.mean(score_np.reshape((n_loops*n_folds, n_methods)), axis=0)
    print(np.round(average_score, 2))

### Run PanPred 

In [3]:
# pandata = pd.read_csv("PanPred/test_data/gene_presence_absence.csv")

In [4]:
# Load the PanPred metadata and index it by isolate name; drop=False keeps
# the 'Isolate' column available as a regular column as well.
metadata_raw = pd.read_csv('data/Ecoli1936metafiles/PanPred_Metadata.csv')
metadata = metadata_raw.set_index('Isolate', drop=False)

In [5]:
# PanPred accessory-gene table; the first CSV column becomes the row index.
accessorygene =  pd.read_csv('PanPred/test_data/AccessoryGene.csv', index_col=0)

In [6]:
# PanPred population-structure table (label-encoded, per the file name); the first CSV column becomes the row index.
populationstructure =  pd.read_csv('PanPred/test_data/PopulationStructure.csv_labelencoded.csv', index_col=0)

In [7]:
# Select the accessory-gene rows by isolate name so they follow the row order of the metadata table.
new_accessorygene = accessorygene.loc[metadata['Isolate']]

#### Run ML models

In [8]:
# for idx in range(2, 14):
#     y_class = metadata.iloc[:,idx].values
#     print(metadata.columns[idx])
#     y = np.array([1 if y_class[i]=='R' else 0 for i in range(1936)])
#     run_ML(new_accessorygene.values, y, 'Ecoli1936','classic')

In [9]:
# new_accessorygene.head(2)

### Run Panta

In [10]:
# Build a lookup from '<first column value>.contig' to the second column value.
# NOTE(review): presumably column 0 is the sample accession and column 1 the
# isolate name used in the metadata index — confirm against the CSV header.
sample_isolate = pd.read_csv('/data/hoan/amromics/prediction/data/Ecoli1936metafiles/sample_isolate.csv')
# Iterate the two columns directly instead of one positional lookup per cell.
sample2isolate = {
    sample + '.contig': isolate
    for sample, isolate in zip(sample_isolate.iloc[:, 0], sample_isolate.iloc[:, 1])
}

In [11]:
# Suffix appended to the panta output directory name when loading the presence/absence table.
version = '_v9'

In [12]:
# pa_matrix = pd.read_csv('/data/hoan/amromics/prediction/output/pantaEcoli1936/gene_presence_absence.Rtab', sep='\t', index_col=0).T
# Load the tab-separated gene presence/absence table and transpose it.
# NOTE(review): presumably samples become the rows after the transpose (the index is
# looked up in sample2isolate further down) — confirm against the Rtab layout.
pa_matrix = pd.read_csv('/data/hoan/amromics/prediction/output/pantaEcoli1936align'+version+'/gene_presence_absence.Rtab', sep='\t', index_col=0).T

In [13]:
# Translate every row label of pa_matrix to its isolate name (KeyError if a label is missing from the lookup).
isolate_index = [sample2isolate[sample] for sample in pa_matrix.index]
# Metadata rows re-ordered to follow the row order of pa_matrix.
metadata_panta = metadata.loc[isolate_index]

In [14]:
# Preview the first two rows of the re-ordered metadata.
metadata_panta.head(2)

Unnamed: 0_level_0,Isolate,Year,CTZ,CTX,AMP,AMX,AMC,TZP,CXM,CET,GEN,TBM,TMP,CIP
Isolate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11658_4#1,11658_4#1,2006.0,S,S,S,,S,S,R,S,S,S,S,S
11657_5#1,11657_5#1,2006.0,S,S,R,,R,S,S,S,S,S,R,R


In [15]:
# Row labels of pa_matrix as a plain list, in matrix row order.
sample_list = list(pa_matrix.index)

In [16]:
# metadata_panta

In [17]:
# File names of the per-sample FASTA files: each row label with '.faa' appended.
sample_list_faa = [sample + '.faa' for sample in sample_list] #, 'SAMEA2204229.contig.fna'

In [18]:
# Number of FASTA files, i.e. number of rows in pa_matrix.
len(sample_list_faa)

1653

In [19]:
from Bio import SeqIO
from skbio import Sequence

  import pandas.util.testing as pdt


In [20]:
# Debug marker left over from interactive use; it only prints a fixed string.
print("I am here")

I am here


In [104]:
## Divide into 3 folds
# Start of the sample window processed in this run (the three windows start at 0, 600 and 1200).
min_idx = 0
# Windows that start below 1000 cover 600 samples; otherwise the window runs to the end of the list.
if min_idx < 1000:
    max_idx = min_idx + 600
else:
    max_idx = len(sample_list_faa)
# Fold label derived from the window start; it is embedded in the saved file names.
data_fold = str(round(min_idx / 600))

In [105]:
# Display the active window and fold label.
# NOTE(review): the stored output (1200, 1653, '2') was produced with min_idx = 1200,
# not with the value currently assigned in the fold-definition cell.
min_idx, max_idx, data_fold

(1200, 1653, '2')

In [58]:
import timeit

# Length of the k-mers cut from every sequence.
ksize = 10
# Union of all distinct k-mers seen in the processed samples.
kmer_seq_set = set()
# Row counter: position of the current sample within the processed window.
sample_idx = 0
# One (row, k-mer) entry per k-mer occurrence; used later to fill the matrix.
pairdata = []
faa_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/'
for seq_idx in range(min_idx, max_idx):
    data_dir = faa_dir + sample_list_faa[seq_idx]
    print(data_dir)
    kmer_seq = []
    # The context manager closes the FASTA file after parsing; previously one
    # file handle per sample was left open.
    with open(data_dir) as fasta_handle:
        for fasta in SeqIO.parse(fasta_handle, 'fasta'):
            sequence = str(fasta.seq)
            # Sequences shorter than ksize yield a non-positive count and are skipped.
            n_kmers = len(sequence) - ksize + 1
            for i in range(n_kmers):
                kmer = sequence[i:i + ksize]
                kmer_seq.append(kmer)
                pairdata.append((sample_idx, kmer))

    sample_idx += 1
    kmer_seq_set.update(kmer_seq)

/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/SAMEA2204229.contig.faa
/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/SAMEA2204230.contig.faa
/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/SAMEA2204231.contig.faa
/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/SAMEA2204232.contig.faa
/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/SAMEA2204233.contig.faa
/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/SAMEA2204234.contig.faa
/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/SAMEA2204235.contig.faa
/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/SAMEA2204236.contig.faa


In [59]:
# Number of distinct k-mers collected from the processed samples.
len(kmer_seq_set)

2612113

In [1]:
# for idx in range(800, 1200):
# # for idx in range(1200, len(sample_list_faa)):
#     data_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/'+sample_list_faa[idx][:-3]
#     # cmd = 'cp -r '+data_dir+'* /data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/Fold1' # 0-400
#     cmd = 'cp -r '+data_dir+'fna /data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/Fold3' # 400-800
#     # print(cmd)
#     os.system(cmd)

In [2]:
# # for idx in range(800, 1200):
# for idx in range(1200, len(sample_list_faa)):
#     data_dir = '/data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/'+sample_list_faa[idx][:-3]
#     # cmd = 'cp -r '+data_dir+'* /data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/Fold1' # 0-400
#     cmd = 'cp -r '+data_dir+'fna /data/hoan/amromics/prediction/data/Ecoli1936/prokkaMore/Fold4' # 400-800
#     # print(cmd)
#     os.system(cmd)

In [62]:
# Freeze the k-mer collection into a list so every k-mer has a stable position,
# then record that position as the k-mer's matrix column.
kmer_seq_set = list(kmer_seq_set)
kmer2index = {kmer_string: column for column, kmer_string in enumerate(kmer_seq_set)}

In [63]:
# n_samples = len(sample_list_faa)

In [64]:
# kmer_matrix = np.zeros((n_samples, len(kmer_seq_set)))
# Zero-initialised matrix with one row per processed sample and one column per distinct k-mer.
# `np.int` was only an alias of the builtin `int`; it was deprecated in NumPy 1.20 and
# removed in 1.24, so the builtin is used to get the same dtype on every NumPy version.
kmer_matrix = np.zeros((sample_idx, len(kmer_seq_set)), dtype=int)

In [3]:
# kmer_matrix = np.zeros((600, 7813616), dtype = np.int)

In [5]:
# kmer_matrix.shape

In [66]:
# Mark every observed (sample row, k-mer) pair in the matrix and report how long the pass took.
start = timeit.default_timer()
for row, observed_kmer in pairdata:
    column = kmer2index[observed_kmer]
    kmer_matrix[row, column] = 1
stop = timeit.default_timer()
print('Time: ', stop - start)

Time:  10.80304080992937


In [97]:
# kmer_matrix.shape

In [68]:
# kmer_matrix_df = pd.DataFrame(data=kmer_matrix, columns=kmer_seq_set)

In [70]:
# kmer_matrix_df

In [76]:
# Remove low-variance k-mer columns (variance threshold 0.01) and keep the fitted selector
# so the surviving columns can be mapped back to k-mer strings.
selector = VarianceThreshold(threshold=0.01)
kmer_matrix_VT = selector.fit_transform(kmer_matrix)

In [92]:
# K-mer strings of the columns kept by the variance filter, in column order.
selected_features = np.array([kmer_seq_set[idx] for idx in selector.get_support(indices=True)])

In [93]:
# Number of k-mer columns that survived the variance filter.
len(selected_features)

In [95]:
# selected_features[1:10]

In [None]:
# Persist the filtered matrix and the matching k-mer strings; the fold label is part of both file names.
np.save('/data/hoan/amromics/prediction/data/kmer_Fold'+data_fold+'_mat_VT1.npy', kmer_matrix_VT) # save numpy array
np.save('/data/hoan/amromics/prediction/data/kmer_Fold'+data_fold+'_mat_VT1_features.npy', selected_features) # save numpy array

In [96]:
# Displays the raw end-of-timing reading from the matrix-fill cell; leftover from interactive use.
stop

18840566.691894583

In [90]:
from pangraph.utils import binary_label
from sklearn.feature_selection import mutual_info_classif, chi2

In [None]:
# Score every variance-filtered k-mer column against each phenotype column with chi2.
# Despite its name, `mutual_mat` holds chi2 statistics: one row per metadata column 2..13.
#
# kmer_matrix_VT only has rows for samples min_idx..max_idx-1 of sample_list_faa, while
# metadata_panta covers every sample in that same order. The labels are therefore cut to
# the same window; otherwise the row positions returned by binary_label refer to the
# whole table and run past the end of the matrix.
metadata_fold = metadata_panta.iloc[min_idx:max_idx]
assert kmer_matrix_VT.shape[0] == len(metadata_fold), \
    "kmer_matrix_VT rows do not match the metadata window; re-run the fold and k-mer cells"
mutual_mat = []
for idx in range(2, 14):
    y_class = metadata_fold.iloc[:, idx].values
    print(metadata_panta.columns[idx])
    # NOTE(review): binary_label is presumably the pangraph.utils helper here and is assumed
    # to return (labels, positions of labelled rows) — confirm.
    y, nonenan_index = binary_label(y_class) # v6
    pa_matrix_new = kmer_matrix_VT[nonenan_index, ]
    y_new = y[nonenan_index].astype(int)
    # Only the chi2 statistics are kept; the p-values are not used.
    scores, _ = chi2(pa_matrix_new, y_new)
    mutual_mat.append(scores)
mutual_mat = np.array(mutual_mat)

In [None]:
# Mean score of every k-mer column across the rows of mutual_mat.
mutual_mat_mean = mutual_mat.mean(axis=0)

In [None]:
# Column positions sorted by descending mean score (ascending argsort, reversed), capped at 100000.
top_features = np.argsort(mutual_mat_mean)[::-1][:100000]
# Restrict the filtered matrix to those columns and show the resulting shape.
kmer_matrix_VT_top_features = kmer_matrix_VT[:,top_features]
kmer_matrix_VT_top_features.shape

In [None]:
# Values of the metadata column at position 4; the loops that use y_class reassign it before reading it.
y_class = metadata_panta.iloc[:,4].values
def binary_label(y_class):
    """Encode susceptibility calls as binary labels.

    Entries equal to 'R' or 'I' become 1 and entries equal to 'S' become 0.
    Any other entry is copied through unchanged and its position is left out
    of the returned position list.

    Parameters
    ----------
    y_class : sequence
        Calls, read by position via ``y_class[i]``.

    Returns
    -------
    tuple
        ``(labels, positions)``: a numpy array with one entry per input
        element, and the list of positions whose entry was 'R', 'I' or 'S'.
    """
    labels = []
    labelled_positions = []
    for position in range(len(y_class)):
        call = y_class[position]
        if call in ('R', 'I'):
            encoded = 1
        elif call == 'S':
            encoded = 0
        else:
            # Unrecognised entry: keep the raw value and do not record the position.
            labels.append(call)
            continue
        labels.append(encoded)
        labelled_positions.append(position)
    return np.array(labels), labelled_positions

In [None]:
# https://stackoverflow.com/questions/41458834/how-is-scikit-learn-cross-val-predict-accuracy-score-calculated
## No _ in the method name, please
# Exclusive upper bound of the metadata column positions iterated by the model-run loop (starts at 2).
max_idx_amr = 14; # max value = 14

In [None]:
# Train and evaluate models for every phenotype column at positions 2..max_idx_amr-1.
# NOTE(review): snp_mat, panta_single and panta_combine are not defined anywhere in the
# visible notebook — they must exist in the kernel (panta_* presumably hold the
# `approach` strings used in output file names) before this cell is run.
# for idx in range(2, 3):
for idx in range(2, max_idx_amr):
    y_class = metadata_panta.iloc[:,idx].values
    print(metadata_panta.columns[idx])
    # y = np.array([1 if y_class[i]=='R' else 0 for i in range(len(y_class))]) version _v5
    y, nonenan_index = binary_label(y_class) # v6
    # pa_matrix is a DataFrame, so `pa_matrix[nonenan_index, ]` would be treated as a
    # column lookup and fail; select rows by position on the underlying array, which is
    # also what run_ML needs for its `X[train_idx]` indexing.
    pa_matrix_new = pa_matrix.values[nonenan_index, ]
    y_new = y[nonenan_index]
    snp_mat_new = snp_mat[nonenan_index,]
    # Run unimodal gene
    # run_ML(pa_matrix, y, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'pantaPangenome', False, 'mutual_info_classif', None)
    # run_ML(full_matrix, y, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'pantaPangenome', False, 'mutual_info_classif', None)
    # run_ML(snp_mat, y, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'pantaSnp', True, 'chi2')
    # run_ML(pa_matrix, y, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'pantaCombine', False, 'mutual_info_classif', snp_mat)
    # run_ML(pa_matrix, y, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'pantaCombineScale', True, 'chi2', snp_mat)
    # run_ML(pa_matrix, y, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'pantaCombinehighGene', False, 'chi2', snp_mat)
    # run_ML(pa_matrix_new, y_new, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'pantaPangenome', False, 'chi2', None)
    # run_ML(snp_mat_new, y_new, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'pantaVT10', False, 'chi2', None)
    # Single-modality run on the labelled rows of snp_mat.
    run_ML(snp_mat_new, y_new, 'Ecoli1936'+'_'+metadata_panta.columns[idx], panta_single, False, 'chi2', None)
    # run_ML(snp_mat_new, y_new, 'Ecoli1936'+'_'+metadata_panta.columns[idx], "pantaNewPanV7", False, 'chi2', None)
    # Combined run: presence/absence features with snp_mat appended as the second matrix.
    run_ML(pa_matrix_new, y_new, 'Ecoli1936'+'_'+metadata_panta.columns[idx], panta_combine, False, 'chi2', snp_mat_new)

In [None]:
# ## Run PanPred on panta isolate
# pa_matrixPanPred = accessorygene.loc[isolate_index]
# for idx in range(2, max_idx_amr):
#     y_class = metadata_panta.iloc[:,idx].values
#     print(metadata_panta.columns[idx])
#     y, nonenan_index = binary_label(y_class) # v6
#     pa_matrixPanPred_new = pa_matrixPanPred.values[nonenan_index, ]
#     y_new = y[nonenan_index]
#     run_ML(pa_matrixPanPred_new, y_new, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'PanPred' )

In [None]:
# ## Run PanPred on panta isolate: + population structure
# pa_matrixPanPred = accessorygene.loc[isolate_index]
# ps_matrixPanPred = populationstructure.loc[isolate_index]
# combinematrixPanPred = np.concatenate((pa_matrixPanPred.values, ps_matrixPanPred.values), axis=1)
# # combinematrixPanPred_new = 
# # scaler = StandardScaler()
# # scaled_combinematrixPanPred = scaler.fit_transform(combinematrixPanPred)

In [None]:
# for idx in range(2, max_idx_amr):
#     y_class = metadata_panta.iloc[:,idx].values
#     print(metadata_panta.columns[idx])
#     # y = np.array([1 if y_class[i]=='R' else 0 for i in range(len(y_class))])
#     # run_ML(scaled_combinematrixPanPred, y, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'PanPredCombine' )
#     y, nonenan_index = binary_label(y_class) # v6
#     combinematrixPanPred_new = combinematrixPanPred[nonenan_index, ]
#     y_new = y[nonenan_index]
#     run_ML(combinematrixPanPred_new, y_new, 'Ecoli1936'+'_'+metadata_panta.columns[idx],'PanPredCombine' )