In [30]:
## Run ML methods on PanPred and panta outputs 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import datasets
from sklearn import svm
import random
import os

In [2]:
import pandas as pd
import numpy as np

In [20]:
# Suffix appended to both the Panta input directory name and the prediction
# output directory name, so different pipeline runs do not overwrite each other.
version = '_v3'

In [33]:
def run_ML(X, y, data_set, approach="Default"):
    """Benchmark several classifiers on repeated random 80/20 splits of (X, y).

    For each of ``n_loops`` runs the sample indices are split after
    ``random.seed(run)``; the held-out true labels and every classifier's
    predictions are written as CSV files, and the macro-averaged F1 score on
    the held-out samples is recorded. After all runs, the method names and the
    per-method mean F1 (rounded to 2 decimals) are printed.

    Output files are named
    ``<base_dir>/<data_set>_run_<run>_<approach>_<tag>_labels.csv`` where
    ``base_dir`` depends on the module-level ``version`` suffix.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; rows are selected with integer index lists
        (``X[train_idx]``), so samples must lie on axis 0.
    y : numpy.ndarray
        Label vector; ``y.shape[0]`` is used as the number of samples.
    data_set : str
        Tag embedded in the output file names.
    approach : str, optional
        Second tag embedded in the output file names.

    Returns
    -------
    None
        Results are printed and written to disk only.
    """
    base_dir = '/data/hoan/amromics/prediction/output/predPantaPanPred'+version
    # makedirs creates missing parents and is a no-op if the directory exists.
    os.makedirs(base_dir, exist_ok=True)

    # (display name, output-file tag, estimator factory).
    # Factories are used so that every run trains a fresh, unfitted estimator.
    # All stochastic estimators are seeded so that reruns reproduce the same scores.
    classifiers = [
        ('SVM', 'SVM',
         lambda: svm.SVC(kernel='linear', C=1)),
        ('Decision Tree', 'DecisionTree',
         lambda: DecisionTreeClassifier(random_state=0)),
        ('RF', 'RandomForest',
         lambda: RandomForestClassifier(random_state=0)),
        # Disabled: ('Neural network', 'NeuralNet',
        #            lambda: MLPClassifier(alpha=1, max_iter=2000)),
        ('Adaboost', 'Adaboost',
         lambda: AdaBoostClassifier(random_state=0)),
        ('Gradient Boost Decision Tree', 'GBDT',
         lambda: GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                            max_depth=2, random_state=0)),
        # Disabled: ('kNN', 'NearestNeighbors',
        #            lambda: KNeighborsClassifier(n_neighbors=10)),
        # Disabled: ('NaiveBayes', 'NaiveBayes', lambda: GaussianNB()),
    ]
    methods = [name for name, _, _ in classifiers]
    n_methods = len(classifiers)

    score = []
    n_loops = 5
    n_samples = y.shape[0]
    for i in range(n_loops):
        path_dir = base_dir + '/' + data_set + '_run_' + str(i) + '_' + approach
        print('Run: ', i)

        # Reproducible 80/20 split driven by the run number.
        random.seed(i)
        train_idx = random.sample(range(n_samples), int(n_samples*0.8))
        train_lookup = set(train_idx)  # O(1) membership instead of scanning the list
        test_idx = [j for j in range(n_samples) if j not in train_lookup]
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Save the test true labels
        np.savetxt(path_dir + "_test_true_labels.csv", y_test, delimiter=",")
        if i == 0:
            print("n_samples: ", n_samples)
            print(X_train.shape, X_test.shape)
        print(train_idx[:10])

        for name, file_tag, make_clf in classifiers:
            print(name, end=', ')
            clf = make_clf().fit(X_train, y_train)
            # Predict first, then save: the saved file must hold THIS model's predictions.
            y_predict = clf.predict(X_test)
            np.savetxt(path_dir + "_" + file_tag + "_labels.csv", y_predict, delimiter=",")
            score.append(f1_score(y_test, y_predict, average='macro'))
        print()  # terminate the progress line before the next run's output

    # Print statistics: one row per run, one column per method.
    score_np = np.array(score).reshape((n_loops, n_methods))
    print(methods)
    average_score = np.mean(score_np, axis=0)
    print(np.round(average_score, 2))

### Run PanPred 

In [4]:
# pandata = pd.read_csv("PanPred/test_data/gene_presence_absence.csv")

In [5]:
# pandata.head()

In [6]:
# Load the phenotype table and index it by isolate ID.
# drop=False keeps 'Isolate' as a regular column as well, so the column
# positions used later to pick label columns are unchanged.
metadata = pd.read_csv('PanPred/test_data/Metadata.csv').set_index('Isolate', drop=False)

In [7]:
# metadata.head(2)

In [8]:
# Accessory-gene table from the PanPred test data; the first CSV column
# becomes the row index.
accessorygene =  pd.read_csv('PanPred/test_data/AccessoryGene.csv', index_col=0)

In [9]:
# Preview the first two rows of the accessory-gene table.
accessorygene.head(2)

Unnamed: 0,yeiU,yhhS,ybaE,eutR,ibrB,ytfP,aslB,narQ,tolR,galM,...,group_48768,group_48873,group_48916,group_48933,group_48937,group_48958,group_49020,group_49174,group_49253,group_49257
11657_5#1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
11657_5#10,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Population-structure table (label-encoded, per the file name).
# NOTE(review): not referenced by any other cell visible in this notebook.
populationstructure =  pd.read_csv('PanPred/test_data/PopulationStructure.csv_labelencoded.csv')

In [11]:
# Select accessory-gene rows in the same isolate order as the metadata table,
# so features and labels line up row by row.
new_accessorygene = accessorygene.loc[metadata['Isolate']]

#### Run ML models

In [12]:
# for idx in range(2, 14):
#     y_class = metadata.iloc[:,idx].values
#     print(metadata.columns[idx])
#     y = np.array([1 if y_class[i]=='R' else 0 for i in range(1936)])
#     run_ML(new_accessorygene.values, y, 'Ecoli1936','classic')

In [13]:
# new_accessorygene.head(2)

### Run Panta

In [15]:
# Build a lookup from Panta sample name (first CSV column + '.contig') to the
# value in the second CSV column, which is later used as the isolate ID.
sample_isolate = pd.read_csv('/data/hoan/amromics/prediction/data/Ecoli1936metafiles/sample_isolate.csv')
sample2isolate = {
    sample_name + '.contig': isolate
    for sample_name, isolate in zip(sample_isolate.iloc[:, 0], sample_isolate.iloc[:, 1])
}

In [22]:
# Panta gene presence/absence table for the run selected by `version`,
# transposed after loading (the preview below shows samples as rows and
# genes as columns).
# Alternative input path (currently disabled):
# pa_matrix = pd.read_csv('/data/hoan/amromics/prediction/output/pantaEcoli1936/gene_presence_absence.Rtab', sep='\t', index_col=0).T
pa_matrix = pd.read_csv('/data/hoan/amromics/prediction/output/pantaEcoli1936align'+version+'/gene_presence_absence.Rtab', sep='\t', index_col=0).T


In [23]:
# Preview the first two rows of the Panta presence/absence matrix.
pa_matrix.head(2)

Gene,groups_0,namA,groups_2,groups_3,groups_4,groups_5,groups_6,groups_7,groups_8,groups_9,...,groups_74779,groups_74780,groups_74781,groups_74782,traI_2_16929,groups_74784,groups_74785,groups_74786,groups_74787,groups_74788
SAMEA2204229.contig,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMEA2204230.contig,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# (number of rows, number of columns) of the Panta matrix.
pa_matrix.shape

(1653, 74789)

In [25]:
# Translate each Panta row label into its isolate ID, preserving row order.
# Raises KeyError if a row label has no entry in sample2isolate.
isolate_index = [sample2isolate[sample] for sample in pa_matrix.index]

In [26]:
# isolate_index

In [27]:
# Metadata rows reordered to follow the Panta matrix row order, so that
# labels taken from this frame align with pa_matrix rows.
metadata_panta = metadata.loc[isolate_index]

In [28]:
# Display the aligned metadata table.
metadata_panta

Unnamed: 0_level_0,Isolate,Year,CTZ,CTX,AMP,AMX,AMC,TZP,CXM,CET,GEN,TBM,TMP,CIP
Isolate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11658_4#1,11658_4#1,2006.0,S,S,S,,S,S,R,S,S,S,S,S
11657_5#1,11657_5#1,2006.0,S,S,R,,R,S,S,S,S,S,R,R
11658_4#2,11658_4#2,2006.0,S,S,S,,S,S,S,S,S,S,S,S
11658_5#1,11658_5#1,2006.0,S,S,R,,S,S,S,S,S,S,R,R
11658_5#2,11658_5#2,2007.0,S,R,R,,R,S,R,R,S,R,R,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18090_6#61,18090_6#61,2015.0,R,R,R,,S,S,R,R,R,R,S,S
18090_8#30,18090_8#30,2015.0,S,R,R,,S,S,R,R,R,R,R,S
18090_6#63,18090_6#63,2015.0,S,R,R,,S,S,R,R,R,R,R,R
18090_8#33,18090_8#33,2015.0,S,R,R,,R,S,R,R,R,R,S,S


In [34]:
# Evaluate all classifiers on the Panta matrix, one phenotype column at a time.
# Label columns are addressed by position (2..13) in metadata_panta.
# NOTE(review): every value other than 'R' - including missing phenotypes
# (NaN) - is encoded as 0; confirm that this is intended.
for idx in range(2, 14):
    drug = metadata_panta.columns[idx]
    y_class = metadata_panta.iloc[:, idx].values
    print(drug)
    y = np.array([int(label == 'R') for label in y_class])
    run_ML(pa_matrix.values, y, f'Ecoli1936_{drug}', 'panta')

CTZ
Run:  0
n_samples:  1653
(1322, 74789) (331, 74789)
[788, 1552, 861, 82, 530, 1047, 995, 829, 1605, 621]
SVM, Decision Tree, RF, Adaboost, 

KeyboardInterrupt: 

In [39]:
## Run PanPred on panta isolate
# Restrict the PanPred accessory-gene table to the isolates present in the
# Panta matrix (same row order), then evaluate all classifiers per phenotype
# column. Label columns are addressed by position (2..13) in metadata_panta.
# NOTE(review): every value other than 'R' - including missing phenotypes
# (NaN) - is encoded as 0; confirm that this is intended.
pa_matrixPanPred = accessorygene.loc[isolate_index]
for idx in range(2, 14):
    drug = metadata_panta.columns[idx]
    y_class = metadata_panta.iloc[:, idx].values
    print(drug)
    y = np.array([int(label == 'R') for label in y_class])
    run_ML(pa_matrixPanPred.values, y, f'Ecoli1936_{drug}', 'PanPred')

CTZ
Run:  0
n_samples:  667
(533, 17198) (134, 17198)
[394, 430, 41, 265, 523, 497, 414, 310, 488, 366]
SVM, Decision Tree, RF, Adaboost, Gradient Boost Decision Tree, kNN, NaiveBayes
Run:  1
[137, 582, 64, 261, 120, 507, 460, 483, 388, 214]
SVM, Decision Tree, RF, Adaboost, Gradient Boost Decision Tree, kNN, NaiveBayes
Run:  2
[57, 93, 86, 369, 173, 315, 257, 620, 217, 621]
SVM, Decision Tree, RF, Adaboost, Gradient Boost Decision Tree, kNN, NaiveBayes
Run:  3
[243, 606, 557, 133, 378, 618, 485, 640, 594, 67]
SVM, Decision Tree, RF, Adaboost, Gradient Boost Decision Tree, kNN, NaiveBayes
Run:  4
[241, 310, 105, 405, 490, 158, 92, 68, 20, 411]
SVM, Decision Tree, RF, Adaboost, Gradient Boost Decision Tree, kNN, NaiveBayes
['SVM', 'Decision Tree', 'RF', 'Adaboost', 'Gradient Boost Decision Tree', 'kNN', 'NaiveBayes']
[0.49 0.74 0.5  0.68 0.72 0.5  0.5 ]
CTX
Run:  0
n_samples:  667
(533, 17198) (134, 17198)
[394, 430, 41, 265, 523, 497, 414, 310, 488, 366]
SVM, Decision Tree, RF, Adaboos

In [40]:
# pa_matrixPanPred

In [41]:
# metadata_panta