In [1]:
# Run ML on all PanPred outputs
## Run ML methods    
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import datasets
from sklearn import svm
import random

In [2]:
import pandas as pd
import numpy as np

In [25]:
def _make_classifiers():
    """Return fresh, unfitted classifiers in the order they are evaluated.

    Each entry is ``(display_name, file_tag, estimator)``: ``display_name`` is
    printed and used as the column label in the summary, ``file_tag`` is the
    piece of the output file name identifying the method.
    """
    return [
        ('SVM', 'SVM', svm.SVC(kernel='linear', C=1)),
        ('Decision Tree', 'DecisionTree', DecisionTreeClassifier(random_state=0)),
        ('RF', 'RandomForest',
         RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
        ('Neural network', 'NeuralNet', MLPClassifier(alpha=1, max_iter=2000)),
        ('Adaboost', 'Adaboost', AdaBoostClassifier()),
        ('Gradient Boost Decision Tree', 'GBDT',
         GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                    max_depth=2, random_state=0)),
        ('kNN', 'NearestNeighbors', KNeighborsClassifier(n_neighbors=10)),
        ('NaiveBayes', 'NaiveBayes', GaussianNB()),
    ]


def run_ML(X, y, data_set, approach="Default", n_loops=10,
           output_dir='/data/hoan/amromics/prediction/output/PanPred/'):
    """Train and score a fixed set of classifiers over repeated 80/20 splits.

    For every run ``r`` in ``range(n_loops)`` the samples are split with
    ``random.seed(r)`` into 80% train / 20% test, each classifier from
    ``_make_classifiers()`` is fitted on the training part, and its test
    predictions are written to ``<prefix>_<method>_labels.csv``. The true test
    labels go to ``<prefix>_test_true_labels.csv``, where ``<prefix>`` is
    ``output_dir`` joined with ``<data_set>_run_<r>_<approach>``. Finally the
    macro-F1 of each method, averaged over the runs, is printed.

    Parameters
    ----------
    X : array supporting integer-list indexing (``X[list_of_rows]``) and ``.shape``
        Feature matrix, one row per sample.
    y : array supporting integer-list indexing and ``.shape``
        Class labels; ``y.shape[0]`` defines the number of samples.
    data_set : str
        Name embedded in every output file name.
    approach : str, optional
        Second tag embedded in every output file name.
    n_loops : int, optional
        Number of repeated random splits (default 10).
    output_dir : str, optional
        Directory receiving the CSV files; it must already exist.

    Returns
    -------
    None
        Results are printed and written to disk.
    """
    # Local import keeps this cell self-contained; only needed for path joining.
    import os

    n_samples = y.shape[0]
    n_train = int(n_samples * 0.8)
    method_names = []
    scores = []  # one row per run, one column per method

    for run in range(n_loops):
        prefix = os.path.join(
            output_dir, data_set + '_run_' + str(run) + '_' + approach)
        print('Run: ', run)

        # Seeding with the run number makes each split reproducible.
        random.seed(run)
        print("n_samples: ", n_samples)
        train_idx = random.sample(range(n_samples), n_train)
        # Set lookup avoids the quadratic cost of testing membership in a list.
        train_lookup = set(train_idx)
        test_idx = [row for row in range(n_samples) if row not in train_lookup]
        print(train_idx[:10])

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        # Save the test true labels
        np.savetxt(prefix + "_test_true_labels.csv", y_test, delimiter=",")
        print(X_train.shape, X_test.shape)

        classifiers = _make_classifiers()
        method_names = [name for name, _, _ in classifiers]
        run_scores = []
        for pos, (name, file_tag, clf) in enumerate(classifiers):
            is_last = pos == len(classifiers) - 1
            print(name, end='\n' if is_last else ', ')
            # Predict first, then save: writing before predicting would store
            # the previous classifier's labels under this method's file name.
            y_predict = clf.fit(X_train, y_train).predict(X_test)
            np.savetxt(prefix + "_" + file_tag + "_labels.csv", y_predict,
                       delimiter=",")
            # f1_score expects (y_true, y_pred).
            run_scores.append(f1_score(y_test, y_predict, average='macro'))
        scores.append(run_scores)

    # Print statistics: each column is a method, each row a run.
    print(method_names)
    average_score = np.mean(np.array(scores), axis=0)
    print(np.round(average_score, 2))

In [4]:
# pandata = pd.read_csv("PanPred/test_data/gene_presence_absence.csv")

In [5]:
# pandata.head()

In [6]:
# Load the per-isolate metadata table (previewed in the next cell: an Isolate id,
# a Year, and one phenotype column per antibiotic).
metadata = pd.read_csv('PanPred/test_data/Metadata.csv')

In [7]:
# Preview the first rows to check the column layout.
metadata.head()

Unnamed: 0,Isolate,Year,CTZ,CTX,AMP,AMX,AMC,TZP,CXM,CET,GEN,TBM,TMP,CIP
0,11657_5#10,2010.0,S,S,S,,S,S,S,S,S,S,S,S
1,11657_5#11,2010.0,S,S,R,,R,S,S,S,S,S,R,R
2,11657_5#12,2010.0,S,S,S,,S,S,S,S,S,S,S,S
3,11657_5#13,2010.0,S,S,R,,R,S,S,S,S,S,S,R
4,11657_5#14,2010.0,S,S,R,,S,S,S,S,S,S,R,S


In [8]:
# Load the accessory-gene table; index_col=0 turns the first CSV column into the
# row index so rows can be looked up by label.
accessorygene =  pd.read_csv('PanPred/test_data/AccessoryGene.csv', index_col=0)

In [11]:
# Load the label-encoded population-structure table.
# NOTE(review): not referenced by any other cell visible here — confirm it is still needed.
populationstructure =  pd.read_csv('PanPred/test_data/PopulationStructure.csv_labelencoded.csv')

In [15]:
# Select accessory-gene rows by the metadata Isolate labels, so row k of the
# feature table corresponds to row k of `metadata`.
# NOTE(review): presumably every Isolate id exists in the accessory-gene index — verify,
# since a label-based lookup cannot return rows for unknown ids.
new_accessorygene = accessorygene.loc[metadata['Isolate']]

### Run ML models

In [32]:
# Train and evaluate every classifier once per antibiotic phenotype column
# (metadata columns 2..13).
for idx in range(2, 14):
    antibiotic = metadata.columns[idx]
    print(antibiotic)
    # Binary target over all rows: 1 = resistant ('R'), 0 = anything else.
    # NOTE(review): missing phenotypes (NaN) are therefore also encoded as 0 —
    # confirm this is intended rather than excluding those isolates.
    y = metadata.iloc[:, idx].eq('R').astype(int).values
    # Put the antibiotic in the data-set tag: with one shared tag every
    # antibiotic wrote to the same file names and overwrote the previous results.
    run_ML(new_accessorygene.values, y, 'Ecoli1936_' + antibiotic, 'classic')

[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]


In [34]:
# Sanity check: show which column name sits at position 2 of `metadata`.
metadata.columns[2]

'CTZ'