In [1]:
import pandas as pd

pos

In [3]:
df_pos2x = pd.read_csv('../../data/basedata/df_pos_MCH.csv')

from sklearn.model_selection import train_test_split
columns_to_drop = ['Metabolitename', 'Ontology', 'dataset', 'AlignmentID', 'AverageMz']
X = df_pos2x.drop(columns=columns_to_drop).astype(float).values
y = df_pos2x['Ontology']
print(len(y.unique()))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

63


kNN, RF, SVM, XGBoost, MLP

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint as sp_randint
import joblib

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

param_random_knn = {
    'n_neighbors': sp_randint(3, 30),  
    'weights': ['uniform', 'distance'], 
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
    'leaf_size': sp_randint(10, 50), 
    'p': [1, 2]
}

random_search_knn = RandomizedSearchCV(estimator=KNeighborsClassifier(), 
                                       param_distributions=param_random_knn, 
                                       n_iter=500, scoring='f1_macro', verbose=1,
                                       cv=5, n_jobs=60, random_state=42)

random_search_knn.fit(X_train, y_train_encoded)
print('Best parameters for kNN:', random_search_knn.best_params_)

y_pred_knn = random_search_knn.predict(X_test)
print('kNN Accuracy:', accuracy_score(y_test_encoded, y_pred_knn))

best_knn_model = random_search_knn.best_estimator_
joblib.dump(best_knn_model, '../../data/model_comp/pos_pred_result/best_knn_model_random_pos.joblib')

In [11]:
from sklearn.ensemble import RandomForestClassifier

param_random_rf = {
    'n_estimators': sp_randint(100, 1000),  
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],  
    'min_samples_split': sp_randint(2, 20), 
    'min_samples_leaf': sp_randint(1, 20),  
    'bootstrap': [True, False] 
}

random_search_rf = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42), 
                                    param_distributions=param_random_rf, 
                                    n_iter=500, scoring='f1_macro', verbose=True,
                                    cv=5, n_jobs=60, random_state=42)

random_search_rf.fit(X_train, y_train_encoded)
print('Best parameters for Random Forest:', random_search_rf.best_params_)

y_pred_rf = random_search_rf.predict(X_test)
print('Random Forest Accuracy:', accuracy_score(y_test_encoded, y_pred_rf))

best_rf_model = random_search_rf.best_estimator_
joblib.dump(best_rf_model, '../../data/model_comp/pos_pred_result/best_rf_model_random_pos.joblib')

In [12]:
from sklearn.svm import SVC
from scipy.stats import uniform

param_random_svm = {
    'C': uniform(loc=0.1, scale=100),
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'] + list(uniform(loc=0.001, scale=1).rvs(10)) 
}

random_search_svm = RandomizedSearchCV(estimator=SVC(random_state=42), 
                                       param_distributions=param_random_svm, 
                                       n_iter=500, scoring='f1_macro', verbose=1,
                                       cv=5, n_jobs=60, random_state=42)


random_search_svm.fit(X_train, y_train_encoded)
print('Best parameters for SVM:', random_search_svm.best_params_)

y_pred_svm = random_search_svm.predict(X_test)
print('SVM Accuracy:', accuracy_score(y_test_encoded, y_pred_svm))

best_svm_model = random_search_svm.best_estimator_
joblib.dump(best_svm_model, '../../data/model_comp/pos_pred_result/best_svm_model_random_pos.joblib')

In [13]:
import xgboost as xgb
from scipy.stats import uniform

param_random_xgb = {
    'n_estimators': sp_randint(50, 500),
    'max_depth': sp_randint(3, 15), 
    'learning_rate': uniform(loc=0.01, scale=0.29)
}

random_search_xgb = RandomizedSearchCV(estimator=xgb.XGBClassifier(eval_metric='mlogloss', random_state=42), 
                                       param_distributions=param_random_xgb, 
                                       n_iter=500, scoring='f1_macro', verbose=1,
                                       cv=5, n_jobs=60, random_state=42)

random_search_xgb.fit(X_train, y_train_encoded)
print('Best parameters for XGBoost:', random_search_xgb.best_params_)

y_pred_xgb = random_search_xgb.predict(X_test)
print('XGBoost Accuracy:', accuracy_score(y_test_encoded, y_pred_xgb))

best_xgb_model = random_search_xgb.best_estimator_
joblib.dump(best_xgb_model, '../../data/model_comp/pos_pred_result/best_xgb_model_random_pos.joblib')

In [14]:
from sklearn.neural_network import MLPClassifier

def generate_hidden_layer_sizes():
    num_layers = sp_randint.rvs(1, 4)
    layer_sizes = [sp_randint.rvs(10, 201) for _ in range(num_layers)] 
    return tuple(layer_sizes)

param_random_mlp = {
    'hidden_layer_sizes': [generate_hidden_layer_sizes() for _ in range(100)], 
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': uniform(loc=0.00001, scale=0.01),
    'learning_rate': ['constant', 'adaptive']
}

random_search_mlp = RandomizedSearchCV(estimator=MLPClassifier(max_iter=3000, random_state=42), 
                                       param_distributions=param_random_mlp, 
                                       n_iter=500, scoring='f1_macro', verbose=1,
                                       cv=5, n_jobs=60, random_state=42)

random_search_mlp.fit(X_train, y_train_encoded)
print('Best parameters for MLP:', random_search_mlp.best_params_)

y_pred_mlp = random_search_mlp.predict(X_test)
print('MLP Accuracy:', accuracy_score(y_test_encoded, y_pred_mlp))

best_mlp_model = random_search_mlp.best_estimator_
joblib.dump(best_mlp_model, '../../data/model_comp/pos_pred_result/best_mlp_model_random_pos.joblib')

In [15]:
from sklearn.preprocessing import LabelEncoder
import joblib

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

best_knn_model = joblib.load('../../data/model_comp/pos_pred_result/best_knn_model_random_pos.joblib')
y_pred_knn = best_knn_model.predict(X_test)
print('kNN Accuracy:', accuracy_score(y_test_encoded, y_pred_knn))

best_rf_model = joblib.load('../../data/model_comp/pos_pred_result/best_rf_model_random_pos.joblib')
y_pred_rf = best_rf_model.predict(X_test)
print('rf Accuracy:', accuracy_score(y_test_encoded, y_pred_rf))

best_svm_model = joblib.load('../../data/model_comp/pos_pred_result/best_svm_model_random_pos.joblib')
y_pred_svm = best_svm_model.predict(X_test)
print('svm Accuracy:', accuracy_score(y_test_encoded, y_pred_svm))

best_xgb_model = joblib.load('../../data/model_comp/pos_pred_result/best_xgb_model_random_pos.joblib')
y_pred_xgb = best_xgb_model.predict(X_test)
print('xgb Accuracy:', accuracy_score(y_test_encoded, y_pred_xgb))

best_mlp_model = joblib.load('../../data/model_comp/pos_pred_result/best_mlp_model_random_pos.joblib')
y_pred_mlp = best_mlp_model.predict(X_test)
print('mlp Accuracy:', accuracy_score(y_test_encoded, y_pred_mlp))

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def cal_score(y_test, y_pred_rf):
    accuracy = accuracy_score(y_test, y_pred_rf)
    precision = precision_score(y_test, y_pred_rf, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred_rf, average='macro')
    f1 = f1_score(y_test, y_pred_rf, average='macro')
    list = [accuracy, precision, recall, f1]

    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1)

    return list

In [12]:
print('kNN')
knn_result_list = cal_score(y_test_encoded, y_pred_knn)
print('Random Forest')
rf_result_list = cal_score(y_test_encoded, y_pred_rf)
print('SVM')
svm_result_list = cal_score(y_test_encoded, y_pred_svm)
print('XGBoost')
xgb_result_list = cal_score(y_test_encoded, y_pred_xgb)
print('MLP')
mlp_result_list = cal_score(y_test_encoded, y_pred_mlp)

kNN
Accuracy: 0.9077468953282082
Precision: 0.8879481749130694
Recall: 0.8019469447711542
F1 Score: 0.8281308868239396
Random Forest
Accuracy: 0.9373151981076286
Precision: 0.8449819335904621
Recall: 0.776629771791378
F1 Score: 0.7993297184338356
SVM
Accuracy: 0.9319929036073329
Precision: 0.9011837583930935
Recall: 0.8734180910994636
F1 Score: 0.8817973811371769
XGBoost
Accuracy: 0.9716144293317563
Precision: 0.93245309051721
Recall: 0.8955304885487574
F1 Score: 0.9051529818034342
MLP
Accuracy: 0.9444115907746895
Precision: 0.9084685589095505
Recall: 0.8873986169457868
F1 Score: 0.8944738987855485


In [13]:
pos_result_list = [knn_result_list, rf_result_list, svm_result_list, xgb_result_list, mlp_result_list]

In [14]:
import pickle
with open('../../data/model_comp/pos_pred_result/pos_result_list_random.pkl', 'wb') as f:
    pickle.dump(pos_result_list, f)

In [15]:
import numpy as np
joblib.dump(label_encoder, '../../data/model_comp/pos_pred_result/label_encoder_pos_random.pkl')

np.save('../../data/model_comp/pos_pred_result/y_test_encoded_pos_random.npy', y_test_encoded)
np.save('../../data/model_comp/pos_pred_result/y_pred_knn_pos_random.npy', y_pred_knn)
np.save('../../data/model_comp/pos_pred_result/y_pred_rf_pos_random.npy', y_pred_rf)
np.save('../../data/model_comp/pos_pred_result/y_pred_svm_pos_random.npy', y_pred_svm)
np.save('../../data/model_comp/pos_pred_result/y_pred_xgb_pos_random.npy', y_pred_xgb)
np.save('../../data/model_comp/pos_pred_result/y_pred_mlp_pos_random.npy', y_pred_mlp)

neg

In [16]:
df_neg2x = pd.read_csv('../../data/basedata/df_neg_MCH.csv')

from sklearn.model_selection import train_test_split
columns_to_drop = ['Metabolitename', 'Ontology', 'dataset', 'AlignmentID', 'AverageMz']
X = df_neg2x.drop(columns=columns_to_drop).astype(float).values
y = df_neg2x['Ontology']
print(len(y.unique()))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint as sp_randint
import joblib


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


param_random_knn = {
    'n_neighbors': sp_randint(3, 30),  
    'weights': ['uniform', 'distance'], 
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
    'leaf_size': sp_randint(10, 50), 
    'p': [1, 2]
}

random_search_knn = RandomizedSearchCV(estimator=KNeighborsClassifier(), 
                                       param_distributions=param_random_knn, 
                                       n_iter=500, scoring='f1_macro', verbose=1,
                                       cv=5, n_jobs=60, random_state=42)

random_search_knn.fit(X_train, y_train_encoded)
print('Best parameters for kNN:', random_search_knn.best_params_)

y_pred_knn = random_search_knn.predict(X_test)
print('kNN Accuracy:', accuracy_score(y_test_encoded, y_pred_knn))

best_knn_model = random_search_knn.best_estimator_
joblib.dump(best_knn_model, '../../data/model_comp/neg_pred_result/best_knn_model_random_neg.joblib')

In [18]:
from sklearn.ensemble import RandomForestClassifier

param_random_rf = {
    'n_estimators': sp_randint(100, 1000),  
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],  
    'min_samples_split': sp_randint(2, 20), 
    'min_samples_leaf': sp_randint(1, 20),  
    'bootstrap': [True, False] 
}

random_search_rf = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42), 
                                    param_distributions=param_random_rf, 
                                    n_iter=500, scoring='f1_macro', verbose=True,
                                    cv=5, n_jobs=60, random_state=42)

random_search_rf.fit(X_train, y_train_encoded)
print('Best parameters for Random Forest:', random_search_rf.best_params_)

y_pred_rf = random_search_rf.predict(X_test)
print('Random Forest Accuracy:', accuracy_score(y_test_encoded, y_pred_rf))

best_rf_model = random_search_rf.best_estimator_
joblib.dump(best_rf_model, '../../data/model_comp/neg_pred_result/best_rf_model_random_neg.joblib')

In [19]:
from sklearn.svm import SVC

param_random_svm = {
    'C': uniform(loc=0.1, scale=100),
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'] + list(uniform(loc=0.001, scale=1).rvs(10)) 
}

random_search_svm = RandomizedSearchCV(estimator=SVC(random_state=42), 
                                       param_distributions=param_random_svm, 
                                       n_iter=500, scoring='f1_macro', verbose=1,
                                       cv=5, n_jobs=60, random_state=42)

random_search_svm.fit(X_train, y_train_encoded)
print('Best parameters for SVM:', random_search_svm.best_params_)

y_pred_svm = random_search_svm.predict(X_test)
print('SVM Accuracy:', accuracy_score(y_test_encoded, y_pred_svm))

best_svm_model = random_search_svm.best_estimator_
joblib.dump(best_svm_model, '../../data/model_comp/neg_pred_result/best_svm_model_random_neg.joblib')

In [20]:
import xgboost as xgb

param_random_xgb = {
    'n_estimators': sp_randint(50, 500),
    'max_depth': sp_randint(3, 15), 
    'learning_rate': uniform(loc=0.01, scale=0.29)
}

random_search_xgb = RandomizedSearchCV(estimator=xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42), 
                                       param_distributions=param_random_xgb, 
                                       n_iter=500, scoring='f1_macro', verbose=1,
                                       cv=5, n_jobs=60, random_state=42)

random_search_xgb.fit(X_train, y_train_encoded)
print('Best parameters for XGBoost:', random_search_xgb.best_params_)

y_pred_xgb = random_search_xgb.predict(X_test)
print('XGBoost Accuracy:', accuracy_score(y_test_encoded, y_pred_xgb))

best_xgb_model = random_search_xgb.best_estimator_
joblib.dump(best_xgb_model, '../../data/model_comp/neg_pred_result/best_xgb_model_random_neg.joblib')

In [21]:
from sklearn.neural_network import MLPClassifier

param_random_mlp = {
    'hidden_layer_sizes': [generate_hidden_layer_sizes() for _ in range(100)], 
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': uniform(loc=0.00001, scale=0.01),
    'learning_rate': ['constant', 'adaptive']
}

random_search_mlp = RandomizedSearchCV(estimator=MLPClassifier(max_iter=1000, random_state=42), 
                                       param_distributions=param_random_mlp, 
                                       n_iter=500, scoring='f1_macro', verbose=1,
                                       cv=5, n_jobs=60, random_state=42)

random_search_mlp.fit(X_train, y_train_encoded)
print('Best parameters for MLP:', random_search_mlp.best_params_)

y_pred_mlp = random_search_mlp.predict(X_test)
print('MLP Accuracy:', accuracy_score(y_test_encoded, y_pred_mlp))

best_mlp_model = random_search_mlp.best_estimator_
joblib.dump(best_mlp_model, '../../data/model_comp/neg_pred_result/best_mlp_model_random_neg.joblib')

In [22]:
print('kNN')
knn_result_list = cal_score(y_test_encoded, y_pred_knn)
print('Random Forest')
rf_result_list = cal_score(y_test_encoded, y_pred_rf)
print('SVM')
svm_result_list = cal_score(y_test_encoded, y_pred_svm)
print('XGBoost')
xgb_result_list = cal_score(y_test_encoded, y_pred_xgb)
print('MLP')
mlp_result_list = cal_score(y_test_encoded, y_pred_mlp)

kNN
Accuracy: 0.925990675990676
Precision: 0.8186401666241402
Recall: 0.7742998633351145
F1 Score: 0.7762772019020245
Random Forest
Accuracy: 0.9667832167832168
Precision: 0.8993323645085642
Recall: 0.8253045908392568
F1 Score: 0.8500052792103635
SVM
Accuracy: 0.9621212121212122
Precision: 0.8831005163794904
Recall: 0.8392577089137998
F1 Score: 0.8487249232203361
XGBoost
Accuracy: 0.9755244755244755
Precision: 0.9328796201852226
Recall: 0.8892637363935751
F1 Score: 0.8952263126905086
MLP
Accuracy: 0.9662004662004662
Precision: 0.8988951918165983
Recall: 0.8527392807521416
F1 Score: 0.8663300677146465


In [23]:
neg_result_list = [knn_result_list, rf_result_list, svm_result_list, xgb_result_list, mlp_result_list]

In [24]:
import pickle
with open('../../data/model_comp/neg_pred_result/neg_result_list_random.pkl', 'wb') as f:
    pickle.dump(neg_result_list, f)

In [25]:
import numpy as np
joblib.dump(label_encoder, '../../data/model_comp/neg_pred_result/label_encoder_neg_random.pkl')

np.save('../../data/model_comp/neg_pred_result/y_test_encoded_neg_random.npy', y_test_encoded)
np.save('../../data/model_comp/neg_pred_result/y_pred_knn_neg_random.npy', y_pred_knn)
np.save('../../data/model_comp/neg_pred_result/y_pred_rf_neg_random.npy', y_pred_rf)
np.save('../../data/model_comp/neg_pred_result/y_pred_svm_neg_random.npy', y_pred_svm)
np.save('../../data/model_comp/neg_pred_result/y_pred_xgb_neg_random.npy', y_pred_xgb)
np.save('../../data/model_comp/neg_pred_result/y_pred_mlp_neg_random.npy', y_pred_mlp)