In [1]:
import pandas as pd

pos: models trained and evaluated on `df_pos_MCH.csv`

In [2]:
from sklearn.model_selection import train_test_split

# Read the "pos" table from disk.
df_pos2x = pd.read_csv('../../data/basedata/df_pos_MCH.csv')

# Columns excluded from the feature matrix (this includes the target itself);
# every remaining column is cast to float and used as a feature.
columns_to_drop = ['Metabolitename', 'Ontology', 'dataset', 'AlignmentID', 'AverageMz']

# Target: the 'Ontology' column.
y = df_pos2x['Ontology']
X = (
    df_pos2x
    .drop(columns=columns_to_drop)
    .astype(float)
    .values
)
print(len(y.unique()))  # number of distinct target classes

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

63


Classifiers compared: kNN, Random Forest (RF), SVM, XGBoost, and MLP, each tuned with a 5-fold cross-validated grid search.

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib


# Map the string class labels to integer codes. The encoder is fitted on the
# training labels only and then applied to the held-out labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# 5-fold grid search over the number of neighbours (7 candidates).
param_grid_knn = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15]}
# n_jobs=-1 lets joblib use every available core. The previous hard-coded
# worker count (60) was specific to one machine and oversubscribes smaller
# hosts; the search results do not depend on the number of workers.
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5, n_jobs=-1)

grid_search_knn.fit(X_train, y_train_encoded)
print('Best parameters for kNN:', grid_search_knn.best_params_)

# Score the refitted best model on the held-out split.
y_pred_knn = grid_search_knn.predict(X_test)
print('kNN Accuracy:', accuracy_score(y_test_encoded, y_pred_knn))

# Persist the best estimator for later reuse.
best_knn_model = grid_search_knn.best_estimator_
joblib.dump(best_knn_model, '../../data/model_comp/pos_pred_result/best_knn_model_pos.joblib')

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest search space: 3 forest sizes x 4 depth limits = 12 candidates.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
)

# 5-fold cross-validated grid search with a fixed forest seed.
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
)
grid_search_rf.fit(X_train, y_train_encoded)
print('Best parameters for Random Forest:', grid_search_rf.best_params_)

# Evaluate the refitted best model on the held-out split.
y_pred_rf = grid_search_rf.predict(X_test)
print(
    'Random Forest Accuracy:',
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred_rf),
)

# Keep the winning estimator and write it to disk.
best_rf_model = grid_search_rf.best_estimator_
joblib.dump(best_rf_model, '../../data/model_comp/pos_pred_result/best_rf_model_pos.joblib')

In [6]:
from sklearn.svm import SVC

# SVM search space: 4 regularisation strengths x 2 kernels = 8 candidates.
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf'],
)

# 5-fold cross-validated grid search.
grid_search_svm = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=param_grid_svm,
    cv=5,
)
grid_search_svm.fit(X_train, y_train_encoded)
print('Best parameters for SVM:', grid_search_svm.best_params_)

# Evaluate the refitted best model on the held-out split.
y_pred_svm = grid_search_svm.predict(X_test)
print(
    'SVM Accuracy:',
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred_svm),
)

# Keep the winning estimator and write it to disk.
best_svm_model = grid_search_svm.best_estimator_
joblib.dump(best_svm_model, '../../data/model_comp/pos_pred_result/best_svm_model_pos.joblib')

In [7]:
import xgboost as xgb

# XGBoost search space: 3 x 3 x 3 = 27 candidates.
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.3],
)

# 5-fold cross-validated grid search; labels are already integer-encoded,
# so XGBoost's own label encoder is switched off.
grid_search_xgb = GridSearchCV(
    estimator=xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42,
    ),
    param_grid=param_grid_xgb,
    cv=5,
)
grid_search_xgb.fit(X_train, y_train_encoded)
print('Best parameters for XGBoost:', grid_search_xgb.best_params_)

# Evaluate the refitted best model on the held-out split.
y_pred_xgb = grid_search_xgb.predict(X_test)
print(
    'XGBoost Accuracy:',
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred_xgb),
)

# Keep the winning estimator and write it to disk.
best_xgb_model = grid_search_xgb.best_estimator_
joblib.dump(best_xgb_model, '../../data/model_comp/pos_pred_result/best_xgb_model_pos.joblib')

In [8]:
from sklearn.neural_network import MLPClassifier

# MLP search space: 3 architectures x 2 activations x 2 solvers
# x 2 L2 penalties x 2 learning-rate schedules = 48 candidates.
param_grid_mlp = dict(
    hidden_layer_sizes=[(50,), (100,), (100, 50)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.001],
    learning_rate=['constant', 'adaptive'],
)

# 5-fold cross-validated grid search; each network trains for up to 1000 iterations.
grid_search_mlp = GridSearchCV(
    estimator=MLPClassifier(max_iter=1000, random_state=42),
    param_grid=param_grid_mlp,
    cv=5,
)
grid_search_mlp.fit(X_train, y_train_encoded)
print('Best parameters for MLP:', grid_search_mlp.best_params_)

# Evaluate the refitted best model on the held-out split.
y_pred_mlp = grid_search_mlp.predict(X_test)
print(
    'MLP Accuracy:',
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred_mlp),
)

# Keep the winning estimator and write it to disk.
best_mlp_model = grid_search_mlp.best_estimator_
joblib.dump(best_mlp_model, '../../data/model_comp/pos_pred_result/best_mlp_model_pos.joblib')

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def cal_score(y_test, y_pred_rf):
    """Print and return accuracy plus macro-averaged precision, recall and F1.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred_rf : array-like
        Predicted labels. Despite the name, predictions from any classifier
        can be passed; the name is kept so existing callers keep working.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]`` in that order.
    """
    accuracy = accuracy_score(y_test, y_pred_rf)
    # zero_division=0 is passed to all three macro metrics so that a class with
    # no predicted (or no true) samples contributes 0 without emitting an
    # UndefinedMetricWarning; previously only precision suppressed the warning.
    precision = precision_score(y_test, y_pred_rf, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred_rf, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred_rf, average='macro', zero_division=0)
    # Named `scores` rather than `list` to avoid shadowing the builtin.
    scores = [accuracy, precision, recall, f1]

    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1)

    return scores

In [22]:
# Score every model's held-out predictions with cal_score, printing the model
# name before each metric block, in the order kNN, RF, SVM, XGBoost, MLP.
labelled_predictions = [
    ('kNN', y_pred_knn),
    ('Random Forest', y_pred_rf),
    ('SVM', y_pred_svm),
    ('XGBoost', y_pred_xgb),
    ('MLP', y_pred_mlp),
]

collected_scores = []
for model_label, model_predictions in labelled_predictions:
    print(model_label)
    collected_scores.append(cal_score(y_test_encoded, model_predictions))

# Unpack into the per-model names used by the following cells.
(
    knn_result_list,
    rf_result_list,
    svm_result_list,
    xgb_result_list,
    mlp_result_list,
) = collected_scores

kNN
Accuracy: 0.8732227488151659
Precision: 0.827201275233229
Recall: 0.6885202147399908
F1 Score: 0.7220970680161969
Random Forest
Accuracy: 0.9259478672985783
Precision: 0.8205742398137006
Recall: 0.731809722718779
F1 Score: 0.7627713562103623
SVM
Accuracy: 0.9342417061611374
Precision: 0.9065703972592458
Recall: 0.8825678092003171
F1 Score: 0.8890054773268861
XGBoost
Accuracy: 0.9691943127962085
Precision: 0.9201977738689365
Recall: 0.8819468418728298
F1 Score: 0.8944619601208434
MLP
Accuracy: 0.9354265402843602
Precision: 0.9007847040270226
Recall: 0.8478168306914614
F1 Score: 0.8654859787888672


In [23]:
# One score list per model, ordered kNN, RF, SVM, XGBoost, MLP.
pos_result_list = [
    knn_result_list,
    rf_result_list,
    svm_result_list,
    xgb_result_list,
    mlp_result_list,
]

In [26]:
import pickle
# Serialise the collected pos scores to disk with pickle.
with open(
    '../../data/model_comp/pos_pred_result/pos_result_list.pkl',
    'wb',
) as result_file:
    pickle.dump(pos_result_list, result_file)

In [16]:
import numpy as np
# Persist the fitted label encoder so integer predictions can be mapped back to class names.
joblib.dump(label_encoder, '../../data/model_comp/pos_pred_result/label_encoder_pos.pkl')

# Store the encoded ground truth and every model's predictions as .npy arrays.
for npy_path, label_array in (
    ('../../data/model_comp/pos_pred_result/y_test_encoded_pos.npy', y_test_encoded),
    ('../../data/model_comp/pos_pred_result/y_pred_knn_pos.npy', y_pred_knn),
    ('../../data/model_comp/pos_pred_result/y_pred_rf_pos.npy', y_pred_rf),
    ('../../data/model_comp/pos_pred_result/y_pred_svm_pos.npy', y_pred_svm),
    ('../../data/model_comp/pos_pred_result/y_pred_xgb_pos.npy', y_pred_xgb),
    ('../../data/model_comp/pos_pred_result/y_pred_mlp_pos.npy', y_pred_mlp),
):
    np.save(npy_path, label_array)

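neg: the same pipeline repeated on `df_neg_MCH.csv`. It reuses `cal_score` from the pos section and overwrites the shared variables (`X_train`, `y_test_encoded`, `y_pred_*`, ...), so run the pos section first.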

In [9]:
from sklearn.model_selection import train_test_split

# Read the "neg" table from disk.
df_neg2x = pd.read_csv('../../data/basedata/df_neg_MCH.csv')

# Columns excluded from the feature matrix (this includes the target itself);
# every remaining column is cast to float and used as a feature.
columns_to_drop = ['Metabolitename', 'Ontology', 'dataset', 'AlignmentID', 'AverageMz']

# Target: the 'Ontology' column.
y = df_neg2x['Ontology']
X = (
    df_neg2x
    .drop(columns=columns_to_drop)
    .astype(float)
    .values
)
print(len(y.unique()))  # number of distinct target classes

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib


# Map the string class labels to integer codes. The encoder is fitted on the
# training labels only and then applied to the held-out labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# kNN search space: 7 candidate neighbourhood sizes.
param_grid_knn = dict(n_neighbors=[3, 5, 7, 9, 11, 13, 15])

# 5-fold cross-validated grid search.
grid_search_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    cv=5,
)
grid_search_knn.fit(X_train, y_train_encoded)
print('Best parameters for kNN:', grid_search_knn.best_params_)

# Evaluate the refitted best model on the held-out split.
y_pred_knn = grid_search_knn.predict(X_test)
print(
    'kNN Accuracy:',
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred_knn),
)

# Keep the winning estimator and write it to disk.
best_knn_model = grid_search_knn.best_estimator_
joblib.dump(best_knn_model, '../../data/model_comp/neg_pred_result/best_knn_model_neg.joblib')

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest search space: 3 forest sizes x 4 depth limits = 12 candidates.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
)

# 5-fold cross-validated grid search with a fixed forest seed.
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
)
grid_search_rf.fit(X_train, y_train_encoded)
print('Best parameters for Random Forest:', grid_search_rf.best_params_)

# Evaluate the refitted best model on the held-out split.
y_pred_rf = grid_search_rf.predict(X_test)
print(
    'Random Forest Accuracy:',
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred_rf),
)

# Keep the winning estimator and write it to disk.
best_rf_model = grid_search_rf.best_estimator_
joblib.dump(best_rf_model, '../../data/model_comp/neg_pred_result/best_rf_model_neg.joblib')

In [12]:
from sklearn.svm import SVC

# SVM search space: 4 regularisation strengths x 2 kernels = 8 candidates.
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf'],
)

# 5-fold cross-validated grid search.
grid_search_svm = GridSearchCV(
    estimator=SVC(random_state=42),
    param_grid=param_grid_svm,
    cv=5,
)
grid_search_svm.fit(X_train, y_train_encoded)
print('Best parameters for SVM:', grid_search_svm.best_params_)

# Evaluate the refitted best model on the held-out split.
y_pred_svm = grid_search_svm.predict(X_test)
print(
    'SVM Accuracy:',
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred_svm),
)

# Keep the winning estimator and write it to disk.
best_svm_model = grid_search_svm.best_estimator_
joblib.dump(best_svm_model, '../../data/model_comp/neg_pred_result/best_svm_model_neg.joblib')

In [13]:
import xgboost as xgb

# XGBoost search space: 3 x 3 x 3 = 27 candidates.
param_grid_xgb = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.1, 0.3],
)

# 5-fold cross-validated grid search; labels are already integer-encoded,
# so XGBoost's own label encoder is switched off.
grid_search_xgb = GridSearchCV(
    estimator=xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42,
    ),
    param_grid=param_grid_xgb,
    cv=5,
)
grid_search_xgb.fit(X_train, y_train_encoded)
print('Best parameters for XGBoost:', grid_search_xgb.best_params_)

# Evaluate the refitted best model on the held-out split.
y_pred_xgb = grid_search_xgb.predict(X_test)
print(
    'XGBoost Accuracy:',
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred_xgb),
)

# Keep the winning estimator and write it to disk.
best_xgb_model = grid_search_xgb.best_estimator_
joblib.dump(best_xgb_model, '../../data/model_comp/neg_pred_result/best_xgb_model_neg.joblib')

In [14]:
from sklearn.neural_network import MLPClassifier

# MLP search space: 3 architectures x 2 activations x 2 solvers
# x 2 L2 penalties x 2 learning-rate schedules = 48 candidates.
param_grid_mlp = dict(
    hidden_layer_sizes=[(50,), (100,), (100, 50)],
    activation=['tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.001],
    learning_rate=['constant', 'adaptive'],
)

# 5-fold cross-validated grid search; each network trains for up to 1000 iterations.
grid_search_mlp = GridSearchCV(
    estimator=MLPClassifier(max_iter=1000, random_state=42),
    param_grid=param_grid_mlp,
    cv=5,
)
grid_search_mlp.fit(X_train, y_train_encoded)
print('Best parameters for MLP:', grid_search_mlp.best_params_)

# Evaluate the refitted best model on the held-out split.
y_pred_mlp = grid_search_mlp.predict(X_test)
print(
    'MLP Accuracy:',
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred_mlp),
)

# Keep the winning estimator and write it to disk.
best_mlp_model = grid_search_mlp.best_estimator_
joblib.dump(best_mlp_model, '../../data/model_comp/neg_pred_result/best_mlp_model_neg.joblib')

In [None]:
# Score every model's held-out predictions with cal_score, printing the model
# name before each metric block, in the order kNN, RF, SVM, XGBoost, MLP.
labelled_predictions = [
    ('kNN', y_pred_knn),
    ('Random Forest', y_pred_rf),
    ('SVM', y_pred_svm),
    ('XGBoost', y_pred_xgb),
    ('MLP', y_pred_mlp),
]

collected_scores = []
for model_label, model_predictions in labelled_predictions:
    print(model_label)
    collected_scores.append(cal_score(y_test_encoded, model_predictions))

# Unpack into the per-model names used by the following cells.
(
    knn_result_list,
    rf_result_list,
    svm_result_list,
    xgb_result_list,
    mlp_result_list,
) = collected_scores

In [None]:
# One score list per model, ordered kNN, RF, SVM, XGBoost, MLP.
neg_result_list = [
    knn_result_list,
    rf_result_list,
    svm_result_list,
    xgb_result_list,
    mlp_result_list,
]

In [None]:
import pickle
# Serialise the collected neg scores to disk with pickle.
with open(
    '../../data/model_comp/neg_pred_result/neg_result_list.pkl',
    'wb',
) as result_file:
    pickle.dump(neg_result_list, result_file)

In [None]:
import numpy as np
# Persist the fitted label encoder so integer predictions can be mapped back to class names.
joblib.dump(label_encoder, '../../data/model_comp/neg_pred_result/label_encoder_neg.pkl')

# Store the encoded ground truth and every model's predictions as .npy arrays.
for npy_path, label_array in (
    ('../../data/model_comp/neg_pred_result/y_test_encoded_neg.npy', y_test_encoded),
    ('../../data/model_comp/neg_pred_result/y_pred_knn_neg.npy', y_pred_knn),
    ('../../data/model_comp/neg_pred_result/y_pred_rf_neg.npy', y_pred_rf),
    ('../../data/model_comp/neg_pred_result/y_pred_svm_neg.npy', y_pred_svm),
    ('../../data/model_comp/neg_pred_result/y_pred_xgb_neg.npy', y_pred_xgb),
    ('../../data/model_comp/neg_pred_result/y_pred_mlp_neg.npy', y_pred_mlp),
):
    np.save(npy_path, label_array)