In [1]:
import torch
print(torch.cuda.is_available())

True


In [2]:
import pandas as pd
import joblib
import re

In [3]:
def calculate_average_atomic_mass(molecular_formula):
    # Define mass
    average_atomic_masses = {'H': 1.007825, 'C': 12.000000, 'O':15.994915}
    # Calculate atomic mass from molecular formula
    elements_with_counts = re.findall(r'([A-Z][a-z]*)(\d*)', molecular_formula)
    element_counts = {element[0]: int(element[1]) if element[1] else 1 for element in elements_with_counts}
    average_atomic_mass = sum(element_counts[element] * average_atomic_masses[element] for element in element_counts)
    return average_atomic_mass
    
def cal_mod(averagemz):
    num = ((averagemz % calculate_average_atomic_mass('CH2')) % calculate_average_atomic_mass('H2')) % (calculate_average_atomic_mass('H14') % calculate_average_atomic_mass('CH2')) 
    return num

In [4]:
def create_pred_class(row):
    classes = [f"{col}:{round(value, 3)}" for col, value in sorted(row.items(), key=lambda x: x[1], reverse=True) if value >= 0.01] #1%
    return ','.join(classes)

def check_ontology_in_pred_class(row):
    ontology = row['Ontology']
    predclass = row['predclass']
    return ontology in predclass

def cal_accscore(df, loaded_model, number_to_class):

    X1_test = df.drop(columns=columns_to_drop).values

    y_pred_test = loaded_model.predict_proba(X1_test)
    df_predres = pd.DataFrame(y_pred_test).rename(columns=number_to_class)
    df_predres['pred1st'] = df_predres.idxmax(axis=1)
    df_predres['predclass'] = df_predres.drop('pred1st', axis=1).apply(create_pred_class, axis=1)
    df_predres['prednumber'] = df_predres['predclass'].apply(lambda x: len(x.split(',')))

    df_result = df_predres[['pred1st', 'predclass', 'prednumber']]

    return df_result

pos

In [15]:
df_pos = pd.read_csv('../../data/HCdata/df_HCtable_pos.csv')
df_pos['MCHvalue'] = cal_mod(df_pos['AverageMz'])

loaded_model_pos = joblib.load('../../data/model_comp/pos_pred_result/best_xgb_model_random_pos.joblib')
label_encoder_pos = joblib.load('../../data/model_comp/pos_pred_result/label_encoder_pos_random.pkl')
number_to_class_pos = {index: label for index, label in enumerate(label_encoder_pos.classes_)}
df_feature_pos = pd.read_csv('../../data/model_comp/pos_pred_result/df_feature_pos.csv')

columns_to_drop = ['Metabolitename', 'Ontology', 'dataset', 'AlignmentID', 'AverageMz']
df_pos2 = df_pos.set_index(columns_to_drop)[df_feature_pos.feature].reset_index()

  df_pos = pd.read_csv('../../data/HCdata/df_HCtable_pos.csv')


In [6]:
df_pos_pred = cal_accscore(df_pos2, loaded_model_pos, number_to_class_pos)
df_pos_result = df_pos_pred.merge(df_pos2[columns_to_drop], left_index=True, right_index=True)
df_pos_result.to_csv('../../data/HCdata/df_pos_pred_res.csv', index=False)
df_pos_result[~df_pos_result['Metabolitename'].str.contains('low score:')].to_csv('../../data/HCdata/df_pos_pred_class.csv', index=False)

In [35]:
df_pos_result_con = df_pos_result[df_pos_result['Ontology'].isin(label_encoder_pos.classes_.tolist()) & 
                            ~df_pos_result['Metabolitename'].str.contains('low score:')]
value_pred1st = (df_pos_result_con[df_pos_result_con['pred1st'] == df_pos_result_con['Ontology']].shape[0] / df_pos_result_con.shape[0])*100
value_predclass = (df_pos_result_con.apply(check_ontology_in_pred_class, axis=1).sum() / df_pos_result_con.shape[0])*100

print(f'common anotation: correct {value_pred1st:.2f}%, candidates: {value_predclass:.2f}%')

common anotation: correct 65.42%, candidates: 86.87%


neg

In [10]:
df_neg = pd.read_csv('../../data/HCdata/df_HCtable_neg.csv')
df_neg['MCHvalue'] = cal_mod(df_neg['AverageMz'])

loaded_model_neg = joblib.load('../../data/model_comp/neg_pred_result/best_xgb_model_random_neg.joblib')
label_encoder_neg = joblib.load('../../data/model_comp/neg_pred_result/label_encoder_neg_random.pkl')
number_to_class_neg = {index: label for index, label in enumerate(label_encoder_neg.classes_)}
df_feature_neg = pd.read_csv('../../data/model_comp/neg_pred_result/df_feature_neg.csv')

columns_to_drop = ['Metabolitename', 'Ontology', 'dataset', 'AlignmentID', 'AverageMz']
df_neg2 = df_neg.set_index(columns_to_drop)[df_feature_neg.feature].reset_index()

  df_neg = pd.read_csv('../../data/HCdata/df_HCtable_neg.csv')


In [14]:
df_neg_pred = cal_accscore(df_neg2, loaded_model_neg, number_to_class_neg)
df_neg_result = df_neg_pred.merge(df_neg2[columns_to_drop], left_index=True, right_index=True)
df_neg_result.to_csv('../../data/HCdata/df_neg_pred_res.csv', index=False)
df_neg_result[~df_neg_result['Metabolitename'].str.contains('low score:')].to_csv('../../data/HCdata/df_neg_pred_class.csv', index=False)

In [13]:
df_neg_result_con = df_neg_result[df_neg_result['Ontology'].isin(label_encoder_neg.classes_.tolist()) & 
                            ~df_neg_result['Metabolitename'].str.contains('low score:')]
value_pred1st = (df_neg_result_con[df_neg_result_con['pred1st'] == df_neg_result_con['Ontology']].shape[0] / df_neg_result_con.shape[0])*100
value_predclass = (df_neg_result_con.apply(check_ontology_in_pred_class, axis=1).sum() / df_neg_result_con.shape[0])*100

print(f'common anotation: correct {value_pred1st:.2f}%, candidates: {value_predclass:.2f}%')

common anotation: correct 69.94%, candidates: 90.45%
