# msemblator corect count

In [7]:
from rdkit import Chem
from rdkit.Chem import inchi
import pandas as pd

def smiles_to_shortinchikey(smiles):
    """Convert SMILES to short InChIKey (first 14 characters)."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        inchikey = inchi.MolToInchiKey(mol)
        short_inchikey = inchikey[:14]
        return short_inchikey
    except:
        return None
    
def msemblator_output_tf(msemblator_output_path,answer_path):
    ext = answer_path.split('.')[-1]
    if ext == 'csv':
        answer_df = pd.read_csv(answer_path)
    elif ext in ['tsv', 'txt']:
        answer_df = pd.read_table(answer_path)
    msemblator_output_df = pd.read_csv(msemblator_output_path)
    for i in range(5):
        msemblator_output_df[f'msfinder_shortinchikey_{i+1}'] = msemblator_output_df[f'msfinder_structure_{i+1}'].apply(smiles_to_shortinchikey)
        msemblator_output_df[f'sirius_shortinchikey_{i+1}'] = msemblator_output_df[f'sirius_structure_{i+1}'].apply(smiles_to_shortinchikey)
        msemblator_output_df[f'metfrag_shortinchikey_{i+1}'] = msemblator_output_df[f'metfrag_structure_{i+1}'].apply(smiles_to_shortinchikey)
    msemblator_output_df['Top_score_shortinchikey'] = msemblator_output_df['Canonical_SMILES'].apply(smiles_to_shortinchikey)
    merged_df = pd.merge(msemblator_output_df, answer_df, on='filename', how='left')
    if 'InChIKey14' in merged_df.columns:
        merged_df['key_for_match'] = merged_df['InChIKey14']
    else:
        merged_df['key_for_match'] = merged_df['shortinchikey']
    for i in range(1,6):
        merged_df[f'msfinder_match_{i}'] = (merged_df['key_for_match'] == merged_df[f'msfinder_shortinchikey_{i}']).astype(int)
        merged_df[f'sirius_match_{i}']  = (merged_df['key_for_match'] == merged_df[f'sirius_shortinchikey_{i}']).astype(int)
        merged_df[f'metfrag_match_{i}'] = (merged_df['key_for_match'] == merged_df[f'metfrag_shortinchikey_{i}']).astype(int)
        merged_df['Top_score_match'] = (merged_df['key_for_match'] == merged_df['Top_score_shortinchikey']).astype(int)
    print(f'Top score correct indentifications: {merged_df["Top_score_match"].sum()}')
    print(f'msfinder correct indentifications in top 1: {merged_df["msfinder_match_1"].sum()}')
    print(f'sirius correct indentifications in top 1: {merged_df["sirius_match_1"].sum()}')
    print(f'metfrag correct indentifications in top 1: {merged_df["metfrag_match_1"].sum()}')
    print(f'msfinder correct indentifications in top 5: {merged_df[[f"msfinder_match_{i}" for i in range(1,6)]].sum().sum()}')
    print(f'sirius correct indentifications in top 5: {merged_df[[f"sirius_match_{i}" for i in range(1,6)]].sum().sum()}')
    print(f'metfrag correct indentifications in top 5: {merged_df[[f"metfrag_match_{i}" for i in range(1,6)]].sum().sum()}')
    return merged_df

In [None]:
def msemblator_output_scoredf_tf(msemblator_output_path,answer_path):
    ext = answer_path.split('.')[-1]
    if ext == 'csv':
        answer_df = pd.read_csv(answer_path)
    elif ext in ['tsv', 'txt']:
        answer_df = pd.read_table(answer_path)
    msemblator_output_df = pd.read_csv(msemblator_output_path)
    msemblator_output_df['msemblator_shortinchikey'] = msemblator_output_df['Canonical_SMILES'].apply(smiles_to_shortinchikey)
    merged_df = pd.merge(msemblator_output_df, answer_df, left_on='filename', right_on='Name')
    merged_df['TF'] = (merged_df['msemblator_shortinchikey'] == merged_df['InChIKey14']).astype(int)
    print(f'Total correct indentifications: {merged_df["TF"].sum()} out of {len(merged_df)}')

    return merged_df

In [10]:
a = msemblator_output_tf(r"D:\nist2023\inchikeyfilter\top100\structure\result\casmi_structure_result_random.csv",r"D:\HMT\CASMI2022DATA\CASMI2022_answer\casmianswer_all.txt")
a.to_csv(r"D:\nist2023\inchikeyfilter\top100\structure\result\casmi_structure_summary_output_score.csv", index=False)

[15:50:51] Explicit valence for atom # 7 N, 4, is greater than permitted


Top score correct indentifications: 130
msfinder correct indentifications in top 1: 69
sirius correct indentifications in top 1: 122
metfrag correct indentifications in top 1: 87
msfinder correct indentifications in top 5: 183
sirius correct indentifications in top 5: 189
metfrag correct indentifications in top 5: 171


In [14]:
import pandas as pd
def msemblator_output_formula_tf(msemblator_output_path,answer_path):
    ext = answer_path.split('.')[-1]
    if ext == 'csv':
        answer_df = pd.read_csv(answer_path)
    elif ext in ['tsv', 'txt']:
        answer_df = pd.read_table(answer_path)
    msemblator_output_df = pd.read_csv(msemblator_output_path)
    # msemblator_output_df['msemblator_shortinchikey'] = msemblator_output_df['Canonical_SMILES'].apply(smiles_to_shortinchikey)
    merged_df = pd.merge(msemblator_output_df, answer_df, left_on='filename', right_on='Name')
    merged_df['TF'] = (merged_df['formula'] == merged_df['Formula']).astype(int)
    print(f'Total correct indentifications: {merged_df["TF"].sum()} out of {len(merged_df)}')

    return merged_df

In [18]:
merged_df = msemblator_output_formula_tf(r"D:\nist2023\inchikeyfilter\top100\formula\new_model\result\testing_formula_log.csv",
                                        r"D:\nist2023\inchikeyfilter\top100\answer_inchikey2023_filtered.csv")
merged_df.to_csv(r"D:\nist2023\inchikeyfilter\top100\formula\new_model\result\testing_formula_log_TF.csv",index=False)

Total correct indentifications: 874 out of 69208


In [None]:
# merged_df = msemblator_output_tf(r"D:\sawai\mori\structure_summary.csv", r"D:\HMT\CASMI2022DATA\CASMI2022_answer\casmianswer_all.txt")
# merged_df = msemblator_output_scoredf_tf(r'D:\nist2023\inchikeyfilter\top100\structure\result\testing_data_structure_xgboost.csv',
#                                         r"D:\nist2023\inchikeyfilter\top100\answer_inchikey2023_filtered.csv")
# merged_df.to_csv(r'D:\nist2023\inchikeyfilter\top100\structure\result\testing_data_structure_xgboost_with_TF.csv',index=False)
# filtered_df = merged_df[merged_df["metfrag_match_1"] == 1]
# filtered_df

Total correct indentifications: 890 out of 27905


# msemblator accuracy

In [None]:
from rdkit import Chem
from rdkit.Chem import inchi
import pandas as pd

def inchikey_from_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    inchi_str = inchi.MolToInchi(mol)
    inchikey = inchi.InchiToInchiKey(inchi_str)
    inchikey = inchikey[:14]
    return inchikey


output_structure_score_path = r"D:\sawai\mori\structure_score.csv"
casmi_answer_path = r"D:\HMT\CASMI2022DATA\CASMI2022_answer\casmianswer_all.txt"
structure_score_df = pd.read_csv(output_structure_score_path)
casmi_answer_df = pd.read_table(casmi_answer_path)

structure_score_df = structure_score_df.astype(str)
casmi_answer_df = casmi_answer_df.astype(str)
structure_score_top_df = structure_score_df[structure_score_df['rank'] == '1']
merged_df = pd.merge(structure_score_top_df, casmi_answer_df, left_on='filename', right_on='compoundnumber')
merged_df['shortinchikey_msemblator'] = merged_df['Canonical_SMILES'].apply(inchikey_from_smiles)
merged_df['TF'] = (merged_df['shortinchikey_msemblator'] == merged_df['shortinchikey']).astype(int)
accuracy = merged_df['TF'].sum() / len(merged_df)
print(f"Top-1 accuracy: {accuracy:.4f}")
alkaloid_df = merged_df[merged_df['superclass'] == 'Alkaloids and derivatives']
