In [None]:
import os
import shutil
import getpass
import pandas as pd
import time
import logging
from metfrag_file_processing import creat_metfrag_file
from metfrag_struc_cmd import run_metfrag_command
from splitting_msp import read_msp
from msfinder_struc_cmd import run_msfinder
from msp_to_ms import convert_msp_file_to_ms
from sirius_struc_cmd import sirius_login, run_sirius_struc
from creating_struc_summary import struc_summary
from struc_utility import clear_folder, clear_folder_except, save_file, generate_unique_filename
from struc_score_normalization import ClippingTransformer
from msfinder_struc_summary import process_msfinder_output
from sirius_struc_summary import process_sirius_output
from struc_score_calc import predict_and_append, aggregate_probability_with_rank
from metfrag_summary import process_metfrag_output


# sirius structure

In [None]:
import os
import glob
import pandas as pd
import joblib
from convert_struc_data_type import normalize_rank,smiles_list_to_inchikeys
from struc_score_normalization import ClippingTransformer

def process_sirius_output(sirius_folder, machine_dir, name_adduct_df, 
                          summary_inchikey_df, summary_smiles_df, 
                          class_summary_df, smiles_score_df, top_n=3):
    """
    Processes SIRIUS output and generates updated InChIKey, SMILES, score, and classification data.

    Every ``<sirius_folder>/*/structure_candidates.tsv`` is read, candidates are
    ranked per compound in file order, the ``top_n`` best are kept, and the
    results are merged/appended onto the summary tables that were passed in.

    Parameters:
        sirius_folder (str): Directory containing SIRIUS output files.
        machine_dir (str): Directory containing score normalization pipelines
            (``pipeline_CSI_FingerIDScore.pkl`` and ``pipeline_sirius_score_diff.pkl``).
        name_adduct_df (pd.DataFrame): DataFrame mapping filenames to adducts.
        summary_inchikey_df (pd.DataFrame): Existing InChIKey summary DataFrame.
        summary_smiles_df (pd.DataFrame): Existing SMILES summary DataFrame.
        class_summary_df (pd.DataFrame): Existing classification summary DataFrame.
        smiles_score_df (pd.DataFrame): Existing score summary DataFrame.
        top_n (int): Number of top-ranked candidates kept per compound.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        tuple: (sirius_inchikey_df, sirius_smiles_df, class_summary_df, smiles_score_df).
        When no SIRIUS file is found or readable, the four input summary
        DataFrames are returned unchanged.
    """

    # Retrieve SIRIUS output files (one structure_candidates.tsv per sub-folder)
    sirius_paths = glob.glob(f"{sirius_folder}/*/structure_candidates.tsv")

    if not sirius_paths:
        print(f"No SIRIUS files found in {sirius_folder}")
        return summary_inchikey_df, summary_smiles_df, class_summary_df, smiles_score_df

    data_frames = []
    for file in sirius_paths:
        # Best effort: an unreadable file is reported and skipped, not fatal.
        try:
            df = pd.read_csv(file, sep='\t')
        except Exception as e:
            print(f"Error reading {file}: {e}")
            continue
        # The compound name is the last "_"-separated token of the sub-folder name.
        df['filename'] = os.path.basename(os.path.dirname(file)).split('_')[-1]
        data_frames.append(df)

    if not data_frames:
        return summary_inchikey_df, summary_smiles_df, class_summary_df, smiles_score_df

    # Combine all files into a single DataFrame
    combined_data = pd.concat(data_frames, ignore_index=True)

    # Assign rank: 1-based position of each row within its file, in file order
    combined_data["rank"] = combined_data.groupby("filename").cumcount() + 1

    # Determine score column
    score_column = "CSI:FingerIDScore" if "CSI:FingerIDScore" in combined_data.columns else "score"

    # Compute score difference to the next candidate of the same compound.
    # The column is seeded with a float: seeding it with the integer 0 and then
    # writing float differences triggers pandas' incompatible-dtype warning.
    combined_data["score_diff"] = 0.0
    next_rank = combined_data["rank"].shift(-1)
    # True only when the following row is the next rank of the same compound;
    # the last row of each compound (and of the table) keeps a difference of 0.
    has_next_candidate = next_rank == combined_data["rank"] + 1
    score_gap = combined_data[score_column] - combined_data[score_column].shift(-1)
    combined_data.loc[has_next_candidate, "score_diff"] = score_gap[has_next_candidate]
    combined_data["score_diff"] = combined_data["score_diff"].fillna(0)

    # Adduct replacement
    # NOTE(review): the second mapping turns a negative-mode "- H]-" adduct into
    # "[M+FA+H]+"; kept byte-identical, but confirm this is the intended label.
    replace_dict = {
        r"\[M \+ H3N \+ H\]\+": "[M+NH4]+",
        r"\[M \+ CH2O2 - H\]-": "[M+FA+H]+"
    }
    adducts = combined_data["adduct"].fillna("")
    for pattern, replacement in replace_dict.items():
        adducts = adducts.str.replace(pattern, replacement, regex=True)
    combined_data["adduct"] = adducts

    # Load score normalization pipelines
    # NOTE: joblib.load unpickles arbitrary objects; only point machine_dir at trusted files.
    sirius_score_pipeline_path = os.path.join(machine_dir, "pipeline_CSI_FingerIDScore.pkl")
    sirius_SD_pipeline_path = os.path.join(machine_dir, "pipeline_sirius_score_diff.pkl")

    score_pipeline = joblib.load(sirius_score_pipeline_path)
    SD_pipeline = joblib.load(sirius_SD_pipeline_path)

    # Select the top_n ranked candidates of every compound
    filtered_df = combined_data.groupby('filename').head(top_n).copy()

    # Normalize scores
    filtered_df["normalization_Zscore"] = score_pipeline.transform(filtered_df[[score_column]])
    filtered_df["normalization_z_score_diff"] = SD_pipeline.transform(filtered_df[["score_diff"]])

    # Prepare score calculation DataFrame (built before the string conversion
    # below so the numeric columns stay numeric)
    sirius_score_calc_df = filtered_df[["filename", "adduct", "rank", "smiles", "normalization_Zscore", "normalization_z_score_diff"]].copy()
    sirius_score_calc_df = sirius_score_calc_df.rename(columns={"smiles": "SMILES"})
    sirius_score_calc_df["tool_name"] = "sirius"

    # Map adducts from `name_adduct_df` (overrides the adduct reported by SIRIUS)
    sirius_score_calc_df['adduct'] = sirius_score_calc_df['filename'].map(name_adduct_df.set_index('filename')['adduct'])

    # Apply rank normalization function
    # NOTE(review): presumably adds "normalized_rank" in place — confirm against convert_struc_data_type.
    normalize_rank(sirius_score_calc_df)

    # Convert SMILES to InChIKey
    filtered_df["InChIKey"] = smiles_list_to_inchikeys(filtered_df["smiles"])

    # Fill missing values BEFORE casting to str: astype(str) turns NaN/None into
    # the literal strings 'nan'/'None', after which fillna('') has nothing to fill.
    structure_df = filtered_df[["filename", "rank", "InChIKey", "smiles"]].fillna('').astype(str)

    # Pivot InChIKey and SMILES data: one row per compound, one column per rank
    inchikey_pivot = structure_df.pivot(index="filename", columns="rank", values="InChIKey")
    smiles_pivot = structure_df.pivot(index="filename", columns="rank", values="smiles")

    inchikey_pivot.columns = [f'sirius_structure_{rank}' for rank in inchikey_pivot.columns]
    smiles_pivot.columns = [f'sirius_structure_{rank}' for rank in smiles_pivot.columns]

    # Merge InChIKey data
    sirius_inchikey_df = summary_inchikey_df.merge(inchikey_pivot.reset_index(), on=["filename"], how="outer")

    # Merge SMILES data
    sirius_smiles_df = summary_smiles_df.merge(smiles_pivot.reset_index(), on=["filename"], how="outer")

    # Extract classification data (only rank 1; rank is a string after the cast above)
    top_hits = structure_df.loc[structure_df['rank'] == '1', ['filename', 'InChIKey', 'smiles']]
    has_structure = (top_hits['InChIKey'].str.strip() != '') & (top_hits['smiles'].str.strip() != '')
    sirius_class_data = (
        top_hits[has_structure]
        .rename(columns={'smiles': 'SMILES'})
        .assign(tool_name="SIRIUS")
    )

    # Append new classification data
    class_summary_df = pd.concat([class_summary_df, sirius_class_data], ignore_index=True)

    # Append new score data
    smiles_score_df = pd.concat([smiles_score_df, sirius_score_calc_df], ignore_index=True)

    return sirius_inchikey_df, sirius_smiles_df, class_summary_df, smiles_score_df


In [None]:
import pandas as pd
# Input locations for a struc_summary run.
# NOTE: a later cell assigns these same names again, so whichever cell was
# executed last determines the values that are actually in effect.

# MSP file handed to struc_summary as input_msp.
input_msp = r"D:\nist2023\filtered_LC_renamed_id.msp"
# MS-FINDER output folder (forwarded to process_msfinder_output).
msfinder_folder = r"D:\python\structure_hozon_sugukesu\msfinder"
# Directory from which the pickled score-normalization pipelines are loaded.
machine_dir = r"D:\HMT\machine\structure\model_new\model_modify_separate_top5"
# Folder whose sub-folders each hold a SIRIUS structure_candidates.tsv.
sirius_folder = r"D:\nist2023\nistsiriusoutput"
# sirius_folder = r"D:\nist2023\nistsiriusoutput"
# MetFrag output folder (forwarded to process_metfrag_output).
metfrag_folder = r"D:\nist2023\inchikeyfilter\metfrag_output_inchikey"
# Answer sheet loading is disabled in this cell, so answer_df is not defined here.
# answer_df = pd.read_csv(r"D:\nist2023\answer.csv")

In [1]:
# inchikeyfilter
# Input locations for the "inchikeyfilter" data set. These assignments reuse
# the names set by the other configuration cell, so run only the cell you need.
import pandas as pd
# MSP file handed to struc_summary as input_msp.
input_msp = r"D:\nist2023\LC_unique_records_inchikey_renamed_last.msp"
# MS-FINDER output folder (forwarded to process_msfinder_output).
msfinder_folder = r"D:\nist2023\inchikeyfilter\msfinder"
# Directory from which the pickled score-normalization pipelines are loaded.
machine_dir = r"D:\HMT\machine\structure\model_new\model_modify_separate_top5"
# Folder whose sub-folders each hold a SIRIUS structure_candidates.tsv.
sirius_folder = r"D:\nist2023\inchikeyfilter\sirius"
# sirius_folder = r"D:\nist2023\nistsiriusoutput"
# MetFrag output folder (forwarded to process_metfrag_output).
metfrag_folder = r"D:\nist2023\inchikeyfilter\metfrag_output_inchikey"
# Answer sheet; the evaluation cell joins it on its 'Name' column and compares
# candidates against its 'InChIKey14' column.
answer_df = pd.read_csv(r"D:\nist2023\answer_inchikey2023_filtered.csv")

In [6]:
import os
import glob
import re
import joblib
import pandas as pd
from convert_struc_data_type import read_msp_file,extract_compound_and_ionization,convert_to_canonical_smiles,normalize_rank
from tqdm import tqdm
from functools import reduce
from struc_score_normalization import ClippingTransformer 
from msfinder_struc_summary import process_msfinder_output
from sirius_struc_summary import process_sirius_output
from struc_score_calc import predict_and_append, aggregate_probability_with_rank
from metfrag_summary import process_metfrag_output

def struc_summary(input_msp, msfinder_folder, machine_dir, sirius_folder, metfrag_folder):
    """
    Collects the per-tool InChIKey summaries for every compound of an MSP file.

    The MSP file is parsed into (compound, ionization) pairs that seed the
    'filename'/'adduct' tables. Those tables, plus the running classification
    and score tables, are then passed through the MS-FINDER, SIRIUS and MetFrag
    processors, each called with top_n=5.

    Parameters:
        input_msp (str): Path of the MSP file to read.
        msfinder_folder (str): Folder given to process_msfinder_output.
        machine_dir (str): Folder given to every processor as machine_dir.
        sirius_folder (str): Folder given to process_sirius_output.
        metfrag_folder (str): Folder given to process_metfrag_output.

    Returns:
        tuple: (metfrag_inchikey_df, sirius_inchikey_df, msfinder_inchikey_df).
        Note the order: MetFrag first, MS-FINDER last.
    """
    msp_data = read_msp_file(input_msp)
    compound_ionization_data = extract_compound_and_ionization(msp_data)

    # Assign the compound names to the 'filename' column and ionization information to the 'adduct' column.
    # The table is built in one step: growing a DataFrame cell by cell with .at
    # re-allocates on every new row. dtype=object matches the columns of the
    # empty frames this used to start from.
    name_adduct_df = pd.DataFrame(
        [(compound, ionization) for compound, ionization in compound_ionization_data],
        columns=['filename', 'adduct'],
        dtype=object,
    )
    # Independent copies, so a processor changing one table cannot affect another.
    summary_inchikey_df = name_adduct_df.copy()
    summary_smiles_df = name_adduct_df.copy()

    class_summary_df = pd.DataFrame(columns=['filename','tool_name','InChIKey','SMILES'])
    smiles_score_df=pd.DataFrame(columns=['filename',"tool_name",'adduct',"rank","SMILES","normalization_Zscore","normalization_z_score_diff","normalized_rank"])

    # Every tool is merged onto the same base summaries, so each returned
    # InChIKey table carries only that tool's structure columns.
    # msfinder summary
    msfinder_inchikey_df, _msfinder_smiles_df, class_summary_df, smiles_score_df = process_msfinder_output(msfinder_folder, machine_dir, name_adduct_df, summary_inchikey_df, summary_smiles_df, class_summary_df, smiles_score_df, top_n=5)
    # sirius summary
    sirius_inchikey_df, _sirius_smiles_df, class_summary_df, smiles_score_df = process_sirius_output(sirius_folder, machine_dir, name_adduct_df, summary_inchikey_df, summary_smiles_df, class_summary_df, smiles_score_df, top_n=5)
    # metfrag summary
    # NOTE(review): unlike the two calls above, smiles_score_df is passed BEFORE
    # class_summary_df here. Left unchanged because process_metfrag_output's
    # signature is not visible from this file — confirm the order is intended.
    metfrag_inchikey_df, _metfrag_smiles_df, class_summary_df, _metfrag_score_calc_df = process_metfrag_output(metfrag_folder, machine_dir, name_adduct_df, summary_inchikey_df, summary_smiles_df, smiles_score_df,class_summary_df,top_n=5)

    # The cross-tool SMILES merge (outer join of the three SMILES tables on
    # ["filename", "adduct"]) is currently disabled; only the InChIKey tables
    # are returned.
    return metfrag_inchikey_df, sirius_inchikey_df, msfinder_inchikey_df

In [7]:
# Build the per-tool InChIKey tables for the configured inputs.
# struc_summary returns them in the order MetFrag, SIRIUS, MS-FINDER.
metfrag_inchikey_df, sirius_inchikey, msfinder_inchikey = struc_summary(input_msp, msfinder_folder, machine_dir, sirius_folder, metfrag_folder)

  msfinder_output_combined.loc[mask, "score_diff"] = (
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  smiles_score_df = pd.concat([smiles_score_df, msfinder_score_calc_df], ignore_index=True)
  combined_data = pd.concat(data_frames, ignore_index=True)
 22.5807001 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  combined_data.loc[mask, "score_diff"] = (
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
[14:41:00] Explicit valence for atom # 21 N, 4, is greater than permitted
[14:41:00] Explicit v

In [10]:
# Load the delimited text library into a DataFrame (pd.read_table defaults to
# tab-separated input). library_df is not referenced by the other cells shown here.
library_df = pd.read_table(r"D:\python\msemblator2\script\msfinder\coconutandBLEXP.txt")

In [8]:
# metfrag_inchikey_df['filename'] = metfrag_inchikey_df['filename'].astype(str)
# answer_df['Name'] = answer_df['Name'].astype(str)
# merged_df = pd.merge(metfrag_inchikey_df, answer_df, left_on='filename', right_on='Name')
def merge_answer_sheet(df, answer_df):
    """
    Left-joins a tool summary onto the answer sheet by compound name.

    Both join keys are compared as strings, so a numeric 'filename' matches a
    textual 'Name'. The string cast is applied to copies: neither `df` nor
    `answer_df` is modified, which keeps the answer sheet intact when it is
    reused for several tools.

    Parameters:
        df (pd.DataFrame): Tool summary containing a 'filename' column.
        answer_df (pd.DataFrame): Answer sheet containing a 'Name' column.

    Returns:
        pd.DataFrame: Every row of `df` (with 'filename' as str) followed by the
        columns of the matching answer row; unmatched rows hold NaN there.
    """
    # assign() returns a new frame, leaving the caller's objects untouched.
    left = df.assign(filename=df['filename'].astype(str))
    right = answer_df.assign(Name=answer_df['Name'].astype(str))
    return pd.merge(left, right, left_on='filename', right_on='Name', how='left')
def inchikey_match(df, tool_structure_col, answer_inchikey_col,top_rank):
    """
    Flags the rows where a tool's candidate at one rank matches the answer.

    The candidate is read from the column "<tool_structure_col>_structure_<top_rank>";
    only the text before its first '-' is compared with `answer_inchikey_col`.

    Returns:
        pd.Series: 1 where the two values are equal, otherwise 0 (missing
        candidates count as 0).
    """
    candidate_column = f"{tool_structure_col}_structure_{top_rank}"
    # Keep only the leading dash-separated block of each candidate key.
    leading_block = df[candidate_column].str.split('-').str.get(0)
    is_hit = leading_block.eq(df[answer_inchikey_col])
    return is_hit.astype(int)
# Attach the answer sheet to each tool's InChIKey summary.
metfrag_merged = merge_answer_sheet(metfrag_inchikey_df, answer_df)
sirius_merged = merge_answer_sheet(sirius_inchikey, answer_df)
msfinder_merged = merge_answer_sheet(msfinder_inchikey, answer_df)

# For every tool, add TF1..TF5: 1 when the candidate at that rank matches the
# answer's 'InChIKey14' value, otherwise 0.
for _tool, _merged in (
    ('metfrag', metfrag_merged),
    ('sirius', sirius_merged),
    ('msfinder', msfinder_merged),
):
    for rank in range(1,6):
        _merged[f'TF{rank}'] = inchikey_match(_merged, _tool, 'InChIKey14', rank)

# merged_df['TF1'] = (merged_df['sirius_structure_1'].str.split('-').str[0] == merged_df['InChIKey14']).astype(int)
# merged_df['TF2'] = (merged_df['sirius_structure_2'].str.split('-').str[0] == merged_df['InChIKey14']).astype(int)
# merged_df['TF3'] = (merged_df['sirius_structure_3'].str.split('-').str[0] == merged_df['InChIKey14']).astype(int)
# merged_df['TF4'] = (merged_df['sirius_structure_4'].str.split('-').str[0] == merged_df['InChIKey14']).astype(int)
# merged_df['TF5'] = (merged_df['sirius_structure_5'].str.split('-').str[0] == merged_df['InChIKey14']).astype(int)
