## Notebook 2 - Reference MS2 database

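This notebook builds the MS2 reference database from a MoNA (MassBank of North America) spectral export, keeping only the spectra whose InChIKey first block matches a substrate listed in `./tmp/Substrates_VB.csv`.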

In [None]:
# Execute the shared setup script inside this kernel's namespace.
# NOTE(review): `pd`, `np`, `tqdm` and `filepath_results` are used further down but are never
# imported or defined in this notebook — presumably they come from common.py; confirm.
%run ../common.py

In [2]:
# Load the substrate table. Its `ik_MoNA` column is the key used below both to filter
# the MoNA records and to join the retained spectra back onto the substrates.
df_substrates = pd.read_csv('./tmp/Substrates_VB.csv')

In [3]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
import requests

In [4]:
def inchikey_to_inchi(inchikey, timeout=30):
    """Resolve an InChIKey to its standard InChI string via the NCI CACTUS web service.

    Parameters
    ----------
    inchikey : str
        InChIKey to look up; it is inserted verbatim into the request URL.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one, a stalled
        server would block the notebook kernel indefinitely.

    Returns
    -------
    str
        The response body with surrounding whitespace stripped.

    Raises
    ------
    ValueError
        If the service answers with any HTTP status other than 200.
    requests.exceptions.RequestException
        Propagated unchanged on connection failures or when the timeout expires.
    """
    url = f'https://cactus.nci.nih.gov/chemical/structure/{inchikey}/stdinchi'
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        return response.text.strip()
    raise ValueError(f"Could not retrieve InChI for InChIKey: {inchikey}")

def get_exact_mass_from_inchikey(inchikey):
    """Compute the exact molecular weight of the compound identified by ``inchikey``.

    The InChIKey is first resolved to an InChI string with ``inchikey_to_inchi``
    (a network call), the InChI is parsed with RDKit, and the exact molecular
    weight is computed with ``rdMolDescriptors.CalcExactMolWt``.

    Parameters
    ----------
    inchikey : str
        InChIKey of the compound.

    Returns
    -------
    float
        Value returned by ``rdMolDescriptors.CalcExactMolWt``.

    Raises
    ------
    ValueError
        If the InChI cannot be retrieved, or if RDKit cannot build a molecule from it.
    """
    inchi = inchikey_to_inchi(inchikey)
    molecule = Chem.MolFromInchi(inchi)
    # MolFromInchi signals a parse failure by returning None; fail with a clear
    # message instead of an opaque argument error inside CalcExactMolWt.
    if molecule is None:
        raise ValueError(f"Could not parse InChI for InChIKey: {inchikey}")
    return rdMolDescriptors.CalcExactMolWt(molecule)

In [5]:
# Distinct `ik_MoNA` values of the substrates, kept as a set for fast membership tests.
# parse_text_block compares these against the first hyphen-separated segment of each
# record's InChIKey.
substrates_iks = set(df_substrates['ik_MoNA'])

In [6]:
# Header fields that are stored, mapped to the converter applied to their stripped value.
# Every other header (Synon, Instrument, Collision_energy, MW, Comments, ...) is ignored.
_MSP_FIELD_PARSERS = {
    'Name': str,
    'DB#': str,
    'InChIKey': str,
    'Precursor_type': str,
    'Spectrum_type': str,
    'PrecursorMZ': float,
    'Instrument_type': str,
    'Ion_mode': str,
    'Formula': str,
    'ExactMass': float,
}


def _read_peaks(lines, header_idx, n_peaks):
    """Read the peak lines that follow the ``Num Peaks:`` header at ``header_idx``.

    At most ``n_peaks`` lines are read and reading stops at the end of the block,
    so a truncated record no longer raises ``IndexError``. A peak line that does
    not start with two numeric tokens still raises (``IndexError``/``ValueError``).

    Returns ``(mz, intensity, last_idx)``: two float arrays plus the index of the
    last line consumed (``header_idx`` itself when no peak line was read).
    """
    stop = min(header_idx + 1 + n_peaks, len(lines))
    mz = []
    intensity = []
    for peak_line in lines[header_idx + 1:stop]:
        parts = peak_line.split()
        mz.append(float(parts[0]))
        intensity.append(float(parts[1]))
    last_idx = max(stop - 1, header_idx)
    return np.array(mz, dtype=float), np.array(intensity, dtype=float), last_idx


def _clean_spectrum(mz, intensity, max_peaks, min_peaks, min_relint):
    """Normalise, truncate and sort a peak list; return ``None`` if it is rejected.

    Returns ``(mz, relint)`` sorted by ascending m/z, or ``None`` when the peak list
    is empty, its maximum intensity is zero, or fewer than ``min_peaks`` peaks have a
    relative intensity of at least ``min_relint``.
    """
    # An empty list or an all-zero spectrum cannot be normalised.
    if intensity.size == 0:
        return None
    base_peak = np.max(intensity)
    if base_peak == 0:
        return None

    # Scale so that the most intense peak has relative intensity 1.
    relint = intensity / base_peak

    # Keep only the `max_peaks` most intense peaks.
    if relint.size > max_peaks:
        strongest = np.argsort(relint)[-max_peaks:]
        mz = mz[strongest]
        relint = relint[strongest]

    # Sort from low to high m/z. Sorting is needed by matchms.
    order = np.argsort(mz)
    mz = mz[order]
    relint = relint[order]

    if np.sum(relint >= min_relint) < min_peaks:
        return None
    return mz, relint


def parse_text_block(block, allowed_iks=None, max_peaks=50, min_peaks=5, min_relint=0.02):
    """Parse one MSP record (a blank-line-delimited text block) into a dict.

    Header lines of the form ``Key: value`` listed in ``_MSP_FIELD_PARSERS`` are
    stored under their key. The ``Num Peaks:`` header is followed by one
    whitespace-separated ``mz intensity`` pair per line; intensities are scaled to a
    maximum of 1, only the ``max_peaks`` most intense peaks are kept, and the peaks
    are sorted by ascending m/z.

    The optional filters on ion mode, instrument type, spectrum type and the
    exact-mass cross-check are not applied.

    Parameters
    ----------
    block : str
        Text of a single record.
    allowed_iks : container of str, optional
        Accepted InChIKey first segments. Defaults to the notebook-level
        ``substrates_iks`` when omitted.
    max_peaks : int, optional
        Maximum number of peaks kept, chosen by intensity (default 50).
    min_peaks : int, optional
        Minimum number of peaks with ``relint >= min_relint`` required (default 5).
    min_relint : float, optional
        Relative-intensity threshold used for the ``min_peaks`` check (default 0.02).

    Returns
    -------
    dict or None
        The parsed record with the extra key ``ik_MoNA`` (first hyphen-separated
        segment of the InChIKey) and, when a ``Num Peaks:`` section is present, the
        arrays ``mz`` and ``relint``. ``Num Peaks`` keeps the count declared in the
        record, not the number of peaks retained. ``None`` is returned when the
        record has no InChIKey, its ``ik_MoNA`` is not in ``allowed_iks``, or its
        spectrum is empty, all-zero or has too few significant peaks.
    """
    data = {}
    lines = [raw_line.strip() for raw_line in block.strip().split('\n')]

    i = 0
    while i < len(lines):
        key, sep, value = lines[i].partition(':')
        value = value.strip()
        if sep and key in _MSP_FIELD_PARSERS:
            data[key] = _MSP_FIELD_PARSERS[key](value)
        elif sep and key == 'Num Peaks':
            data['Num Peaks'] = int(value)
            # `i` is moved to the last peak line so the loop resumes right after it.
            mz, intensity, i = _read_peaks(lines, i, data['Num Peaks'])
            spectrum = _clean_spectrum(mz, intensity, max_peaks, min_peaks, min_relint)
            if spectrum is None:
                return None
            data['mz'], data['relint'] = spectrum
        i += 1

    # A record is only useful if it can be linked to a substrate.
    if 'InChIKey' not in data:
        return None
    if allowed_iks is None:
        allowed_iks = substrates_iks

    ik_first_block = data['InChIKey'].split('-')[0]
    if ik_first_block not in allowed_iks:
        return None

    data['ik_MoNA'] = ik_first_block
    return data

def parse_file(file_path, encoding='utf-8'):
    """Parse an MSP file into a list of record dicts.

    The file is read in full, split into records on blank lines, and each record
    is passed to ``parse_text_block``. Records for which it returns ``None`` are
    dropped. Progress is reported with ``tqdm``.

    Parameters
    ----------
    file_path : str or path-like
        Path of the MSP file.
    encoding : str, optional
        Text encoding used to decode the file. It is set explicitly (default
        ``'utf-8'``) so the result does not depend on the platform's locale encoding.

    Returns
    -------
    list of dict
        The records accepted by ``parse_text_block``, in file order.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        content = file.read()

    # Records are separated by an empty line.
    blocks = content.split('\n\n')

    parsed_data = []
    for block in tqdm(blocks, desc="Parsing blocks"):
        parsed_block = parse_text_block(block)
        if parsed_block is not None:
            parsed_data.append(parsed_block)

    return parsed_data

In [7]:
# MoNA LC-MS/MS positive-mode export (.msp); the file name carries the date 26 May 2024.
file_path = '../data/MoNA/MoNA-export-LC-MS-MS_Positive_Mode_26May2024.msp'
# Parse every record, keeping only those accepted by parse_text_block.
parsed_data = parse_file(file_path)

Parsing blocks: 100%|██████████| 99261/99261 [00:11<00:00, 8594.72it/s] 


In [8]:
# One row per retained record; the columns are the keys of the dicts built by parse_text_block.
df_tmp = pd.DataFrame(parsed_data)

In [9]:
# (rows, columns) of the parsed-spectra table.
print(df_tmp.shape)

(6797, 14)


We merge `df_substrates` with the spectra in `df_tmp` using an inner join on `ik_MoNA`, so only substrates with at least one retained reference spectrum are kept:

In [10]:
# Inner join on `ik_MoNA`: attach each retained spectrum (relint, mz, DB#) to the matching
# substrate rows; substrates without a spectrum are dropped.
# NOTE(review): the recorded outputs show more merged rows (7450) than parsed spectra (6797),
# which suggests `ik_MoNA` is not unique in df_substrates — confirm this fan-out is intended.
df = df_substrates.merge(df_tmp[['ik_MoNA', 'relint', 'mz', 'DB#']], on=['ik_MoNA'], how='inner')

In [11]:
# (rows, columns) of the merged substrate/spectrum table.
df.shape

(7450, 20)

In [12]:
# Write the merged table as CSV. `filepath_results` is not defined in this notebook
# (presumably set by common.py — confirm) and is joined by plain string concatenation,
# so it must already end with a path separator.
# Note: `mz` and `relint` hold numpy arrays, which a CSV can only store as their text form.
df.to_csv(filepath_results + 'MS2_database_shifts_162_320_VB.csv', index=False)

In [13]:
# Also save the merged table as a pickle, which keeps the `mz`/`relint` arrays as arrays.
# Only load this file back from a trusted location: unpickling can execute arbitrary code.
df.to_pickle(filepath_results + 'MS2_database_shifts_162_320_VB.pkl')

In [14]:
# Keep the parsed (pre-merge) spectra table as an intermediate CSV.
# Note: `mz` and `relint` hold numpy arrays, which a CSV can only store as their text form.
df_tmp.to_csv('./tmp/parsed_mona.csv', index=False)