# Large-scale database with CCS by PACCS

In [1]:
import sys
# Add the parent directory to the module search path ahead of the PACCS imports below.
sys.path.append("..")
import sqlite3
# NOTE(review): none of pd, np, Chem, AllChem, RDLogger, ThreadPool, smilesPA, SmilesMW or
# PACCS_predict_woeccs is imported explicitly in this notebook; they presumably enter the
# namespace through these wildcard imports — confirm against the PACCS modules.
from PACCS.VoxelProjectedArea import *
from PACCS.MZ import *
from PACCS.Prediction import *
# Disable the log channels matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

## Molecule filtering

### Load the database of molecules for which CCS values are to be predicted.

In [2]:
# Read the example database and extract its SMILES column as a plain Python list.
filename = './data/example_database.csv'
data = pd.read_csv(filename)
dsmiles = data['SMILES'].tolist()

### CCS prediction is restricted to organic molecules that contain both C and H. In addition, PACCS cannot make a prediction for molecules that contain ionic bonds or any element other than C, H, O, N, P, S, F, Cl, Br, I, As, and Se. Molecules for which conformer generation with ETKDG and MMFF94 fails are also excluded.

In [3]:
# Build `filtered_molecules`: the SMILES from `dsmiles` that pass every filter below.
filtered_molecules = []

allowed_elements = {'C', 'H', 'O', 'N', 'P', 'S', 'F', 'Cl', 'Br', 'I', 'As', 'Se'}

# The embedding settings are the same for every molecule, so build them once.
ps = AllChem.ETKDGv3()
ps.randomSeed = -1
ps.maxAttempts = 1
ps.numThreads = 0
ps.useRandomCoords = True

for smi in dsmiles:
    # A missing value is read from CSV as a float NaN; `'.' in smi` would raise TypeError on it.
    if not isinstance(smi, str):
        print(f"Filtered out due to missing or non-text SMILES: {smi}")
        continue

    # SMILES strings containing '.' were excluded
    if '.' in smi:
        print(f"Filtered out due to dot in SMILES: {smi}")
        continue

    iMol = Chem.MolFromSmiles(smi)
    if iMol is None:
        print(f"The molecular object could not be generated from the SMILES: {smi}")
        continue

    # molecules containing prohibited elements were excluded
    if any(atom.GetSymbol() not in allowed_elements for atom in iMol.GetAtoms()):
        print(f"Filtered out due to presence of disallowed elements in SMILES: {smi}")
        continue

    iMol3D = Chem.AddHs(iMol)

    # Only molecules containing both C and H are kept. Check this before the
    # (much more expensive) conformer generation; the outcome is the same.
    symbols = {atom.GetSymbol() for atom in iMol3D.GetAtoms()}
    if 'C' not in symbols or 'H' not in symbols:
        print(f"Filtered out due to absence of C or H in SMILES: {smi}")
        continue

    # molecules with unsuccessful conformer generation using ETKDG and MMFF94 are excluded
    try:
        conf_ids = AllChem.EmbedMultipleConfs(iMol3D, numConfs=1, params=ps)
    except Exception as e:
        print(f"Conformer generation failed for SMILES due to ETKDG: {smi}, Error: {e}")
        continue
    # A failed embedding does not raise: it is reported as an empty list of conformer IDs.
    if len(conf_ids) == 0:
        print(f"Conformer generation failed for SMILES due to ETKDG: {smi}, Error: no conformer was embedded")
        continue

    try:
        mmff_results = AllChem.MMFFOptimizeMoleculeConfs(iMol3D, numThreads=0)
    except Exception as e:
        print(f"Conformer optimization failed for SMILES due to MMFF94: {smi}, Error: {e}")
        continue
    # Each result is a (status, energy) pair; a status of -1 means the MMFF94
    # force field could not be set up for this molecule.
    if any(status == -1 for status, _energy in mmff_results):
        print(f"Conformer optimization failed for SMILES due to MMFF94: {smi}, Error: force field could not be set up")
        continue

    filtered_molecules.append(smi)

Filtered out due to presence of disallowed elements in SMILES: [Ca++]
Filtered out due to presence of disallowed elements in SMILES: [Mg++]
Filtered out due to presence of disallowed elements in SMILES: [K+]
Filtered out due to presence of disallowed elements in SMILES: [Na+]
Filtered out due to presence of disallowed elements in SMILES: [Cr+3]
Filtered out due to presence of disallowed elements in SMILES: [H][C@@](C)(CNC(=O)CC[C@]1(C)[C@@H](CC(N)=O)[C@@]2([H])N([Co]C#N)\C1=C(C)/C1=N/C(=C\C3=N\C(=C(C)/C4=N[C@]2(C)[C@@](C)(CC(N)=O)[C@@H]4CCC(N)=O)\[C@@](C)(CC(N)=O)[C@@H]3CCC(N)=O)/C(C)(C)[C@@H]1CCC(N)=O)OP(O)(=O)O[C@@H]1[C@@H](CO)O[C@@H]([C@@H]1O)N1C=NC2=CC(C)=C(C)C=C12
Filtered out due to presence of disallowed elements in SMILES: [Co++]
Filtered out due to presence of disallowed elements in SMILES: [Cu++]
Filtered out due to presence of disallowed elements in SMILES: [Fe++]
Filtered out due to presence of disallowed elements in SMILES: CC1=C(CCC(O)=O)C2=CC3=[N]4C(=CC5=C(C)C(C=C)=C6C=C

## CCS prediction

### Calculation of the voxel projected area and m/z

In [4]:
# Run smilesPA over every retained SMILES on 16 worker threads. The context manager
# releases the pool even if a worker raises; map() blocks until all results are in.
with ThreadPool(16) as pool:
    projected_areas = pool.map(smilesPA, filtered_molecules)

# Average along axis 1, giving one value per molecule
# (presumably the mean over repeated projected-area calculations — confirm in smilesPA).
vpa = np.mean(projected_areas, axis=1)

[21:35:53] UFFTYPER: Unrecognized charge state for atom: 1
[21:35:53] UFFTYPER: Unrecognized charge state for atom: 1
[21:36:06] UFFTYPER: Unrecognized charge state for atom: 1
[21:36:07] UFFTYPER: Unrecognized charge state for atom: 1


In [5]:
# One row per molecule: SMILES, averaged projected area, and a default adduct label.
df = pd.DataFrame({'SMILES': filtered_molecules, 'vpa': vpa})
df['Adduct'] = '[M+H]+'

# Every molecule is expanded into one row per adduct type, in this order.
adduct_types = ['[M+H]+', '[M+Na]+', '[M-H]-']
num_repeats = len(adduct_types)

# Repeat each SMILES/vpa value num_repeats times consecutively and cycle the adduct
# labels alongside, so rows 3k, 3k+1, 3k+2 describe the same molecule k.
df_repeated = pd.DataFrame({
    'SMILES': df['SMILES'].values.repeat(num_repeats),
    'vpa': df['vpa'].values.repeat(num_repeats),
    'Adduct': adduct_types * len(df),
})

In [6]:
# Compute m/z for every (SMILES, adduct) row and store it as a new column.
dsmiles, dadduct = df_repeated['SMILES'], df_repeated['Adduct']
mz = SmilesMW(dsmiles, dadduct)
df_repeated['mz'] = mz

# Save the table (SMILES, vpa, Adduct, mz) without the index column.
df_repeated.to_csv('./data/example_database_vpa_mz.csv', index=False)

100%|████████████████████████████████████████████████████████████████████████████| 5751/5751 [00:00<00:00, 6752.38it/s]


### Perform CCS prediction

In [7]:
# Model weights, the input table to predict on, and where the predictions are written.
model_path = '../model/model.pt'
input_path = './data/example_database_vpa_mz.csv'
output_path = './data/example_database_prediction.csv'

PACCS_predict_woeccs(input_path, model_path, output_path)

## Read data :  5751
## All Atoms :  ['As', 'Br', 'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S', 'Se']
## All Adduct types:  ['[M+H]+', '[M+Na]+', '[M-H]-']
Test length : 5751


100%|██████████████████████████████████████████████████████████████████████████████| 5751/5751 [08:32<00:00, 11.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5751/5751 [00:33<00:00, 170.67it/s]

## CCS prediction has been completed





## Database generation

### Store comprehensive information for molecules in the SQLite database

In [8]:
# Load the prediction results and store them as table 'PACCS' in the SQLite file,
# replacing any existing table of that name.
data = pd.read_csv(output_path)

conn = sqlite3.connect('./data/CCS database.db')
try:
    data.to_sql('PACCS', conn, if_exists='replace', index=False)
finally:
    # Close the connection even if the write fails, so the database file is not left open.
    conn.close()

### Access and view the SQLite database

In [9]:
# Open the database, list its tables, and print the contents of the first one.
conn = sqlite3.connect('./data/CCS database.db')
try:
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    table_names = pd.read_sql(query, conn)['name'].tolist()

    if not table_names:
        print("No tables are available in the SQLite database")
    else:
        default_table = table_names[0]
        # Quote the identifier (doubling embedded quotes) so table names containing
        # spaces or other special characters still form a valid query.
        quoted_table = '"' + default_table.replace('"', '""') + '"'
        df = pd.read_sql(f"SELECT * FROM {quoted_table}", conn)
        print(df)
finally:
    # Close the connection even if a query fails.
    conn.close()

                                                 SMILES   Adduct         vpa  \
0                           CN1C=NC(C[C@H](N)C(O)=O)=C1   [M+H]+  104.466667   
1                           CN1C=NC(C[C@H](N)C(O)=O)=C1  [M+Na]+  104.466667   
2                           CN1C=NC(C[C@H](N)C(O)=O)=C1   [M-H]-  104.466667   
3                                                 NCCCN   [M+H]+   60.400000   
4                                                 NCCCN  [M+Na]+   60.400000   
...                                                 ...      ...         ...   
5746  [H][C@@]12CCC(C(=O)CO)[C@@]1(C)CC(=O)[C@@]1([H...  [M+Na]+  201.533333   
5747  [H][C@@]12CCC(C(=O)CO)[C@@]1(C)CC(=O)[C@@]1([H...   [M-H]-  201.533333   
5748  [H][C@@]12CC[C@](O)(C(C)=O)[C@@]1(C)C[C@@H](O)...   [M+H]+  201.966667   
5749  [H][C@@]12CC[C@](O)(C(C)=O)[C@@]1(C)C[C@@H](O)...  [M+Na]+  201.966667   
5750  [H][C@@]12CC[C@](O)(C(C)=O)[C@@]1(C)C[C@@H](O)...   [M-H]-  201.966667   

              mz  Predicted CCS  
0    