# Tokenize data

Notebook for performing tokenization of the SMILES strings as well as target values of interest

Tokenize SMILES

In [2]:
import re
import pandas as pd
from tqdm import tqdm

def tokenize_smiles(smi):
    """
    Tokenize a SMILES molecule or reaction.

    The string is scanned left to right with a regular expression and every
    match becomes one token: a whole bracket atom (e.g. ``[NH3+]``), ``Br`` or
    ``B``, ``Cl`` or ``C``, the other single-letter atoms listed in the
    pattern, bond/branch/separator symbols, two-digit ring closures written
    ``%NN`` and single digits.

    Parameters
    ----------
    smi : str
        SMILES string to split.

    Returns
    -------
    str
        The tokens joined by single spaces (``""`` for an empty input).

    Notes
    -----
    Characters that match no alternative of the pattern are skipped by
    ``findall`` and are therefore silently absent from the output.
    """
    # Raw string literal: the same regular expression as before, but without
    # relying on Python keeping unknown escapes such as "\[" or "\(" verbatim
    # (those trigger an invalid-escape warning on recent Python versions).
    # In raw form a literal backslash bond is spelled `\\`.
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"

    regex = re.compile(pattern)
    # The pattern has a single capture group, so findall yields plain strings.
    tokens = regex.findall(smi)
    return ' '.join(tokens)

# Register `progress_apply` on pandas objects so the tokenization below
# reports progress.
tqdm.pandas()

# Load the raw table and discard every row that has a missing value.
df_organic = (
    pd.read_csv("csd_organic.csv")
    .dropna()
)

# One space-separated token string per SMILES.
df_organic['smi_tokenized'] = df_organic['smiles'].progress_apply(tokenize_smiles)


100%|██████████| 361607/361607 [00:06<00:00, 57628.20it/s]


Only prototype & Pearson symbol

In [None]:
# Target string: "<prototype>: <pearson>", with a space inserted between any
# capital letter and a colon that directly follows it ("X:" -> "X :").
# NOTE(review): the 'prototype' and 'pearson' columns do not appear in the
# column listing printed further down this notebook - confirm they exist
# before running this cell.
df_organic['tgt'] = [
    re.sub('([A-Z]):', r'\1 :', joined)
    for joined in df_organic['prototype'] + ': ' + df_organic['pearson']
]
df_organic

Space group number and symbol

In [18]:
from ccdc import io

def spacegroup_num_and_str_from_crystal(row, reader: "io.EntryReader | None" = None):
    """
    Add the space-group number and symbol of an entry to ``row``.

    Parameters
    ----------
    row :
        Object supporting item access (e.g. a DataFrame row). Its
        ``'identifier'`` item names the entry to look up; the items
        ``'spg_num'`` and ``'spg_str'`` are written to it in place.
    reader : io.EntryReader, optional
        Reader used for the lookup. When omitted a new ``io.EntryReader()``
        is created on every call, so pass one shared reader when applying
        this function row by row.

    Returns
    -------
    The same ``row`` object. ``'spg_num'`` is the first element of the
    crystal's ``spacegroup_number_and_setting`` and ``'spg_str'`` its
    ``spacegroup_symbol``; both are ``None`` if reading either attribute
    raised an ``Exception``.

    Notes
    -----
    Errors raised by the entry lookup itself are not caught.
    """
    if reader is None:
        reader = io.EntryReader()

    csd_entry = row['identifier']
    crystal = reader.entry(csd_entry).crystal
    try:
        spg_num = crystal.spacegroup_number_and_setting[0]
        spg_str = crystal.spacegroup_symbol
    except Exception:
        # Only ordinary errors mark the row as missing. A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit, making a long row-wise
        # apply impossible to interrupt.
        spg_num = None
        spg_str = None

    row['spg_num'] = spg_num
    row['spg_str'] = spg_str
    return row

def spacegroup_str_from_crystal(csd_entry: str, reader: io.EntryReader = None):
    """
    Return the space-group symbol of the entry named ``csd_entry``.

    A new ``io.EntryReader()`` is created when ``reader`` is not supplied.
    Nothing is caught here: any error raised by the lookup or by reading the
    symbol propagates to the caller.
    """
    entry_reader = io.EntryReader() if reader is None else reader
    return entry_reader.entry(csd_entry).crystal.spacegroup_symbol

# One reader shared by every row, so the helper below does not create its own
# reader on each call.
csd_reader = io.EntryReader()
# Earlier experiments, kept for reference only.
# NOTE(review): `spacegroup_num_from_crystal` is not defined in the cells shown
# here, so the third commented line would fail if re-enabled.
# df_head = df_organic.head().copy()
# df_head.progress_apply(spacegroup_num_and_str_from_crystal, reader=csd_reader, axis=1)
# df_organic['spg_num'] = df_organic['identifier'].progress_apply(spacegroup_num_from_crystal, reader=csd_reader)
# df_organic['spg_str'] = df_organic['identifier'].progress_apply(spacegroup_str_from_crystal, reader=csd_reader)
# Row-wise pass (axis=1) that adds 'spg_num' and 'spg_str' to each row and
# rebinds df_organic to the result.
df_organic = df_organic.progress_apply(spacegroup_num_and_str_from_crystal, reader=csd_reader, axis=1)

100%|██████████| 361607/361607 [22:48<00:00, 264.23it/s] 


In [19]:
# Number of missing values per column; rows where the space-group lookup
# failed show up under spg_num / spg_str.
df_organic.isna().sum()

identifier       0
smiles           0
wyckoff          0
smi_tokenized    0
spg_num          1
spg_str          1
dtype: int64

In [24]:
# Discard rows with any missing value, then store the space-group number as
# an integer column.
df_organic = df_organic.dropna()
df_organic['spg_num'] = df_organic['spg_num'].astype(int)

In [25]:
# Persist the tokenized table without the index column; rows with missing
# values are dropped again just before writing.
df_organic.dropna().to_csv("csd_organic_tokenized.csv", index=False)

In [12]:
# Spot check: look up one entry by identifier and display its space-group
# symbol. Note that this rebinds the module-level `csd_reader`, `entry` and
# `crystal` names.
csd_reader = io.EntryReader()

entry = csd_reader.entry('MTYHFB03')
crystal = entry.crystal
crystal.spacegroup_symbol

'Pb21a'

In [None]:
# NOTE(review): writes the same file as the earlier export cell, this time
# without a dropna() - looks redundant; confirm which export is intended.
df_organic.to_csv("csd_organic_tokenized.csv", index=False)

In [6]:
import os 
import yaml
from pathlib import Path


def write_default_preprocess_yaml(data_path):
    """
    Write ``<data_path>/preprocess.yaml`` with the default preprocessing setup.

    The file records the vocabulary paths and the train/valid source and
    target CSV paths, all located directly under ``data_path``.

    Parameters
    ----------
    data_path :
        Directory (used as a path prefix) that holds the split files and
        receives the YAML file.

    Returns
    -------
    None
    """
    train_files = {
        'path_src': f'{data_path}/src-train.csv',
        'path_tgt': f'{data_path}/tgt-train.csv',
    }
    valid_files = {
        'path_src': f'{data_path}/src-valid.csv',
        'path_tgt': f'{data_path}/tgt-valid.csv',
    }
    config = {
        'save_data': f'{data_path}',
        'src_vocab': f'{data_path}/vocab.src',
        'tgt_vocab': f'{data_path}/vocab.tgt',
        'overwrite': True,
        'n_sample': -1,
        'share_vocab': False,
        'data': {
            'train': train_files,
            'valid': valid_files,
        },
    }

    with open(f'{data_path}/preprocess.yaml', 'w') as handle:
        yaml.dump(config, handle)

def write_default_training_yaml(data_path, dataset_name,
                                model_root='/rds-d2/user/wjm41/hpc-work/models/smi2wyk'):
    """
    Write ``<data_path>/train_single.yaml`` with the default training setup.

    Parameters
    ----------
    data_path :
        Directory (used as a path prefix) that holds the vocabulary and the
        train/valid split files, and that receives the YAML file.
    dataset_name : str
        Name of the dataset; used as the sub-directory of ``model_root`` in
        which checkpoints are saved.
    model_root : str, optional
        Directory under which ``<dataset_name>/model`` is placed for the
        ``save_model`` entry. Defaults to the location that was previously
        hard-coded, so existing callers are unaffected; pass another
        directory when running on a different machine.

    Returns
    -------
    None

    Notes
    -----
    The option names presumably target an OpenNMT-py style trainer - confirm
    against the training command that consumes this file.
    """
    training_file_name = f'{data_path}/train_single.yaml'

    training_dict = dict(
        # Data and vocabulary locations.
        save_data = f'{data_path}',
        src_vocab = f'{data_path}/vocab.src',
        tgt_vocab = f'{data_path}/vocab.tgt',

        share_vocab = False,
        data = dict(
            train = dict(
                path_src = f'{data_path}/src-train.csv',
                path_tgt = f'{data_path}/tgt-train.csv',
            ),
            valid = dict(
                path_src = f'{data_path}/src-valid.csv',
                path_tgt = f'{data_path}/tgt-valid.csv',
            )
        ),

        # Checkpointing and schedule.
        save_model = f'{model_root}/{dataset_name}/model',
        save_checkpoint_steps = 2500,
        keep_checkpoint = 2,
        seed = 42,
        train_steps = 500000,
        valid_steps = 5000,
        warmup_steps = 8000,
        report_every = 1000,

        # Model architecture.
        decoder_type = 'transformer',
        encoder_type = 'transformer',
        word_vec_size = 256,
        rnn_size = 256,
        layers = 4,
        transformer_ff = 2048,
        heads = 8,
        global_attention = 'general',
        global_attention_function = 'softmax',
        self_attn_type = 'scaled-dot',

        # Optimisation.
        accum_count = 4,
        optim = 'adam',
        adam_beta1 = 0.9,
        adam_beta2 = 0.998,
        decay_method = 'noam',
        learning_rate = 2.0,
        max_grad_norm = 0.0,

        batch_size = 1024,
        batch_type = 'tokens',
        normalization = 'tokens',
        dropout = 0.1,
        label_smoothing = 0.0,

        max_generator_batches = 32,

        # These two flags are deliberately kept as the strings 'true', exactly
        # as before, so the emitted YAML is unchanged.
        param_init = 0.0,
        param_init_glorot = 'true',
        position_encoding = 'true',

        world_size = 1,
        gpu_ranks = [0],
    )

    with open(training_file_name, 'w') as f:
        yaml.dump(training_dict, f)

def write_train_val_test(df, dataset_name, tgt_col: str = 'tgt', data_dir=None):
    """
    Split ``df`` into train/valid/test and write the split files plus configs.

    Rows with a duplicated ``'smiles'`` value are removed first. 90% of the
    remaining rows are sampled (seed 42) as train+valid and the rest becomes
    the test set; 90% of train+valid (seed 42) becomes train and the rest
    valid. For every split three header-less, index-less CSV files are written
    to ``<data_dir>/<dataset_name>``: ``src-<split>.csv`` (``smi_tokenized``),
    ``id-<split>.csv`` (``identifier``) and ``tgt-<split>.csv`` (``tgt_col``).
    Finally the default preprocessing and training YAML files are written to
    the same directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``smiles``, ``smi_tokenized``, ``identifier``
        and ``tgt_col``.
    dataset_name : str
        Name of the output sub-directory; also forwarded to the training
        config writer.
    tgt_col : str, optional
        Column written as the target. Defaults to ``'tgt'``.
    data_dir : str or path-like, optional
        Directory under which ``<dataset_name>`` is created. Defaults to
        ``data`` inside the parent of the current working directory, which is
        the location used before this parameter existed.

    Returns
    -------
    None

    Notes
    -----
    The splits are separated with ``DataFrame.drop`` on index labels, so the
    index of ``df`` is assumed to be unique - TODO confirm for frames that
    were not freshly loaded.
    """
    df = df.drop_duplicates(subset=['smiles'])
    df_train_and_val = df.sample(frac=0.9, random_state=42)
    df_test = df.drop(df_train_and_val.index)

    df_train = df_train_and_val.sample(frac=0.9, random_state=42)
    df_valid = df_train_and_val.drop(df_train.index)

    if data_dir is None:
        data_dir = str(Path(os.getcwd()).parents[0])+'/data'

    data_path = f'{data_dir}/{dataset_name}'
    Path(data_path).mkdir(parents=True, exist_ok=True)

    # One loop replaces nine copy-pasted to_csv calls: file prefix -> column.
    splits = {'train': df_train, 'valid': df_valid, 'test': df_test}
    columns = {'src': 'smi_tokenized', 'id': 'identifier', 'tgt': tgt_col}
    for split_name, split_df in splits.items():
        for prefix, column in columns.items():
            split_df[column].to_csv(
                f'{data_path}/{prefix}-{split_name}.csv', index=False, header=False
            )

    write_default_preprocess_yaml(data_path)
    write_default_training_yaml(data_path, dataset_name = dataset_name)

In [4]:
import pandas as pd

# Reload the tokenized table written earlier in this notebook.
df_organic = pd.read_csv('csd_organic_tokenized.csv')

# Character-level tokenization of the space-group symbol, e.g. 'P21/c' -> 'P 2 1 / c'.
df_organic['spg_str_tokenized'] = [
    ' '.join(symbol) for symbol in df_organic['spg_str']
]
df_organic

Unnamed: 0,identifier,smiles,wyckoff,smi_tokenized,spg_num,spg_str,spg_str_tokenized
0,AABHTZ,CC(=O)NN1C=NN=C1N(N=Cc1c(Cl)cccc1Cl)C(C)=O,A13B2C12D6E2_aP70_2_13i_2i_12i_6i_2i:C-Cl-H-N-O,C C ( = O ) N N 1 C = N N = C 1 N ( N = C c 1 ...,2,P-1,P - 1
1,AACFAZ10,COC1=C(C(OC1=O)c1ccccc1Cl)C(C)=NN=C(C)C1=C(OC)...,A13BC11DE3_oP232_60_13d_d_11d_d_3d:C-Cl-H-N-O,C O C 1 = C ( C ( O C 1 = O ) c 1 c c c c c 1 ...,60,Pbcn,P b c n
2,AACMHX10,CC(=O)OC(=C1CCCCC1c1ccccc1)c1ccccc1,A21B22C2_oP360_61_21c_22c_2c:C-H-O,C C ( = O ) O C ( = C 1 C C C C C 1 c 1 c c c ...,61,Pbca,P b c a
3,AADAMC,[Br-].[NH3+]C1(C2CC3CC(C2)CC1C3)C(O)=O,AB11C18DE2_mP132_14_e_11e_18e_e_2e:Br-C-H-N-O,[Br-] . [NH3+] C 1 ( C 2 C C 3 C C ( C 2 ) C C...,14,P21/c,P 2 1 / c
4,AADMPY10,Cc1[nH+]c(N)nc(N)c1C12CC3CC(CC(C3)C1)C2.CCS(=O...,A17B28C4D3E_aP106_2_17i_28i_4i_3i_i:C-H-N-O-S,C c 1 [nH+] c ( N ) n c ( N ) c 1 C 1 2 C C 3 ...,2,P-1,P - 1
...,...,...,...,...,...,...,...
361601,ZUWRIS01,Oc1ccccc1C=NNS(=O)(=O)c1ccccc1,A13B12C2D3E_mP124_14_13e_12e_2e_3e_e:C-H-N-O-S,O c 1 c c c c c 1 C = N N S ( = O ) ( = O ) c ...,14,P21/c,P 2 1 / c
361602,ZZZDTW01,OC(=O)CC(O)(CC(O)=O)C(=O)[O-].[NH4+],A6B11CD7_aP50_2_6i_ac10i_i_7i:C-H-N-O,O C ( = O ) C C ( O ) ( C C ( O ) = O ) C ( = ...,2,P-1,P - 1
361603,ZZZJCQ04,c1ccc(cc1)N(c1ccccc1)c1ccccc1.c1ccc(cc1)N(c1cc...,A18B15C_mC544_9_72a_60a_4a:C-H-N,c 1 c c c ( c c 1 ) N ( c 1 c c c c c 1 ) c 1 ...,9,Cc,C c
361604,ZZZPTQ01,[NH3+]CC(=O)NCC(=O)NCC(=O)[O-].O.O,A2B5CD2_oP120_29_6a_15a_3a_6a:C-H-N-O,[NH3+] C C ( = O ) N C C ( = O ) N C C ( = O )...,29,Pca21,P c a 2 1


In [7]:
# One dataset per target representation: (output directory name, target column).
for dataset_name, tgt_col in (
    ('smi2spgnum', 'spg_num'),
    ('smi2spgstr', 'spg_str'),
    ('smi2spgstrtok', 'spg_str_tokenized'),
):
    write_train_val_test(df_organic, dataset_name=dataset_name, tgt_col=tgt_col)
