# Tokenize data

This notebook tokenizes the SMILES strings, derives the target values of interest (space-group number and symbol), and writes train/validation/test splits.

Tokenize SMILES

In [2]:
import re
import pandas as pd
from tqdm import tqdm

# One alternative per SMILES token type: bracket atoms, Br/Cl and the other
# organic-subset atoms, aromatic atoms, bonds, branches, ring closures, etc.
# Written as a raw string so that regex escapes such as \[ or \( are not
# treated as (invalid) Python string escapes, and compiled once instead of on
# every call. `\\` matches a single literal backslash (SMILES bond direction).
_SMILES_TOKEN_RE = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)


def tokenize_smiles(smi):
    """
    Tokenize a SMILES molecule or reaction.

    Parameters
    ----------
    smi : str
        SMILES string to split into tokens.

    Returns
    -------
    str
        The tokens found in ``smi`` joined by single spaces. Characters that
        match none of the token alternatives are silently omitted, so the
        result is not guaranteed to round-trip to the input.
    """
    return ' '.join(_SMILES_TOKEN_RE.findall(smi))

# Register tqdm's `progress_apply` on pandas objects before it is used below.
tqdm.pandas()

# Load the raw table and discard every row with a missing value in any column.
df_organic = pd.read_csv("csd_organic.csv").dropna()

# Add a column holding the space-separated tokens of each SMILES string.
df_organic['smi_tokenized'] = df_organic['smiles'].progress_apply(tokenize_smiles)


100%|██████████| 361607/361607 [00:06<00:00, 57628.20it/s]


Target: prototype and Pearson symbol only

In [None]:
# Build the target text "<prototype>: <pearson>", then put a space before any
# colon that directly follows an uppercase letter ("A:" -> "A :").
# NOTE(review): this cell has no execution count and the null-count output
# further down lists no 'prototype', 'pearson' or 'tgt' columns -- confirm
# these columns exist in csd_organic.csv before relying on this cell.
df_organic['tgt'] = (
    (df_organic['prototype'] + ': ' + df_organic['pearson'])
    .map(lambda tgt: re.sub('([A-Z]):', r'\1 :', tgt))
)
df_organic

Target: space-group number and symbol only

In [18]:
from ccdc import io

def spacegroup_num_and_str_from_crystal(row, reader: "io.EntryReader | None" = None):
    """
    Attach the space-group number and symbol of a CSD entry to ``row``.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['identifier']``, which is passed to
        ``reader.entry``. Typically a ``pandas.Series`` handed over by
        ``DataFrame.apply(..., axis=1)``. It is modified in place.
    reader : io.EntryReader, optional
        Reader used to look up the entry. When omitted, a new
        ``io.EntryReader()`` is created for this call, so pass a shared reader
        when applying the function to many rows.

    Returns
    -------
    The same ``row`` object with ``'spg_num'`` and ``'spg_str'`` set. Both are
    ``None`` if reading either value from the crystal raises an exception.

    Notes
    -----
    The entry lookup itself is outside the ``try`` block, so a failing
    ``reader.entry`` call still propagates to the caller.
    """
    if reader is None:
        reader = io.EntryReader()

    crystal = reader.entry(row['identifier']).crystal
    try:
        spg_num = crystal.spacegroup_number_and_setting[0]
        spg_str = crystal.spacegroup_symbol
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still stop a long row-wise apply instead of being
        # recorded as a missing space group.
        spg_num = spg_str = None

    row['spg_num'] = spg_num
    row['spg_str'] = spg_str
    return row

def spacegroup_str_from_crystal(csd_entry: str, reader: io.EntryReader = None):
    """
    Look up a CSD entry by identifier and return its space-group symbol.

    ``csd_entry`` is the identifier passed to ``reader.entry``. When ``reader``
    is omitted, a new ``io.EntryReader()`` is created for this call. Any
    exception raised by the lookup propagates to the caller.
    """
    active_reader = io.EntryReader() if reader is None else reader
    return active_reader.entry(csd_entry).crystal.spacegroup_symbol

# One shared reader for the whole pass, so a new EntryReader is not created
# for every row.
csd_reader = io.EntryReader()

# Earlier experiments (a trial on df_organic.head() and one progress_apply per
# output column over 'identifier') were replaced by the single row-wise pass
# below, which looks each entry up once and fills both 'spg_num' and 'spg_str'.
# The per-column variant referred to `spacegroup_num_from_crystal`, which is
# not defined in this notebook.
df_organic = df_organic.progress_apply(
    spacegroup_num_and_str_from_crystal,
    axis=1,
    reader=csd_reader,
)

100%|██████████| 361607/361607 [22:48<00:00, 264.23it/s] 


In [19]:
# Per-column count of missing values; non-zero counts in 'spg_num'/'spg_str'
# are rows where the space-group lookup stored None.
df_organic.isna().sum()

identifier       0
smiles           0
wyckoff          0
smi_tokenized    0
spg_num          1
spg_str          1
dtype: int64

In [24]:
# Discard rows with any missing value, then store the space-group number as an
# integer column (presumably it is not an int dtype while a None is present --
# confirm against the frame's dtypes).
df_organic = df_organic.dropna().astype({'spg_num': int})

In [25]:
# Save the table to CSV without the index column; dropna() first discards any
# row that still has a missing value.
df_organic.dropna().to_csv("csd_organic_tokenized.csv", index=False)

In [12]:
# Spot check: look up a single CSD entry by identifier and display its
# space-group symbol as the cell output.
csd_reader = io.EntryReader()

entry = csd_reader.entry('MTYHFB03')
crystal = entry.crystal
crystal.spacegroup_symbol

'Pb21a'

In [None]:
# NOTE(review): writes the same file as the earlier
# `df_organic.dropna().to_csv(...)` cell, but without the dropna(); this cell
# has no execution count -- confirm whether it is still needed.
df_organic.to_csv("csd_organic_tokenized.csv", index=False)

In [17]:
from pathlib import Path

def write_train_val_test(
    df,
    sub_folder,
    tgt_col: str = 'tgt',
    data_root='/Users/williammccorkindale/ml_physics/smi2wyck/notebooks/data',
):
    """
    Split ``df`` into train/validation/test sets and write source/target files.

    Rows are de-duplicated on ``'smiles'`` (first occurrence kept). 90% of the
    remaining rows are sampled as train+validation and the rest become the test
    set; 90% of train+validation is then sampled as train and the rest becomes
    validation. Both samples use ``random_state=42``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'smiles'``, ``'smi_tokenized'`` and
        ``tgt_col``.
    sub_folder : str
        Directory name under ``data_root`` that receives the six files. It is
        created (including parents) if it does not exist.
    tgt_col : str, default 'tgt'
        Column written to the ``tgt-*.csv`` files.
    data_root : str or path-like, optional
        Root output directory. Defaults to the location this notebook
        originally hard-coded; pass another directory to write elsewhere.

    Returns
    -------
    None
        The result is the files ``src-{train,valid,test}.csv`` (from
        ``'smi_tokenized'``) and ``tgt-{train,valid,test}.csv`` (from
        ``tgt_col``), one value per line, without header or index.
    """
    # Give every row a unique positional label. The splits below are carved
    # out with label-based `drop`, which would remove extra rows if the
    # caller's index contained repeated labels. `sample` picks rows by
    # position, and the index is never written, so this does not change the
    # output for frames whose index is already unique.
    df = df.drop_duplicates(subset=['smiles']).reset_index(drop=True)

    df_train_and_val = df.sample(frac=0.9, random_state=42)
    df_test = df.drop(df_train_and_val.index)

    df_train = df_train_and_val.sample(frac=0.9, random_state=42)
    df_valid = df_train_and_val.drop(df_train.index)

    data_path = Path(f'{data_root}/{sub_folder}')
    data_path.mkdir(parents=True, exist_ok=True)

    splits = (('train', df_train), ('valid', df_valid), ('test', df_test))
    for split_name, split_df in splits:
        split_df['smi_tokenized'].to_csv(
            data_path / f'src-{split_name}.csv', index=False, header=False
        )
        split_df[tgt_col].to_csv(
            data_path / f'tgt-{split_name}.csv', index=False, header=False
        )

# Write one dataset per target column: tokenized SMILES -> space-group number
# ('smi2spgnum') and tokenized SMILES -> space-group symbol ('smi2spgstr').
write_train_val_test(df_organic, sub_folder='smi2spgnum', tgt_col='spg_num')
write_train_val_test(df_organic, sub_folder='smi2spgstr', tgt_col='spg_str')