# Analysing CSD Organic Molecules

Calculate parameters of interest

In [None]:
import pandas as pd
from wren_code.utils import count_wyks, count_params, count_distinct_wyckoff_letters, return_spacegroup_number
import pandas as pd
from tqdm import tqdm

# Load the table of organic entries; every derived column below is computed
# from its 'wyckoff' column.
df_organic = pd.read_csv("csd_organic.csv")

# Registers `progress_apply` on pandas objects so each pass shows a progress bar.
tqdm.pandas()

# Derived column name -> helper applied to every 'wyckoff' value.
# Dict order is the order in which the columns are appended to the frame.
wyckoff_features = {
    'n_atoms': count_wyks,
    'n_wyk': count_distinct_wyckoff_letters,
    'spg': return_spacegroup_number,
    'n_param': count_params,
}
for feature_name, feature_fn in wyckoff_features.items():
    df_organic[feature_name] = df_organic['wyckoff'].progress_apply(feature_fn)


In [None]:
from ccdc import io, utilities

# Create a CCDC EntryReader on a local CSD sqlite file.
# NOTE(review): absolute, machine-specific path — this cell only runs where the
# CSD is installed at exactly this location.
csd_reader = io.EntryReader('/Applications/CCDC/CSD_2022/DATA/CSD_543/as543be_ASER.sqlite')
# Other entry identifiers, currently disabled:
# entry = csd_reader.entry('CAXZEG')
# entry = csd_reader.entry('ADAGES')

# Fetch one entry by its identifier and take the crystal attached to it.
entry_name = 'WOPTAY'
entry = csd_reader.entry(entry_name)
crystal = entry.crystal

# Print a short summary of the crystal, one attribute per line.
print(f'SMILES: {crystal.molecule.smiles}')
print(f'Crystal System: {crystal.crystal_system}')
print(f'Spacegroup Symbol: {crystal.spacegroup_symbol}')
print(f'Spacegroup Number: {crystal.spacegroup_number_and_setting}')
print(f'Has disorder: {crystal.has_disorder}')
print(f'Disorder details: {entry.disorder_details}')

# One output line per symmetry operator: the operator left-justified in a
# 17-character column, followed by `utilities.print_set` applied to the atoms
# reported on special positions for that operator.
print('\n'.join('%-17s %s' % (op, utilities.print_set(crystal.atoms_on_special_positions(op))) for op in crystal.symmetry_operators))

In [None]:
# Group rows by SMILES string (kept for later inspection; not used in this cell).
smiles_groups = df_organic.groupby('smiles')

# Rows whose SMILES already appeared earlier in the frame (the first occurrence
# of each SMILES is not flagged), ordered by SMILES, with any row containing a
# missing value removed before printing.
print(
    df_organic.loc[df_organic['smiles'].duplicated()]
    .sort_values(by='smiles')
    .dropna()
)

In [None]:
# 'wyckoff' value(s) of the row(s) whose identifier is AFUHAN, as a NumPy array.
df_organic.loc[df_organic['identifier'] == "AFUHAN", 'wyckoff'].values

In [None]:
# Number of distinct 'wyckoff' values; a missing value, if present, counts once.
print(df_organic['wyckoff'].nunique(dropna=False))

In [None]:
# Preview (first 10 rows) of the entries whose n_wyk exceeds 7.
df_organic[df_organic['n_wyk'] > 7].head(n=10)

Number of Wyckoff letters

In [None]:
# Bar chart of how many rows share each n_wyk value, on a log-scaled count axis.
# Bars follow value_counts() order, i.e. the most frequent value comes first.
# Title and axis labels are set so the figure can be read on its own.
df_organic['n_wyk'].value_counts().plot(
    kind='bar',
    log=True,
    title='Number of Wyckoff letters (n_wyk)',
    xlabel='n_wyk',
    ylabel='Number of entries (log scale)',
)

In [None]:
# Bar chart of how many rows share each spg value, on a log-scaled count axis.
# Bars follow value_counts() order, i.e. the most frequent value comes first.
# Title and axis labels are set so the figure can be read on its own.
df_organic['spg'].value_counts().plot(
    kind='bar',
    log=True,
    title='Distribution of spg',
    xlabel='spg',
    ylabel='Number of entries (log scale)',
)

In [None]:
# df_organic.spg.astype(int).plot(kind='bar')

In [None]:
# Histogram of n_atoms itself.
# The earlier version called value_counts() first, which made the histogram
# bin the *frequencies* of each n_atoms value (a distribution of counts)
# rather than the n_atoms values; kind='hist' already does the binning.
df_organic['n_atoms'].plot(kind='hist', title='Distribution of n_atoms')

Tokenize SMILES and Wyckoff representation

In [None]:
from wren_code.utils import tokenize_prototype_label, tokenize_pearson_label
import pandas as pd
from tqdm import tqdm

# Start again from a fresh read of the CSV: `df_organic` is rebound here, so
# columns added to the previous frame by earlier cells are not carried over.
df_organic = pd.read_csv("csd_organic.csv")

# Registers `progress_apply` on pandas objects.
tqdm.pandas()

# Both token columns are derived from the same 'wyckoff' column.
wyckoff_labels = df_organic['wyckoff']
df_organic['prototype'] = wyckoff_labels.progress_apply(tokenize_prototype_label)
df_organic['pearson'] = wyckoff_labels.progress_apply(tokenize_pearson_label)

# Display the resulting frame.
df_organic

In [None]:
import re

# Compiled once instead of on every call. The pattern is a raw string so each
# backslash reaches the regex engine literally; the earlier non-raw literal
# relied on invalid escape sequences such as "\[" and "\(", which Python warns
# about and plans to reject.
# Alternatives, in order: bracket atoms "[...]", B/Br, C/Cl, the remaining
# single-letter atoms (upper and lower case), branch/bond/other punctuation,
# two-digit ring closures "%NN", and single digits.
_SMILES_TOKEN_RE = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)


def tokenize_smiles(smi):
    """
    Tokenize a SMILES molecule or reaction.

    Parameters
    ----------
    smi : str
        SMILES string to split.

    Returns
    -------
    str
        The tokens found in ``smi``, in order, joined by single spaces.
        An empty input gives an empty string.

    Notes
    -----
    Characters that match none of the token alternatives are skipped
    silently, so joining the tokens back together is not guaranteed to
    reproduce ``smi``.
    """
    return ' '.join(_SMILES_TOKEN_RE.findall(smi))

# Keep only rows with no missing value in any column.
# The explicit .copy() makes the result an independent frame, so the column
# assignment below is unambiguous and does not raise SettingWithCopyWarning.
df_organic = df_organic.dropna().copy()

# Space-separated SMILES tokens for every remaining row.
df_organic['smi_tokenized'] = df_organic['smiles'].progress_apply(tokenize_smiles)


In [None]:
# Number of distinct 'pearson' values; a missing value, if present, counts once.
df_organic['pearson'].nunique(dropna=False)

In [None]:
# Target string: "<prototype>: <pearson>".
df_organic['tgt'] = df_organic['prototype'] + ': ' + df_organic['pearson']

# Insert a space between any upper-case letter and a colon that directly
# follows it, so the colon is separated from that letter ("A:" -> "A :").
df_organic['tgt'] = df_organic['tgt'].map(
    lambda target: re.sub('([A-Z]):', r'\1 :', target)
)

# Display the resulting frame.
df_organic

In [None]:
# One row per distinct SMILES (pandas keeps the first occurrence by default).
df_organic = df_organic.drop_duplicates(subset=['smiles'])

# Hold out 10 % of the rows as the test set; the seed makes the draw repeatable.
df_train_and_val = df_organic.sample(frac=0.9, random_state=42)
df_test = df_organic.drop(df_train_and_val.index)

# Split the remaining 90 % again 90/10 into training and validation rows.
df_train = df_train_and_val.sample(frac=0.9, random_state=42)
df_valid = df_train_and_val.drop(df_train.index)

# NOTE(review): absolute, user-specific output directory — adjust before
# running on another machine.
data_path = '/Users/williammccorkindale/ml_physics/smi2wyck/notebooks/data'

# For each split write two single-column files without index or header:
# src-<split>.csv holds the tokenized SMILES, tgt-<split>.csv the target strings.
for split_name, split_frame in (('train', df_train), ('valid', df_valid), ('test', df_test)):
    split_frame.smi_tokenized.to_csv(f'{data_path}/src-{split_name}.csv', index=False, header=False)
    split_frame.tgt.to_csv(f'{data_path}/tgt-{split_name}.csv', index=False, header=False)
