# Analysing CCSD Organic Molecules

Calculate parameters of interest

In [None]:
import pandas as pd
from wren_code.utils import count_wyks, count_params, count_distinct_wyckoff_letters, return_spacegroup_number
import pandas as pd
from tqdm import tqdm

df_organic = pd.read_csv("csd_organic.csv")

tqdm.pandas()
df_organic['n_atoms'] = df_organic['wyckoff'].progress_apply(count_wyks)
df_organic['n_wyk'] = df_organic['wyckoff'].progress_apply(count_distinct_wyckoff_letters)
df_organic['spg'] = df_organic['wyckoff'].progress_apply(return_spacegroup_number)
df_organic['n_param'] = df_organic['wyckoff'].progress_apply(count_params)


In [None]:
from ccdc import io, utilities

csd_reader = io.EntryReader('/Applications/CCDC/CSD_2022/DATA/CSD_543/as543be_ASER.sqlite')
# entry = csd_reader.entry('CAXZEG')
# entry = csd_reader.entry('ADAGES')

entry_name = 'WOPTAY'
entry = csd_reader.entry(entry_name)
crystal = entry.crystal

print(f'SMILES: {crystal.molecule.smiles}')
print(f'Crystal System: {crystal.crystal_system}')
print(f'Spacegroup Symbol: {crystal.spacegroup_symbol}')
print(f'Spacegroup Number: {crystal.spacegroup_number_and_setting}')
print(f'Has disorder: {crystal.has_disorder}')
print(f'Disorder details: {entry.disorder_details}')

print('\n'.join('%-17s %s' % (op, utilities.print_set(crystal.atoms_on_special_positions(op))) for op in crystal.symmetry_operators))

In [None]:
smiles_groups = df_organic.groupby('smiles')
print(df_organic.query('smiles.duplicated()').sort_values(by='smiles').dropna())

In [None]:
df_organic.query('identifier=="AFUHAN"').wyckoff.values

In [None]:
print(len(df_organic.wyckoff.unique()))

In [None]:
df_organic.query('n_wyk > 7').head(n=10)

Number of wyckoff letters

In [None]:
df_organic.n_wyk.value_counts().plot(kind='bar', log=True)

In [None]:
df_organic.spg.value_counts().plot(kind='bar', log=True)

In [None]:
# df_organic.spg.astype(int).plot(kind='bar')

In [None]:
df_organic.n_atoms.value_counts().plot(kind='hist')

Tokenize SMILES and wyckoff rep

In [None]:
from wren_code.utils import tokenize_prototype_label, tokenize_pearson_label
import pandas as pd
from tqdm import tqdm

df_organic = pd.read_csv("csd_organic.csv")

tqdm.pandas()
df_organic['prototype'] = df_organic['wyckoff'].progress_apply(tokenize_prototype_label)
df_organic['pearson'] = df_organic['wyckoff'].progress_apply(tokenize_pearson_label)
df_organic

In [None]:
import re

def tokenize_smiles(smi):
    """
    Tokenize a SMILES molecule or reaction 
    """
    pattern = "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"

    regex = re.compile(pattern)
    tokens = [token for token in regex.findall(smi)]
    smi_tokenized = ' '.join(tokens)
    return smi_tokenized

df_organic = df_organic.dropna()

df_organic['smi_tokenized'] = df_organic['smiles'].progress_apply(tokenize_smiles)


In [None]:
len(df_organic.pearson.unique())

In [16]:
df_organic['tgt'] = df_organic['prototype'] + ': ' + df_organic['pearson']
df_organic['tgt'] = [re.sub('([A-Z]):', r'\1 :', tgt) for tgt in df_organic['tgt']]
df_organic

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,identifier,smiles,wyckoff,prototype,pearson,smi_tokenized,tgt
0,AABHTZ,CC(=O)NN1C=NN=C1N(N=Cc1c(Cl)cccc1Cl)C(C)=O,A13B2C12D6E2_aP70_2_13i_2i_12i_6i_2i:C-Cl-H-N-O,A 13 B 2 C 12 D 6 E 2,a P 70,C C ( = O ) N N 1 C = N N = C 1 N ( N = C c 1 ...,A 13 B 2 C 12 D 6 E 2 : a P 70
1,AACFAZ10,COC1=C(C(OC1=O)c1ccccc1Cl)C(C)=NN=C(C)C1=C(OC)...,A13BC11DE3_oP232_60_13d_d_11d_d_3d:C-Cl-H-N-O,A 13 B C 11 D E 3,o P 232,C O C 1 = C ( C ( O C 1 = O ) c 1 c c c c c 1 ...,A 13 B C 11 D E 3 : o P 232
2,AACMHX10,CC(=O)OC(=C1CCCCC1c1ccccc1)c1ccccc1,A21B22C2_oP360_61_21c_22c_2c:C-H-O,A 21 B 22 C 2,o P 360,C C ( = O ) O C ( = C 1 C C C C C 1 c 1 c c c ...,A 21 B 22 C 2 : o P 360
3,AADAMC,[Br-].[NH3+]C1(C2CC3CC(C2)CC1C3)C(O)=O,AB11C18DE2_mP132_14_e_11e_18e_e_2e:Br-C-H-N-O,A B 11 C 18 D E 2,m P 132,[Br-] . [NH3+] C 1 ( C 2 C C 3 C C ( C 2 ) C C...,A B 11 C 18 D E 2 : m P 132
4,AADMPY10,Cc1[nH+]c(N)nc(N)c1C12CC3CC(CC(C3)C1)C2.CCS(=O...,A17B28C4D3E_aP106_2_17i_28i_4i_3i_i:C-H-N-O-S,A 17 B 28 C 4 D 3 E,a P 106,C c 1 [nH+] c ( N ) n c ( N ) c 1 C 1 2 C C 3 ...,A 17 B 28 C 4 D 3 E : a P 106
...,...,...,...,...,...,...,...
363489,ZOYFUR,CC#N.[BH]1234[BH]567[BH]891[BH]1%102[BH]2%113[...,A6B2C15D3_mC208_12_4i4j_2j_4i13j_3j:B-C-H-N,A 6 B 2 C 15 D 3,m C 208,C C # N . [BH] 1 2 3 4 [BH] 5 6 7 [BH] 8 9 1 [...,A 6 B 2 C 15 D 3 : m C 208
363490,ZOYGOM,CC(C)=[NH+]N=C(C)C.[BH]1234[BH]567[BH]891[BH]1...,A6B6C19D2_aP66_2_6i_6i_19i_2i:B-C-H-N,A 6 B 6 C 19 D 2,a P 66,C C ( C ) = [NH+] N = C ( C ) C . [BH] 1 2 3 4...,A 6 B 6 C 19 D 2 : a P 66
363491,ZOYGUS,CO.[NH3+]N.[NH3+]N.[NH3+]N.[NH3+]N.CO.[BH]1234...,A12BC26D4E_oP352_29_24a_2a_52a_8a_2a:B-C-H-N-O,A 12 B C 26 D 4 E,o P 352,C O . [NH3+] N . [NH3+] N . [NH3+] N . [NH3+] ...,A 12 B C 26 D 4 E : o P 352
363499,ZZZDTW01,OC(=O)CC(O)(CC(O)=O)C(=O)[O-].[NH4+],A6B11CD7_aP50_2_6i_ac10i_i_7i:C-H-N-O,A 6 B 11 C D 7,a P 50,O C ( = O ) C C ( O ) ( C C ( O ) = O ) C ( = ...,A 6 B 11 C D 7 : a P 50


In [17]:
df_organic = df_organic.drop_duplicates(subset=['smiles'])
df_train_and_val = df_organic.sample(frac=0.9, random_state=42)
df_test = df_organic.drop(df_train_and_val.index)

df_train = df_train_and_val.sample(frac=0.9, random_state=42)
df_valid = df_train_and_val.drop(df_train.index)

data_path = '/Users/williammccorkindale/ml_physics/smi2wyck/notebooks/data'
df_train.smi_tokenized.to_csv(f'{data_path}/src-train.csv', index=False, header=False)
df_train.tgt.to_csv(f'{data_path}/tgt-train.csv', index=False, header=False)

df_valid.smi_tokenized.to_csv(f'{data_path}/src-valid.csv', index=False, header=False)
df_valid.tgt.to_csv(f'{data_path}/tgt-valid.csv', index=False, header=False)

df_test.smi_tokenized.to_csv(f'{data_path}/src-test.csv', index=False, header=False)
df_test.tgt.to_csv(f'{data_path}/tgt-test.csv', index=False, header=False)
