# Group data by space group

This notebook splits the CSD dataset by space group (common vs. uncommon), tokenizes the files, and writes the weighted training YAML files.

Tokenize SMILES

In [1]:
import re
import pandas as pd 
from pathlib import Path

# The data folder lives two directory levels above the current working directory.
data_dir = str(Path.cwd().parents[1]) + '/data'

# Load the tokenized organic CSD table and discard every row that has a missing value.
organic_csv = f"{data_dir}/csd_organic_tokenized.csv"
df_organic = pd.read_csv(organic_csv).dropna()


In [2]:
dataset_name = 'smi2spgnum'

# The id files are read without a header row, so the single column is named 'id' explicitly.
ids_dir = f"{data_dir}/{dataset_name}"
df_train_ids, df_valid_ids = (
    pd.read_csv(f"{ids_dir}/id-{split}.csv", header=None, names=['id'])
    for split in ('train', 'valid')
)

df_train_ids

Unnamed: 0,id
0,VUCKIQ
1,JUTSUM
2,RAVBEV
3,DUTQAN
4,TUBVUJ
...,...
269465,AQIPAU
269466,IMIFER
269467,HEXYIV
269468,RAGPAQ


In [3]:
# Keep only the rows whose identifier is listed in the corresponding id table.
in_train = df_organic['identifier'].isin(df_train_ids['id'])
in_valid = df_organic['identifier'].isin(df_valid_ids['id'])

df_train = df_organic[in_train]
df_valid = df_organic[in_valid]


In [4]:
from smi2wyk.transformer_utils import write_train_val_test, submit_training_job, write_tokenized_dataframe

In [5]:
import os
# Space group numbers routed to the "common" folder; rows with any other
# spg_num value go to the "uncommon" folder.
common_spacegroups = [14, 19, 4, 2, 61, 33]

# The output location does not depend on the split, so resolve it once
# here instead of recomputing it on every loop iteration.
data_dir = str(Path(os.getcwd()).parents[1])+'/data'
data_path = f'{data_dir}/{dataset_name}'

for df, index in zip([df_train, df_valid], ['train', 'valid']):
    print(index)

    # Partition the current split by membership in common_spacegroups.
    df_common = df.query('spg_num in @common_spacegroups')
    df_uncommon = df.query('spg_num not in @common_spacegroups')

    # Each partition is written to its own sub-folder of data_path, with
    # spg_num as the target column.
    write_tokenized_dataframe(df_common, data_path=f"{data_path}/common", index=index, tgt_col = 'spg_num')
    write_tokenized_dataframe(df_uncommon, data_path=f"{data_path}/uncommon", index=index, tgt_col = 'spg_num')
    

train
valid


In [6]:
from smi2wyk.transformer_utils import write_preprocess_yaml, write_training_yaml

# Dataset folder: <two levels above the working directory>/data/<dataset_name>.
project_root = Path(os.getcwd()).parents[1]
data_dir = str(project_root)+'/data'
data_path = f'{data_dir}/{dataset_name}'


def _weighted_sampling_options():
    """Return freshly built keyword arguments for common/uncommon weighted sampling."""
    return {
        'weighted_sampling': True,
        'weight_folder_names': ['common', 'uncommon'],
        'weights': [1, 1],
    }


# Keyword arguments shared by both training-YAML calls.
training_options = {
    'dataset_name': dataset_name,
    'share_vocab': False,
    'n_gpu': 1,
}

write_preprocess_yaml(data_path, share_vocab=False, **_weighted_sampling_options())

# NOTE(review): the training YAML writer is called twice, first without and
# then with the weighted-sampling options. Whether the second call overwrites
# or complements the first depends on write_training_yaml — confirm.
write_training_yaml(data_path, **training_options)
write_training_yaml(data_path, **training_options, **_weighted_sampling_options())

In [1]:
# Submit the training job script to the Slurm scheduler via `sbatch`.
# NOTE(review): `subm_train_smi2spgnum` is not visibly created by the cells in
# this notebook; presumably it is produced by one of the smi2wyk helpers or
# already sits in the working directory — confirm before running.
!sbatch subm_train_smi2spgnum

Submitted batch job 61633933
