In [1]:
import torch
from dataset.get_datasets import get_dataset
import warnings

from dataset.scaffold import ogbg_with_smiles
import os 
import gzip
import pandas as pd
import matplotlib.pyplot as plt
import sys
sys.path.append('./plotter/')
from plotter.plot import convert_idx_list, Orig_Plotter
from dataset.scaffold import get_scaffold_split_info, _generate_scaffold, generate_scaffolds_dict

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from the libraries imported above.
warnings.filterwarnings('ignore')



In [2]:
# Use the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


class Args:
    """Lightweight settings container used by the notebook.

    Parameters
    ----------
    dataset : str
        Dataset identifier. Defaults to ``'ogbg-molbace'``.
    device : optional
        Device to record on the instance. When omitted, the notebook-level
        ``device`` chosen above is used.
    batch_size : int
        Batch size recorded on the instance. Defaults to ``128``.
    """

    def __init__(self, dataset='ogbg-molbace', device=None, batch_size=128):
        self.dataset = dataset
        # The parameter shadows the notebook-level ``device``, so the global is
        # looked up explicitly (at call time) when no override is supplied.
        self.device = globals()['device'] if device is None else device
        self.batch_size = batch_size


# Calling with no arguments reproduces the previously hard-coded settings.
args = Args()


# Load the labelled dataset and materialise its items as a plain Python list.
labeled_dataset = get_dataset(args, './raw_data')
labeled_dataset_list = list(labeled_dataset)

# The SMILES table is read from a folder named after the dataset with every
# '-' replaced by '_'; only the 'smiles' column is loaded.
smile_path = os.path.join('./raw_data', args.dataset.replace('-', '_'), 'mapping/mol.csv.gz')
smiles_df = pd.read_csv(smile_path, usecols=['smiles'], compression='gzip')
smiles = smiles_df['smiles'].tolist()

# Wrap the graphs together with their SMILES strings, then fetch the scaffold split.
new_labeled_dataset = ogbg_with_smiles(
    name=args.dataset,
    root='./raw_data',
    data_list=labeled_dataset_list,
    smile_list=smiles,
)
label_split_idx_scaffold = new_labeled_dataset.get_idx_split(split_type='scaffold')

cuda


In [3]:
# Scaffolds of the whole dataset, each paired with an index collection;
# ordered by the scaffold frequency.
_, all_scaffolds_set = generate_scaffolds_dict(smiles)
print(f'There are {len(all_scaffolds_set)} scaffolds in total and {len(new_labeled_dataset)} molecules in the dataset.')
# Keep only the scaffold SMILES (first element of every entry).
all_scaffolds_smiles = [entry[0] for entry in all_scaffolds_set]


def _split_scaffold_summary(split_name):
    """Return (smiles, scaffold entries, scaffold -> member count) for one split.

    ``split_name`` is a key of ``label_split_idx_scaffold``; its indices select
    the SMILES strings that are passed to ``generate_scaffolds_dict``.
    """
    split_smiles = [smiles[i] for i in label_split_idx_scaffold[split_name]]
    _, split_scaffolds = generate_scaffolds_dict(split_smiles)
    member_counts = {entry[0]: len(entry[1]) for entry in split_scaffolds}
    return split_smiles, split_scaffolds, member_counts


train_smiles, train_scaffold, train_scaffold_num = _split_scaffold_summary('train')
valid_smiles, valid_scaffold, valid_scaffold_num = _split_scaffold_summary('valid')
test_smiles, test_scaffold, test_scaffold_num = _split_scaffold_summary('test')

There are 675 scaffolds in total and 1513 molecules in the dataset.


In [4]:
# Give each split an explicit zero for every scaffold it does not already
# list, so every entry of all_scaffolds_smiles is a key in all three dicts.
for scaffold_smile in all_scaffolds_smiles:
    train_scaffold_num.setdefault(scaffold_smile, 0)
    valid_scaffold_num.setdefault(scaffold_smile, 0)
    test_scaffold_num.setdefault(scaffold_smile, 0)

# Rebuild each dictionary with its keys in ascending sorted order.
train_scaffold_num = {key: train_scaffold_num[key] for key in sorted(train_scaffold_num)}
valid_scaffold_num = {key: valid_scaffold_num[key] for key in sorted(valid_scaffold_num)}
test_scaffold_num = {key: test_scaffold_num[key] for key in sorted(test_scaffold_num)}

In [5]:
# Label each scaffold with the first split (train, then valid, otherwise test)
# in which its count is positive, and remember its count in that split.
scaffold_split_dict, scaffold_num_dict = {}, {}
for scaffold_smile in all_scaffolds_smiles:
    if train_scaffold_num[scaffold_smile] > 0:
        split_label, split_counts = 'train', train_scaffold_num
    elif valid_scaffold_num[scaffold_smile] > 0:
        split_label, split_counts = 'valid', valid_scaffold_num
    else:
        # Fallback: anything with no train or valid members is labelled 'test'.
        split_label, split_counts = 'test', test_scaffold_num
    scaffold_split_dict[scaffold_smile] = split_label
    scaffold_num_dict[scaffold_smile] = split_counts[scaffold_smile]

In [6]:
# One row per scaffold: the dict keys become a column via reset_index().
scaffold_split_df = (
    pd.DataFrame.from_dict(scaffold_split_dict, orient='index')
    .reset_index()
)
scaffold_split_df.columns = ['scaffold', 'split']

# Attach the per-scaffold count and the index collection from all_scaffolds_set.
scaffold_idx_lookup = dict(all_scaffolds_set)
scaffold_split_df = scaffold_split_df.assign(
    num=scaffold_split_df['scaffold'].map(scaffold_num_dict),
    idx=scaffold_split_df['scaffold'].map(scaffold_idx_lookup),
)
scaffold_split_df.head()

Unnamed: 0,scaffold,split,num,idx
0,O=C1NC=NC1(c1ccccc1)c1ccccc1,train,55,"[41, 374, 375, 376, 377, 378, 379, 402, 410, 4..."
1,O=S1(=O)CC(Cc2ccccc2)CC([NH2+]Cc2ccccc2)C1,train,53,"[3, 75, 144, 262, 263, 264, 265, 283, 296, 319..."
2,c1ccc(CCCC[NH2+]C2(c3ccccc3)CCCCC2)cc1,train,41,"[72, 452, 510, 511, 584, 605, 642, 646, 653, 7..."
3,c1ccc(CCCC[NH2+]C2CC3(CCC3)Oc3ncccc32)cc1,train,30,"[20, 250, 251, 258, 259, 266, 267, 268, 278, 2..."
4,O=C(Nc1cccc(C2CCOC=N2)c1)c1ccccn1,train,26,"[37, 39, 48, 53, 60, 68, 76, 77, 79, 90, 111, ..."


In [7]:
# Load the stored test losses and flatten them into a plain Python list.
# map_location='cpu' lets a tensor that was saved from a GPU be read on a
# CPU-only machine; the values are turned into Python floats immediately,
# so nothing downstream needs them on an accelerator.
test_losses = torch.load(f'./results/{args.dataset}/test_losses.pt', map_location='cpu')
test_losses = test_losses.view(-1).tolist()
# Report whether there is exactly one loss per entry of the test split.
print(f'We get {len(test_losses)} test losses in total for {args.dataset} dataset. Matched with # test data: {len(label_split_idx_scaffold["test"])==len(test_losses)}')

test_idx = label_split_idx_scaffold['test']

We get 152 test losses in total for ogbg-molbace dataset. Matched with # test data: True


In [14]:
# Scaffold SMILES for all test data.
test_split_view = new_labeled_dataset[test_idx]
test_scaff_smiles = test_split_view.scaff_smiles

# Build the plotter over every scaffold SMILES, with the split label as target.
# NOTE(review): target_type='C' presumably marks the target as categorical — confirm against Orig_Plotter.
plot = Orig_Plotter.from_smiles(
    scaffold_split_df["scaffold"],
    target=scaffold_split_df["split"],
    target_type='C',
    sim_type='structural',
)

In [15]:
# Run t-SNE through the plotter with random_state pinned to 0; the result is
# kept so it can be inspected in the following cell.
tsne = plot.tsne(random_state=0)


In [27]:
# Preview the first rows of the t-SNE result (displayed as the cell output).
tsne.head()

Unnamed: 0_level_0,t-SNE-1,t-SNE-2,target
scaffold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
O=C1NC=NC1(c1ccccc1)c1ccccc1,24.082638,-15.460136,train
O=S1(=O)CC(Cc2ccccc2)CC([NH2+]Cc2ccccc2)C1,-16.625343,3.913365,train
c1ccc(CCCC[NH2+]C2(c3ccccc3)CCCCC2)cc1,-38.907372,-18.08107,train
c1ccc(CCCC[NH2+]C2CC3(CCC3)Oc3ncccc32)cc1,-40.491924,3.022293,train
O=C(Nc1cccc(C2CCOC=N2)c1)c1ccccn1,54.01519,-0.51827,train
