In [None]:
import pandas as pd
import numpy as np
import re
import math
import os 

from pubchempy import get_compounds, Compound
#from molvs import validate_smiles, standardize_smiles

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import DataStructs


from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.rdReducedGraphs import GetErGFingerprint

# from DeepPurpose import utils, dataset
# from DeepPurpose import DTI as models

import deepchem as dc

In [None]:
def _lookup_patient_features(smiles_dict, feature_dict):
    """Map each key of ``smiles_dict`` to the list of ``feature_dict`` entries for its SMILES."""
    return {
        patient: [feature_dict[smiles] for smiles in smiles_list]
        for patient, smiles_list in smiles_dict.items()
    }


def create_train_val_test_dataset(smiles_train_dict, smiles_dev_dict, smiles_test_dict, feature_dict):
    """Replace the SMILES strings of every split with their features.

    Parameters
    ----------
    smiles_train_dict, smiles_dev_dict, smiles_test_dict : dict
        One mapping per split; each value is an iterable of SMILES strings
        (keys are treated as patient identifiers by the calling cells).
    feature_dict : mapping
        Lookup table indexed by SMILES string.

    Returns
    -------
    tuple of (dict, dict, dict)
        Train, dev and test dictionaries with the same keys as the inputs and,
        as values, lists of ``feature_dict[smiles]`` in the original order.

    Raises
    ------
    KeyError
        If a SMILES string of any split is missing from ``feature_dict``.

    Notes
    -----
    Prints the number of entries of the three resulting dictionaries.
    """
    # The three splits are processed identically, so they share one helper.
    train_dict_ = _lookup_patient_features(smiles_train_dict, feature_dict)
    dev_dict_ = _lookup_patient_features(smiles_dev_dict, feature_dict)
    test_dict_ = _lookup_patient_features(smiles_test_dict, feature_dict)

    print(len(train_dict_), len(dev_dict_), len(test_dict_))

    return train_dict_, dev_dict_, test_dict_

In [None]:
# import deepchem as dc
# smiles = [train.drug_encoding[0], "CCC"]
# featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
# f = featurizer.featurize(smiles)
# # Using ConvMolFeaturizer to create featurized fragments derived from molecules of interest.
# # This is used only in the context of performing interpretation of models using atomic
# # contributions (atom-based model interpretation)

# len(f) # contains 2 lists with  featurized fragments from 2 mols

In [None]:
# NOTE(review): this name is rebound in the ECFP section further down with
# `CircularFingerprint(size=1024)` (no explicit radius) before the only visible
# `featurize` call, so the radius=4 instance created here is not the one used
# to compute the fingerprints below. Confirm which radius is intended.
circular_featurizer_1024 = dc.feat.CircularFingerprint(size=1024, radius=4)  ## -----

## Read Data

In [None]:
## Unique Drugs

# Directory that holds the SMILES pickles read below.
unique_path = "data/drug_unique/"

# Load the complete mapping plus its train / dev / test splits.
# NOTE: unpickling runs arbitrary code, so these files must come from a trusted source.
(
    patient_unique_canonical_smiles_dict,
    patient_unique_canonical_smiles_train_dict,
    patient_unique_canonical_smiles_dev_dict,
    patient_unique_canonical_smiles_test_dict,
) = [
    pd.read_pickle(os.path.join(unique_path, file_name))
    for file_name in ("smiles_all.p", "smiles_train.p", "smiles_dev.p", "smiles_test.p")
]

In [None]:
# Flatten the SMILES collections of all entries into a single de-duplicated set.
all_smiles = {
    smiles_string
    for patient_smiles in patient_unique_canonical_smiles_dict.values()
    for smiles_string in patient_smiles
}

In [None]:
# Sanity check: report how many distinct SMILES strings were collected.
print("Number of unique smiles ", len(all_smiles))

## ECFP

In [None]:
# Circular-fingerprint featurizer with 1024-bit output. No radius is passed, so the
# library default applies (presumably radius=2 — confirm for the installed DeepChem version).
circular_featurizer_1024 = dc.feat.CircularFingerprint(size=1024)

In [None]:
# Featurize every unique SMILES string. `all_smiles` is a set, so the rows of the result
# follow the set's iteration order; the later pairing step relies on iterating the same,
# unmodified set again to line SMILES up with their fingerprints.
ecfp_1024_features = circular_featurizer_1024.featurize(all_smiles)

In [None]:
# Pair each SMILES string with its fingerprint row (both iterate in the same order).
smiles_ecfp_1024_dict = dict(zip(all_smiles, ecfp_1024_features))

### Unique

In [None]:
## Train / Dev / Test

# Build the per-patient ECFP-1024 lists for all three splits with the shared
# `create_train_val_test_dataset` helper defined earlier in this notebook, instead of
# three hand-copied loops. The helper also prints the three dictionary sizes.
(
    patient_unique_ecfp_1024_train_dict,
    patient_unique_ecfp_1024_dev_dict,
    patient_unique_ecfp_1024_test_dict,
) = create_train_val_test_dataset(
    patient_unique_canonical_smiles_train_dict,
    patient_unique_canonical_smiles_dev_dict,
    patient_unique_canonical_smiles_test_dict,
    smiles_ecfp_1024_dict,
)

In [None]:
# Number of entries (one per patient key) in the train / dev / test fingerprint dictionaries.
len(patient_unique_ecfp_1024_train_dict), len(patient_unique_ecfp_1024_dev_dict), len(patient_unique_ecfp_1024_test_dict)

In [None]:
# Output directory for the ECFP-1024 split pickles.
unique_ecfp_1024_path = "data/drug_unique/ecfp-1024/"

# pd.to_pickle does not create missing parent directories; make sure the target
# exists so the cell also works on a fresh checkout.
os.makedirs(unique_ecfp_1024_path, exist_ok=True)

pd.to_pickle(patient_unique_ecfp_1024_train_dict, os.path.join(unique_ecfp_1024_path, "ecfp_1024_unique_train.p"))
pd.to_pickle(patient_unique_ecfp_1024_dev_dict, os.path.join(unique_ecfp_1024_path, "ecfp_1024_unique_dev.p"))
pd.to_pickle(patient_unique_ecfp_1024_test_dict, os.path.join(unique_ecfp_1024_path, "ecfp_1024_unique_test.p"))

In [None]:
# unique_ecfp_1024_path = "data/drug_unique/ecfp-1024/"

# t = pd.read_pickle(os.path.join(unique_ecfp_1024_path, "ecfp_1024_unique_train.p"))
# tt = pd.read_pickle(os.path.join(unique_ecfp_1024_path, "ecfp_1024_unique_dev.p"))
# ttt = pd.read_pickle(os.path.join(unique_ecfp_1024_path, "ecfp_1024_unique_test.p"))

## SMILES TRANSFORMER

In [None]:
# Precomputed lookup table indexed by SMILES string (passed as `feature_dict` below).
# NOTE(review): presumably SMILES-transformer embeddings — the pickle's contents are not
# visible in this notebook; confirm its provenance. Unpickling executes arbitrary code,
# so only load this file from a trusted source.
smiles_transformer_dict = pd.read_pickle("data/trfm.pickle")

### Unique

In [None]:
# Look up the `smiles_transformer_dict` features for each patient's SMILES in all three splits.
(
    train_smiles_transformer,
    dev_smiles_transformer,
    test_smiles_transformer,
) = create_train_val_test_dataset(
    smiles_train_dict=patient_unique_canonical_smiles_train_dict,
    smiles_dev_dict=patient_unique_canonical_smiles_dev_dict,
    smiles_test_dict=patient_unique_canonical_smiles_test_dict,
    feature_dict=smiles_transformer_dict,
)

In [None]:
# Output directory for the SMILES-transformer split pickles.
path = "data/drug_unique/smiles-transformer/"

# pd.to_pickle does not create missing parent directories; make sure the target
# exists so the cell also works on a fresh checkout.
os.makedirs(path, exist_ok=True)

pd.to_pickle(train_smiles_transformer, os.path.join(path, "smiles_transformer_unique_train.p"))
pd.to_pickle(dev_smiles_transformer, os.path.join(path, "smiles_transformer_unique_dev.p"))
pd.to_pickle(test_smiles_transformer, os.path.join(path, "smiles_transformer_unique_test.p"))