In [2]:
import numpy as np
import pandas as pd
import torch
import pickle as pkl
import jsonlines as jsonl
from transformers import AutoTokenizer, AutoModel, pipeline, EsmModel, EsmConfig, EsmTokenizer
import warnings
from tqdm import tqdm
import os

# os.environ['ALL_PROXY'] = 'socks5://127.0.0.1:10808'
# warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Pipeline switch read by the loading, save and load cells.
#   True  -> the loading cell runs its pruning loop (DDI/DTI/PPI are filtered
#            against each other until the drug and protein ID sets stop changing)
#            and the dataset is written to / read from '../dataset_filtered.pkl'.
#   False -> the pruning loop is skipped and '../dataset.pkl' is used instead.
# NOTE(review): the name reads inverted relative to this behavior (True *enables*
# the filtering) -- confirm the intended meaning before renaming.
IGNORE_ISOLATED_NODES = False

# Load Data

## Filter Until No Isolated Nodes

In [73]:
# Build the dataset dictionary.
#   'DDI' / 'DTI' / 'PPI'  : edge tables read from the JSON exports.
#   'Drugs' / 'Proteins'   : node tables derived below (indexed by Drug_ID / Protein_ID).
#   '*Features'            : placeholders, filled in by the feature-extraction cells.
data = {
    'DDI': pd.read_json("../json/drugbank_merged.json"),
    'DTI': pd.read_json("../json/dti_drugbank_final.json"),
    'PPI': pd.read_json("../json/ppi_primekg_final.json"),
    'Proteins': None,
    'ProteinFeatures': None,
    'Drugs': None,
    'DrugFeatures': None,
}

# Pruning loop: only entered when IGNORE_ISOLATED_NODES is True. Each pass restricts
# DTI to drugs present in DDI and proteins present in PPI, then restricts DDI / PPI to
# the drugs / proteins that still have a DTI edge. It repeats until neither ID set
# changes, because removing edges from one table can orphan nodes in another.
has_isolated_nodes = IGNORE_ISOLATED_NODES
while has_isolated_nodes:

    DrugIDs = set(data['DDI']['Drug1_ID']) | set(data['DDI']['Drug2_ID'])
    ProteinIDs = set(data['PPI']['Protein1_ID']) | set(data['PPI']['Protein2_ID'])
    data['DTI'] = data['DTI'][data['DTI']['Drug_ID'].isin(DrugIDs)
                            & data['DTI']['Protein_ID'].isin(ProteinIDs)]

    has_isolated_nodes = False
    DrugIDs_filtered = set(data['DTI']['Drug_ID'])
    if DrugIDs_filtered != DrugIDs:
        DrugIDs = DrugIDs_filtered
        has_isolated_nodes = True
        data['DDI'] = data['DDI'][data['DDI']['Drug1_ID'].isin(DrugIDs)
                                & data['DDI']['Drug2_ID'].isin(DrugIDs)]
    ProteinIDs_filtered = set(data['DTI']['Protein_ID'])
    if ProteinIDs_filtered != ProteinIDs:
        ProteinIDs = ProteinIDs_filtered
        has_isolated_nodes = True
        data['PPI'] = data['PPI'][data['PPI']['Protein1_ID'].isin(ProteinIDs)
                                & data['PPI']['Protein2_ID'].isin(ProteinIDs)]

# Drug node table: every distinct (Drug_ID, Drug) pair seen on either side of a DDI row.
# The rows are sorted so the table order is the same in every kernel session; the
# feature matrices are aligned to this table by row position, and iterating a Python
# set of strings would give a different order after each interpreter restart.
data['Drugs'] = (
    pd.concat(
        [
            data['DDI'][[f'Drug{side}_ID', f'Drug{side}']].rename(
                columns={f'Drug{side}_ID': 'Drug_ID', f'Drug{side}': 'Drug'}
            )
            for side in (1, 2)
        ],
        ignore_index=True,
    )
    .drop_duplicates()
    .sort_values(['Drug_ID', 'Drug'])
    .reset_index(drop=True)
)
# Attach the drug names recorded in DTI.
# NOTE(review): pd.merge defaults to an inner join, so drugs without any DTI row are
# dropped from the node table here even though DDI may still reference them when the
# pruning loop is disabled -- confirm this is intended.
data['Drugs'] = pd.merge(data['Drugs'], data['DTI'][['Drug_ID', 'Drug_name']].drop_duplicates(), on='Drug_ID').reset_index(drop=True)

# Protein node table: the first record of the JSON-lines file is read as a single
# {protein name: sequence} mapping. The reader is closed as soon as it has been read.
with jsonl.open("../json/names_to_protein_seq.json") as reader:
    name_to_seq = next(iter(reader))
data['Proteins'] = pd.DataFrame(list(name_to_seq.items()), columns=['Protein_name', 'Protein_seq'])
# Inner join again: only proteins that have both a sequence and a DTI row are kept.
data['Proteins'] = pd.merge(data['Proteins'], data['DTI'][['Protein_ID', 'Protein_name']].drop_duplicates().reset_index(drop=True), on='Protein_name')

# Renumber the edge tables after filtering (reassigned rather than mutated in place,
# since the filtered frames are slices of the originally loaded ones).
for table in ('DDI', 'DTI', 'PPI'):
    data[table] = data[table].reset_index(drop=True)

# The descriptive columns now live in the node tables; keep only IDs on the edges.
data['DDI'] = data['DDI'].drop(columns=['Drug1', 'Drug2'])
data['DTI'] = data['DTI'].drop(columns=['Drug_name', 'Protein_name'])
data['PPI'] = data['PPI'].drop(columns=['Protein1_name', 'Protein2_name'])

# Index node tables by ID so later cells can look a node up by its identifier.
data['Drugs'] = data['Drugs'].set_index('Drug_ID')
data['Proteins'] = data['Proteins'].set_index('Protein_ID')

In [65]:
# Keep only the PPI rows whose Protein1_ID is strictly smaller than Protein2_ID, so a
# pair listed in both directions is stored once. Rows where both IDs are equal are
# dropped as well, and the original row index is left untouched.
data['PPI'] = data['PPI'].loc[
    lambda ppi: ppi['Protein1_ID'].lt(ppi['Protein2_ID'])
]

# Get Features

In [51]:
# Pretrained encoders used by the feature-extraction cells. Each dictionary bundles a
# tokenizer with its model: ChemBert is applied to the drug strings, Esm1b to the
# protein sequences.
#
# Seed the torch RNG before loading. For ChemBERTa, transformers reports the pooler
# weights as "newly initialized" (they are not part of the MLM checkpoint), and the
# feature cells read `pooler_output`. Without a fixed seed those weights, and therefore
# the saved features, differ on every kernel restart.
# NOTE(review): the same may apply to the ESM-1b pooler -- confirm from its load log.
torch.manual_seed(0)  # arbitrary fixed value; only needs to stay constant between runs

ChemBert = {
    'tokenizer': AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MLM"),
    'model': AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MLM"),
}

Esm1b = {
    'tokenizer': EsmTokenizer.from_pretrained('facebook/esm-1b', do_lower_case=False),
    'model': AutoModel.from_pretrained("facebook/esm-1b")
}

Some weights of the model checkpoint at DeepChem/ChemBERTa-77M-MLM were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

In [4]:
# Embed every entry of data['Drugs']['Drug'] with ChemBERTa, one string per forward
# pass. Row i of the resulting array is the pooler output for row i of data['Drugs'].
# Inputs longer than 512 tokens are truncated by the tokenizer.
# This is pure inference, so it runs under torch.no_grad(): no autograd graph is
# recorded for each forward pass, which saves memory and time. Values are unchanged.
with torch.no_grad():
    data['DrugFeatures'] = np.array([
        ChemBert['model'](
            **ChemBert['tokenizer'](smiles, return_tensors='pt', max_length=512, padding=True, truncation=True)
        ).pooler_output[0].numpy()
        for smiles in tqdm(data['Drugs']['Drug'], total=len(data['Drugs']))
    ])

  0%|          | 0/1489 [00:00<?, ?it/s]

100%|██████████| 1489/1489 [03:20<00:00,  7.42it/s]


array([[-0.01656273, -0.13354486,  0.08170871, ...,  0.01564058,
        -0.15825166, -0.06826712],
       [ 0.01084546,  0.04423308,  0.09004656, ..., -0.04558724,
         0.05865683, -0.02220888],
       [ 0.03101545,  0.05250555,  0.24257721, ..., -0.17679754,
        -0.03301707,  0.14820105],
       ...,
       [-0.03918387,  0.08876041,  0.01355437, ..., -0.14501177,
         0.09501589, -0.00566745],
       [ 0.0034684 ,  0.21049589,  0.14795871, ...,  0.04313214,
        -0.15063174,  0.05977998],
       [-0.04802042,  0.22344407,  0.00190181, ..., -0.07860728,
        -0.06015364,  0.06929832]], dtype=float32)

In [5]:
# Embed every entry of data['Proteins']['Protein_seq'] with ESM-1b, one sequence per
# forward pass. Row i of the resulting array is the pooler output for row i of
# data['Proteins']. Sequences longer than 512 tokens are truncated by the tokenizer.
# This is pure inference, so it runs under torch.no_grad(): no autograd graph is
# recorded for each forward pass, which saves memory and time. Values are unchanged.
with torch.no_grad():
    data['ProteinFeatures'] = np.array([
        Esm1b['model'](
            **Esm1b['tokenizer'](sequence, return_tensors='pt', max_length=512, padding=True, truncation=True)
        ).pooler_output[0].numpy()
        for sequence in tqdm(data['Proteins']['Protein_seq'], total=len(data['Proteins']))
    ])

100%|██████████| 1310/1310 [1:17:50<00:00,  3.57s/it]


array([[-0.26701295,  0.10835341, -0.29234228, ..., -0.47279736,
        -0.01228008, -0.32236168],
       [-0.24903491,  0.1842002 , -0.20879096, ..., -0.45299304,
         0.09167708, -0.13434085],
       [-0.24209622,  0.12196819, -0.21449672, ..., -0.47724256,
         0.05699752, -0.31131476],
       ...,
       [-0.29868805,  0.23641792, -0.2870817 , ..., -0.43996018,
        -0.00714248, -0.2685453 ],
       [-0.22394948,  0.11866497, -0.17858407, ..., -0.481605  ,
         0.04997914, -0.24133028],
       [-0.19244425, -0.05693382, -0.33932334, ..., -0.4666999 ,
        -0.11490353, -0.40552577]], dtype=float32)

# Get Similarity

In [13]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

def calculate_similarity(smiles1, smiles2, radius=2, n_bits=2048):
    """Return the Tanimoto similarity of two molecules given as SMILES strings.

    Both molecules are converted to Morgan fingerprint bit vectors and compared
    with ``DataStructs.TanimotoSimilarity``.

    Parameters
    ----------
    smiles1, smiles2 : str
        SMILES strings of the two molecules.
    radius : int, optional
        Morgan fingerprint radius (default 2, the value previously hard-coded).
    n_bits : int, optional
        Length of the fingerprint bit vector (default 2048, as before).

    Returns
    -------
    float
        The Tanimoto similarity, or ``np.nan`` when either SMILES cannot be
        parsed or fingerprinting fails. In that case the offending pair is
        printed so it can be inspected.
    """
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)

    # RDKit signals an unparsable SMILES by returning None instead of raising;
    # handle that explicitly rather than relying on the fingerprint call to fail.
    if mol1 is None or mol2 is None:
        print(smiles1, smiles2)
        return np.nan

    try:
        fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius, nBits=n_bits)
        fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius, nBits=n_bits)
    except Exception:
        # Still best-effort for genuine fingerprinting errors, but no longer a bare
        # `except:` -- KeyboardInterrupt / SystemExit must be able to stop the run.
        print(smiles1, smiles2)
        return np.nan
    return DataStructs.TanimotoSimilarity(fp1, fp2)

# Score every DDI row: look up the 'Drug' string of both drugs in the Drug_ID-indexed
# node table and store their similarity in a new 'Similarity' column.
data['DDI']['Similarity'] = data['DDI'].apply(
    lambda ddi_row: calculate_similarity(
        data['Drugs'].at[ddi_row['Drug1_ID'], 'Drug'],
        data['Drugs'].at[ddi_row['Drug2_ID'], 'Drug'],
    ),
    axis=1,
)

# Discard rows containing NaN. This removes the pairs whose similarity could not be
# computed, and also any row that has a missing value in another column.
data['DDI'].dropna(inplace=True)


# Save

In [66]:
# Persist the whole `data` dictionary. The target file depends on the pipeline switch:
# '../dataset_filtered.pkl' when IGNORE_ISOLATED_NODES is True, '../dataset.pkl' otherwise.
# The file is opened in a `with` block so it is flushed and closed even if pickling fails.
with open('../dataset_filtered.pkl' if IGNORE_ISOLATED_NODES else '../dataset.pkl', 'wb') as dataset_file:
    pkl.dump(data, dataset_file)

# Load

In [67]:
# Reload the `data` dictionary from the file selected by IGNORE_ISOLATED_NODES
# ('../dataset_filtered.pkl' when True, '../dataset.pkl' otherwise).
# SECURITY: unpickling can execute arbitrary code -- only load files written by this
# notebook, never a pickle obtained from an untrusted source.
with open('../dataset_filtered.pkl' if IGNORE_ISOLATED_NODES else '../dataset.pkl', 'rb') as dataset_file:
    data = pkl.load(dataset_file)

In [53]:
# Sanity check: is every PPI edge also listed in the opposite direction?
# Reports the first (Protein1_ID, Protein2_ID) pair whose reverse is absent, or confirms
# that the edge list is symmetric.
# NOTE(review): if data['PPI'] has already been reduced to rows with
# Protein1_ID < Protein2_ID, no reverse pair can exist and the first pair is reported.
ppi_df = data['PPI'].set_index(['Protein1_ID', 'Protein2_ID'])
first_unmatched = next(
    ((p1, p2) for p1, p2 in ppi_df.index if (p2, p1) not in ppi_df.index),
    None,
)
if first_unmatched is None:
    print('PPI data is undirected')
else:
    print(f'{first_unmatched[0]}, {first_unmatched[1]} not in ppi_df')

PPI data is undirected
