In [780]:
import numpy as np
import pandas as pd
import torch
import json
import pickle as pkl
import jsonlines as jsonl
from transformers import AutoTokenizer, AutoModel, pipeline, EsmModel, EsmConfig, EsmTokenizer
import warnings
from tqdm import tqdm
import itertools
import os        
import urllib.request
import gzip
import shutil
import os.path
import io                     
from collections import defaultdict     
from itertools import starmap


 65%|██████▍   | 11213/17261 [12:12<07:47, 12.94it/s]

# Fetch Data

In [734]:
data = {}

## Load HGNC

In [342]:
HGNC = pd.read_csv('https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt', sep='\t', low_memory=False)

def get_map(from_: str, to: str) -> pd.Series:
    return HGNC[[from_, to]].dropna().set_index(from_)[to]

In [525]:
geneName_2_ccdID = get_map('entrez_id', 'ccds_id')
geneName_2_ccdID.index = geneName_2_ccdID.index.astype(int)
geneName_2_ccdID = geneName_2_ccdID.map(lambda x: x.split('|')[-1])

## Load CCD for Protein Sequence

In [None]:
CCDS_protein_url = 'https://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS_protein.current.faa.gz'
                 # 'https://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS_protein_exons.current.faa.gz'

def fetch_faa(url):
    return gzip.GzipFile(fileobj=io.BytesIO(urllib.request.urlopen(url).read()), mode='rb').read().decode('utf-8')

CCDS_protein_raw = fetch_faa(CCDS_protein_url)

In [528]:
ccd_data = {}
for line in CCDS_protein_raw.splitlines():
    if line.startswith('>'):
        ccd_data[ccd_id:=line[1:].strip().split('|')[0].split('.')[0]] = ''
    else:
        ccd_data[ccd_id] += line
ccdID_2_ccdSeq = pd.Series(ccd_data)

In [806]:
data['Proteins'] = geneID_2_ccdSeq = geneName_2_ccdID.map(ccdID_2_ccdSeq).rename_axis('Protein_ID').rename('Protein').dropna()
proteins = set(data['Proteins'].index)

## Load PrimeKG for PPI and DPI

In [343]:
from tdc.resource import PrimeKG
kg = PrimeKG()

Found local copy...
Loading...


In [808]:
# PPI_df = kg.df.query('relation == "protein_protein"')[['x_id', 'y_id']].astype(int).set_index(['x_id', 'y_id']).rename_axis(['Protein1_ID', 'Protein2_ID'])
data['PPI'] = kg.df.query('relation == "protein_protein"')[['x_id', 'y_id']].astype(int).rename({'x_id': 'Protein1_ID', 'y_id': 'Protein2_ID'}, axis=1)
data['DPI'] = kg.df.query('x_type == "drug" and y_type == "gene/protein"')[['x_id', 'y_id']].astype(({'x_id': str, 'y_id': int})).rename({'x_id': 'Drug_ID', 'y_id': 'Protein_ID'}, axis=1)

# filter out those iteractions with protein that are not provided with CCD sequence
data['PPI'] = data['PPI'][data['PPI'].isin(proteins).all(axis=1)].set_index(['Protein1_ID', 'Protein2_ID'])
data['DPI'] = data['DPI'][data['DPI']['Protein_ID'].isin(proteins)].set_index(['Drug_ID', 'Protein_ID'])

# drop the proteins that does not occur in any iteractions
data['Proteins'] = data['Proteins'][sorted(set(data['DPI'].reset_index()['Protein_ID']) | set(data['PPI'].reset_index()['Protein1_ID']))]

In [811]:
kg.df.query('x_type == "drug" and y_type == "gene/protein"').value_counts('display_relation')

display_relation
target         16380
enzyme          5317
transporter     3092
carrier          864
dtype: int64

## Load Therapeutics Data Commons for DDI

In [812]:
from tdc.multi_pred import DDI
from tdc.utils import get_label_map
ddi_df = DDI(name = 'DrugBank').get_data()

Found local copy...
Loading...
Done!


In [813]:
data['Drugs'] = pd.DataFrame({'Drug': ddi_df[['Drug1_ID', 'Drug1']].drop_duplicates().set_index('Drug1_ID').to_dict()['Drug1'] | ddi_df[['Drug2_ID', 'Drug2']].drop_duplicates().set_index('Drug2_ID').to_dict()['Drug2']}).rename_axis('Drug_ID')['Drug']
data['DDI'] = ddi_df.set_index(['Drug1_ID', 'Drug2_ID'])[['Y']]
data['DDI Labels'] = pd.Series(get_label_map(name = 'DrugBank', task = 'DDI')).rename_axis('Y').rename('Label')

## Save CSV

In [814]:
data['DDI'].to_csv("../data/csv/DDI.csv")
data['PPI'].to_csv("../data/csv/PPI.csv")
data['DPI'].to_csv("../data/csv/DPI.csv")
data["Drugs"].to_csv("../data/csv/Drugs.csv")
data["Proteins"].to_csv("../data/csv/Proteins.csv")
data['DDI Labels'].to_csv('../data/csv/DDI Labels.csv')

# Generate Features

In [815]:
data = {
    'DDI': pd.read_csv("../data/csv/DDI.csv", index_col=(0,1)),
    'DPI': pd.read_csv("../data/csv/DPI.csv", index_col=(0,1)),
    'PPI': pd.read_csv("../data/csv/PPI.csv", index_col=(0,1)),
    'Proteins': pd.read_csv("../data/csv/Proteins.csv", index_col=0)['Protein'],
    'Drugs': pd.read_csv("../data/csv/Drugs.csv", index_col=0)['Drug'],
    'DDI Labels': pd.read_csv('../data/csv/DDI Labels.csv', index_col=0)['Label']
}

In [816]:
ChemBert = {
    'tokenizer': AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MLM"),
    'model': AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MLM")
} 

Esm1b = {
    'tokenizer': EsmTokenizer.from_pretrained('facebook/esm-1b', do_lower_case=False),
    'model': AutoModel.from_pretrained("facebook/esm-1b")
}

Some weights of the model checkpoint at DeepChem/ChemBERTa-77M-MLM were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

In [819]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ChemBert['model'] = ChemBert['model'].to(device)
Esm1b['model'] = Esm1b['model'].to(device)

In [820]:
def feature_generator(tokenizer, model):
    def get_features(text):
        return model(**dict(starmap(lambda k, v: (k, v.to(device)), tokenizer(text, return_tensors='pt', max_length=512, padding=True, truncation=True).items()))).pooler_output[0].detach().cpu().numpy()
    return get_features

In [821]:
data['DrugFeatures'] = np.array(list(map(feature_generator(**ChemBert), tqdm(data['Drugs']))))

100%|██████████| 1706/1706 [00:08<00:00, 206.47it/s]


In [822]:
data['ProteinFeatures'] = np.array(list(map(feature_generator(**ChemBert), tqdm(data['Proteins']))))

100%|██████████| 17260/17260 [01:30<00:00, 190.15it/s]


## Save NPY

In [830]:
np.save("../data/npy/DrugFeatures.npy", data['DrugFeatures'])
np.save("../data/npy/ProteinFeatures.npy", data['ProteinFeatures'])

# Get Similarity

In [177]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

def calculate_similarity(smiles1, smiles2):
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)

    try:
        fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2, nBits=2048)
        fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2, nBits=2048)
    except:
        print(smiles1, smiles2)
        return np.nan
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity


In [18]:
data['DDI'].set_index(['Drug1_ID', 'Drug2_ID']).to_pickle("../data/pkl/DDI.pkl")

In [None]:
def get_similarity(row):
    try:
        return calculate_similarity(
            data['Drugs']['Drug'][row['Drug1_ID']], 
            data['Drugs']['Drug'][row['Drug2_ID']]
        )
    except:
        print(row['Drug1_ID'], row['Drug2_ID'])
data['DDI']['Similarity'] = data['DDI'].apply(get_similarity, axis=1)

# Save

In [833]:
pkl.dump(data, open('../dataset.pkl', 'wb'))

# Load

In [841]:
data = {
    'DDI': pd.read_csv("../data/csv/DDI.csv", index_col=(0,1)),
    'DPI': pd.read_csv("../data/csv/DPI.csv", index_col=(0,1)),
    'PPI': pd.read_csv("../data/csv/PPI.csv", index_col=(0,1)),
    'Proteins': pd.read_csv("../data/csv/Proteins.csv", index_col=0)['Protein'],
    'Drugs': pd.read_csv("../data/csv/Drugs.csv", index_col=0)['Drug'],
    'DDI Labels': pd.read_csv('../data/csv/DDI Labels.csv', index_col=0)['Label'],
    'DrugFeatures': np.load("../data/npy/DrugFeatures.npy"),
    'ProteinFeatures': np.load("../data/npy/ProteinFeatures.npy"),
}

In [842]:
data = pkl.load(open('../dataset.pkl', 'rb'))